Class: Raif::Utils::ReadableContentExtractor

Inherits:
Object
  • Object
show all
Defined in:
lib/raif/utils/readable_content_extractor.rb

Constant Summary collapse

# Names of tags that are not considered likely to contain readable content.
# readable_content_scrubber removes any node whose name appears here.
# Frozen so the shared list cannot be mutated by accident at runtime.
TAG_REMOVE_LIST = %w[
  a
  button
  form
  iframe
  img
  input
  label
  nav
  noscript
  script
  style
  svg
  footer
].freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(raw_html) ⇒ ReadableContentExtractor

Returns a new instance of ReadableContentExtractor.



6
7
8
# File 'lib/raif/utils/readable_content_extractor.rb', line 6

# Stores the HTML that extract_readable_content will later parse.
#
# @param raw_html the HTML source to extract from; it is handed to
#   Loofah.html5_document as-is (presumably a String - confirm with callers)
def initialize(raw_html)
  @raw_html = raw_html
end

Instance Attribute Details

#raw_html ⇒ Object (readonly)

Returns the value of attribute raw_html.



4
5
6
# File 'lib/raif/utils/readable_content_extractor.rb', line 4

# Reader for the value stored in @raw_html by #initialize.
#
# @return the HTML passed to the constructor
def raw_html
  @raw_html
end

Instance Method Details

#empty_node_scrubber ⇒ Object



54
55
56
57
58
59
# File 'lib/raif/utils/readable_content_extractor.rb', line 54

# Memoized scrubber that prunes nodes which have no children and whose text
# is empty or whitespace-only.
#
# @return [Loofah::Scrubber] the same scrubber instance on every call
def empty_node_scrubber
  return @empty_node_scrubber if @empty_node_scrubber

  # Traverse bottom-up: once the empty leaves are gone, a parent that held
  # nothing but empty children is itself childless and gets removed too.
  @empty_node_scrubber = Loofah::Scrubber.new(direction: :bottom_up) do |candidate|
    childless = candidate.children.empty?
    candidate.remove if childless && candidate.text.strip.empty?
  end
end

#extract_readable_content ⇒ Object

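Extracts the readable content from the body of raw_html. This first removes all tags in TAG_REMOVE_LIST together with their children; tags in TAG_REMOVE_LIST are ones we do not consider likely to contain readable content. The same pass strips all attributes from the remaining tags and removes HTML comments. Empty nodes are then removed, and finally scrub!(:strip) is called to remove anything unsafe while leaving the text content. If the parsed document has no body, raw_html is returned unchanged.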



29
30
31
32
33
34
35
36
37
38
39
# File 'lib/raif/utils/readable_content_extractor.rb', line 29

# Parses raw_html as an HTML5 document and returns the inner HTML of its
# <body> after three scrubbing passes, applied in this order:
#
# 1. readable_content_scrubber
# 2. empty_node_scrubber
# 3. Loofah's built-in :strip scrubber
#
# @return the scrubbed body's inner HTML, or raw_html untouched when the
#   parsed document has no body element
def extract_readable_content
  body = Loofah.html5_document(raw_html).at("body")
  return raw_html if body.nil?

  passes = [readable_content_scrubber, empty_node_scrubber, :strip]
  # Each scrub! result feeds the next pass, matching a chained call.
  cleaned = passes.reduce(body) { |node, scrubber| node.scrub!(scrubber) }
  cleaned.inner_html
end

#readable_content_scrubber ⇒ Object



41
42
43
44
45
46
47
48
49
50
51
52
# File 'lib/raif/utils/readable_content_extractor.rb', line 41

# Memoized scrubber (Loofah's default top-down direction) that keeps only
# markup likely to hold readable content.
#
# For each visited node:
# * if it is an HTML comment, or its name is in TAG_REMOVE_LIST, the node is
#   removed together with its children and Loofah::Scrubber::STOP is returned
#   so the detached subtree is not traversed any further;
# * otherwise every attribute is stripped from the node and traversal
#   continues into its children.
#
# @return [Loofah::Scrubber] the same scrubber instance on every call
def readable_content_scrubber
  @readable_content_scrubber ||= Loofah::Scrubber.new do |node|
    if node.comment? || TAG_REMOVE_LIST.include?(node.name)
      node.remove
      # Nothing below a removed node can reach the output; skip it.
      next Loofah::Scrubber::STOP
    end

    # strip all attributes from the tags
    node.attributes.each_key { |attr_name| node.remove_attribute(attr_name) }

    Loofah::Scrubber::CONTINUE
  end
end