Class: Raif::Utils::ReadableContentExtractor

Inherits:

Object

Object
Raif::Utils::ReadableContentExtractor

show all

Defined in: lib/raif/utils/readable_content_extractor.rb

Constant Summary collapse

# Names of tags that are removed together with their children because they are
# not considered likely to contain readable content.
# Frozen so the shared list cannot be mutated at runtime.
TAG_REMOVE_LIST = %w[
  a
  button
  form
  iframe
  img
  input
  label
  nav
  noscript
  script
  style
  svg
  footer
].freeze

Instance Attribute Summary collapse

#raw_html ⇒ Object readonly

Returns the value of attribute raw_html.

Instance Method Summary collapse

#empty_node_scrubber ⇒ Object
#extract_readable_content ⇒ Object

This will first remove all tags in TAG_REMOVE_LIST and their children.
#initialize(raw_html) ⇒ ReadableContentExtractor constructor

A new instance of ReadableContentExtractor.
#readable_content_scrubber ⇒ Object

Constructor Details

#initialize(raw_html) ⇒ `ReadableContentExtractor`

Returns a new instance of ReadableContentExtractor.



6
7
8

# File 'lib/raif/utils/readable_content_extractor.rb', line 6

# Stores the given value in @raw_html for later extraction.
#
# @param raw_html [Object] the markup to process; presumably an HTML String — confirm against callers
def initialize(raw_html)
  @raw_html = raw_html
end

Instance Attribute Details

#raw_html ⇒ `Object` (readonly)

Returns the value of attribute raw_html.



4
5
6

# File 'lib/raif/utils/readable_content_extractor.rb', line 4

# Read-only accessor for the @raw_html instance variable.
#
# @return [Object] the current value of @raw_html
def raw_html
  @raw_html
end

Instance Method Details

#empty_node_scrubber ⇒ `Object`

# File 'lib/raif/utils/readable_content_extractor.rb', line 54

# Lazily builds and memoizes the scrubber that prunes nodes which have no
# children and whose text is blank once stripped.
#
# @return [Loofah::Scrubber] the memoized bottom-up scrubber
def empty_node_scrubber
  return @empty_node_scrubber if @empty_node_scrubber

  # Bottom-up order lets a parent whose children were all empty be removed as well.
  @empty_node_scrubber = Loofah::Scrubber.new(direction: :bottom_up) do |node|
    blank = node.children.empty? && node.text.strip.empty?
    node.remove if blank
  end
end

#extract_readable_content ⇒ `Object`

This first removes every tag listed in TAG_REMOVE_LIST together with its children; these are tags we do not consider likely to contain readable content. It then removes nodes that are left empty, and finally calls scrub!(:strip) to remove anything unsafe while leaving the text content in place.

# File 'lib/raif/utils/readable_content_extractor.rb', line 29

# Extracts the readable markup from the <body> of #raw_html.
#
# The body is run through three scrubbing passes, in order:
# 1. readable_content_scrubber
# 2. empty_node_scrubber
# 3. Loofah's built-in :strip scrubber
#
# @return [Object] the inner HTML of the scrubbed body, or #raw_html unchanged
#   when the parsed document has no body element
def extract_readable_content
  body = Loofah.html5_document(raw_html).at("body")
  return raw_html if body.nil?

  passes = [readable_content_scrubber, empty_node_scrubber, :strip]

  # Feed the result of each scrub! into the next pass, exactly like a method chain.
  passes.reduce(body) { |node, scrubber| node.scrub!(scrubber) }.inner_html
end

#readable_content_scrubber ⇒ `Object`

# File 'lib/raif/utils/readable_content_extractor.rb', line 41

# Builds and memoizes the scrubber that discards non-readable markup.
#
# For every node it visits, the scrubber either:
# * removes the node when it is an HTML comment or its tag name is in
#   TAG_REMOVE_LIST, and stops descending into that subtree, or
# * keeps the node and strips all of its attributes.
#
# @return [Loofah::Scrubber] the memoized scrubber
def readable_content_scrubber
  @readable_content_scrubber ||= Loofah::Scrubber.new do |node|
    # A removed node takes its children with it, so there is no point stripping
    # its attributes or walking the detached subtree: tell Loofah to stop here.
    if node.comment? || TAG_REMOVE_LIST.include?(node.name)
      node.remove
      next Loofah::Scrubber::STOP
    end

    # strip all attributes from the tags that are kept
    node.attributes.each_key { |attr_name| node.remove_attribute(attr_name) }

    Loofah::Scrubber::CONTINUE
  end
end