Class: Raif::Utils::ReadableContentExtractor
- Inherits:
-
Object
- Object
- Raif::Utils::ReadableContentExtractor
- Defined in:
- lib/raif/utils/readable_content_extractor.rb
Constant Summary collapse
- TAG_REMOVE_LIST =
[ "a", "button", "form", "iframe", "img", "input", "label", "nav", "noscript", "script", "style", "svg", "footer" ]
Instance Attribute Summary collapse
-
#raw_html ⇒ Object
readonly
Returns the value of attribute raw_html.
Instance Method Summary collapse
- #empty_node_scrubber ⇒ Object
-
#extract_readable_content ⇒ Object
This will first remove all tags in TAG_REMOVE_LIST and their children.
-
#initialize(raw_html) ⇒ ReadableContentExtractor
constructor
A new instance of ReadableContentExtractor.
- #readable_content_scrubber ⇒ Object
Constructor Details
#initialize(raw_html) ⇒ ReadableContentExtractor
Returns a new instance of ReadableContentExtractor.
6 7 8 |
# File 'lib/raif/utils/readable_content_extractor.rb', line 6 def initialize(raw_html) @raw_html = raw_html end |
Instance Attribute Details
#raw_html ⇒ Object (readonly)
Returns the value of attribute raw_html.
4 5 6 |
# File 'lib/raif/utils/readable_content_extractor.rb', line 4 def raw_html @raw_html end |
Instance Method Details
#empty_node_scrubber ⇒ Object
54 55 56 57 58 59 |
# File 'lib/raif/utils/readable_content_extractor.rb', line 54
#
# Memoized Loofah scrubber that deletes nodes which have no children and no
# non-whitespace text.
#
# The traversal direction is :bottom_up so that children are visited before
# their parent: once every empty child has been removed, a parent that held
# nothing else is empty too and gets removed in turn.
#
# @return [Loofah::Scrubber] the same scrubber instance on every call
def empty_node_scrubber
  return @empty_node_scrubber if @empty_node_scrubber

  @empty_node_scrubber = Loofah::Scrubber.new(direction: :bottom_up) do |candidate|
    # anything that still has children is not empty; leave it alone
    next unless candidate.children.empty?

    candidate.remove if candidate.text.strip.empty?
  end
end
#extract_readable_content ⇒ Object
This first removes every tag in TAG_REMOVE_LIST together with its children; these are tags we do not consider likely to contain readable content. It then removes any nodes left empty, and finally calls scrub!(:strip) to remove anything unsafe while leaving the text content. If the parsed document has no body element, the raw HTML is returned unchanged.
29 30 31 32 33 34 35 36 37 38 39 |
# File 'lib/raif/utils/readable_content_extractor.rb', line 29
#
# Parses raw_html as an HTML5 document and reduces its <body> to readable
# content by applying three scrubbing passes, in this order:
#
# 1. readable_content_scrubber
# 2. empty_node_scrubber
# 3. Loofah's built-in :strip scrubber
#
# @return [Object] the inner HTML of the scrubbed body, or raw_html itself
#   when the parsed document has no body element
def extract_readable_content
  document = Loofah.html5_document(raw_html)
  body = document.at("body")
  return raw_html if body.nil?

  # scrub! mutates the node in place, so the passes can be applied one after
  # another to the same body node; the order matters.
  passes = [readable_content_scrubber, empty_node_scrubber, :strip]
  passes.each { |pass| body.scrub!(pass) }

  body.inner_html
end
#readable_content_scrubber ⇒ Object
41 42 43 44 45 46 47 48 49 50 51 52 |
# File 'lib/raif/utils/readable_content_extractor.rb', line 41
#
# Memoized Loofah scrubber (default top-down traversal) that:
#
# * removes HTML comments,
# * removes any node whose name is in TAG_REMOVE_LIST, together with its
#   children, and
# * strips every attribute from the nodes that are kept.
#
# @return [Loofah::Scrubber] the same scrubber instance on every call
def readable_content_scrubber
  @readable_content_scrubber ||= Loofah::Scrubber.new do |node|
    # Comments and tags we don't consider readable content are removed
    # entirely. Returning STOP tells Loofah not to descend into the subtree
    # that was just detached, and skips the pointless attribute stripping on
    # a node that is already gone.
    if node.comment? || TAG_REMOVE_LIST.include?(node.name)
      node.remove
      next Loofah::Scrubber::STOP
    end

    # strip all attributes from the tags we keep
    node.attributes.each_key { |attr_name| node.remove_attribute(attr_name) }

    Loofah::Scrubber::CONTINUE
  end
end