A simple HTML tokenizer class that breaks HTML source down into tokens.
Create a new Arrow::HTMLTokenizer object.
# File /Users/ged/source/ruby/Arrow/lib/arrow/htmltokenizer.rb, line 45
45: def initialize( source )
46: @source = source
47: @scanner = StringScanner.new( source )
48: end
Enumerable interface: Iterates over parsed tokens, calling the supplied block with each one.
# File /Users/ged/source/ruby/Arrow/lib/arrow/htmltokenizer.rb, line 64
64: def each
65: @scanner.reset
66:
67: until @scanner.empty?
68: if @scanner.peek(1) == '<'
69: tag = @scanner.scan_until( />/ )
70:
71: case tag
72: when /^<!--/
73: token = HTMLComment.new( tag )
74: when /^<!/
75: token = DocType.new( tag )
76: when /^<\?/
77: token = ProcessingInstruction.new( tag )
78: else
79: token = HTMLTag.new( tag )
80: end
81: else
82: text = @scanner.scan( /[^<]+/ )
83: token = HTMLText.new( text )
84: end
85:
86: yield( token )
87: end
88: end
--- SEC00055
--- ""
---
- name: scanner
rw: R
a_desc: |+
The StringScanner doing the tokenizing
- name: source
rw: R
a_desc: |+
The HTML source being tokenized
---
- methods:
- visibility: public
aref: M000236
name: new
sourcecode: " <span class=\"ruby-comment cmt\"># File /Users/ged/source/ruby/Arrow/lib/arrow/htmltokenizer.rb, line 45</span>\n\
45: <span class=\"ruby-keyword kw\">def</span> <span class=\"ruby-identifier\">initialize</span>( <span class=\"ruby-identifier\">source</span> )\n\
46: <span class=\"ruby-ivar\">@source</span> = <span class=\"ruby-identifier\">source</span>\n\
47: <span class=\"ruby-ivar\">@scanner</span> = <span class=\"ruby-constant\">StringScanner</span>.<span class=\"ruby-identifier\">new</span>( <span class=\"ruby-identifier\">source</span> )\n\
48: <span class=\"ruby-keyword kw\">end</span>"
m_desc: |-
<p>
Create a <a href="HTMLTokenizer.html#M000236">new</a> Arrow::HTMLTokenizer
object.
</p>
params: ( source )
category: Class
type: Public
- methods:
- visibility: public
aref: M000237
name: each
sourcecode: " <span class=\"ruby-comment cmt\"># File /Users/ged/source/ruby/Arrow/lib/arrow/htmltokenizer.rb, line 64</span>\n\
64: <span class=\"ruby-keyword kw\">def</span> <span class=\"ruby-identifier\">each</span>\n\
65: <span class=\"ruby-ivar\">@scanner</span>.<span class=\"ruby-identifier\">reset</span>\n\
66: \n\
67: <span class=\"ruby-keyword kw\">until</span> <span class=\"ruby-ivar\">@scanner</span>.<span class=\"ruby-identifier\">empty?</span>\n\
68: <span class=\"ruby-keyword kw\">if</span> <span class=\"ruby-ivar\">@scanner</span>.<span class=\"ruby-identifier\">peek</span>(<span class=\"ruby-value\">1</span>) <span class=\"ruby-operator\">==</span> <span class=\"ruby-value str\">'&lt;'</span>\n\
69: <span class=\"ruby-identifier\">tag</span> = <span class=\"ruby-ivar\">@scanner</span>.<span class=\"ruby-identifier\">scan_until</span>( <span class=\"ruby-regexp re\">/&gt;/</span> )\n\
70: \n\
71: <span class=\"ruby-keyword kw\">case</span> <span class=\"ruby-identifier\">tag</span>\n\
72: <span class=\"ruby-keyword kw\">when</span> <span class=\"ruby-regexp re\">/^&lt;!--/</span>\n\
73: <span class=\"ruby-identifier\">token</span> = <span class=\"ruby-constant\">HTMLComment</span>.<span class=\"ruby-identifier\">new</span>( <span class=\"ruby-identifier\">tag</span> )\n\
74: <span class=\"ruby-keyword kw\">when</span> <span class=\"ruby-regexp re\">/^&lt;!/</span>\n\
75: <span class=\"ruby-identifier\">token</span> = <span class=\"ruby-constant\">DocType</span>.<span class=\"ruby-identifier\">new</span>( <span class=\"ruby-identifier\">tag</span> )\n\
76: <span class=\"ruby-keyword kw\">when</span> <span class=\"ruby-regexp re\">/^&lt;\\?/</span>\n\
77: <span class=\"ruby-identifier\">token</span> = <span class=\"ruby-constant\">ProcessingInstruction</span>.<span class=\"ruby-identifier\">new</span>( <span class=\"ruby-identifier\">tag</span> )\n\
78: <span class=\"ruby-keyword kw\">else</span>\n\
79: <span class=\"ruby-identifier\">token</span> = <span class=\"ruby-constant\">HTMLTag</span>.<span class=\"ruby-identifier\">new</span>( <span class=\"ruby-identifier\">tag</span> )\n\
80: <span class=\"ruby-keyword kw\">end</span>\n\
81: <span class=\"ruby-keyword kw\">else</span>\n\
82: <span class=\"ruby-identifier\">text</span> = <span class=\"ruby-ivar\">@scanner</span>.<span class=\"ruby-identifier\">scan</span>( <span class=\"ruby-regexp re\">/[^&lt;]+/</span> )\n\
83: <span class=\"ruby-identifier\">token</span> = <span class=\"ruby-constant\">HTMLText</span>.<span class=\"ruby-identifier\">new</span>( <span class=\"ruby-identifier\">text</span> )\n\
84: <span class=\"ruby-keyword kw\">end</span>\n\
85: \n\
86: <span class=\"ruby-keyword kw\">yield</span>( <span class=\"ruby-identifier\">token</span> )\n\
87: <span class=\"ruby-keyword kw\">end</span>\n\
88: <span class=\"ruby-keyword kw\">end</span>"
m_desc: |-
<p>
Enumerable interface: Iterates over parsed tokens, calling the supplied
block with <a href="HTMLTokenizer.html#M000237">each</a> one.
</p>
params: () {|token| ...}
category: Instance
type: Public
---
---
- name: SVNRev
desc: |+
SVN Revision
value: "%q$Rev: 437 $"
- name: SVNId
desc: |+
SVN Id
value: "%q$Id: htmltokenizer.rb 437 2008-03-28 00:49:20Z deveiant $"
Generated with the Darkfish Rdoc Generator.