method
tokenize

v1_9_3_392 - Show latest stable - Class: RDoc::Markup::Parser
tokenize(input)
public
Turns text input into a stream of tokens
# File lib/rdoc/markup/parser.rb, line 391
  ##
  # Turns text +input+ into a stream of tokens appended to <tt>@tokens</tt>.
  #
  # Each token is an Array of the form <tt>[type, data, *token_pos(offset)]</tt>
  # where +offset+ is the scanner position at which the token started.
  # <tt>@line</tt> and <tt>@line_pos</tt> are reset at the start and updated
  # on every newline so that token_pos can be computed relative to the
  # current line.
  #
  # Token types emitted: :NEWLINE, :HEADER, :TEXT, :RULE, :BULLET, :LALPHA,
  # :UALPHA, :NUMBER, :LABEL and :NOTE.
  #
  # Raises ParseError if a list label cannot be classified (indicates a bug
  # in the list-label pattern).
  #
  # Returns +self+.

  def tokenize input
    s = StringScanner.new input

    @line = 0
    @line_pos = 0

    until s.eos?
      pos = s.pos

      # leading spaces will be reflected by the column of the next token
      # the only thing we lose are trailing spaces at the end of the file
      next if s.scan(/ +/)

      # note: after BULLET, LABEL, etc.,
      # indent will be the column of the next non-newline token

      @tokens << case
                 # [CR]LF => :NEWLINE
                 when s.scan(/\r?\n/) then
                   token = [:NEWLINE, s.matched, *token_pos(pos)]
                   # the next line starts right after the newline just consumed
                   @line_pos = s.pos
                   @line += 1
                   token
                 # === text => :HEADER then :TEXT
                 when s.scan(/(=+)(\s*)/) then
                   level = s[1].length
                   header = [:HEADER, level, *token_pos(pos)]

                   if s[2] =~ /^\r?\n/ then
                     # header with no text: give the whitespace (including
                     # the newline) back so it is tokenized as :NEWLINE
                     s.pos -= s[2].length
                     header
                   else
                     text_pos = s.pos
                     s.scan(/.*/)
                     # push the header now; the :TEXT token returned from
                     # this branch is appended right after it
                     @tokens << header
                     [:TEXT, s.matched.sub(/\r$/, ''), *token_pos(text_pos)]
                   end
                 # --- (at least 3) and nothing else on the line => :RULE
                 when s.scan(/(-{3,}) *$/) then
                   [:RULE, s[1].length - 2, *token_pos(pos)]
                 # * or - followed by white space and text => :BULLET
                 when s.scan(/([*-]) +(\S)/) then
                   s.pos -= s[2].bytesize # unget \S
                   [:BULLET, s[1], *token_pos(pos)]
                 # A. text, a. text, 12. text => :UALPHA, :LALPHA, :NUMBER
                 #
                 # The letter class lists both cases explicitly (rather than
                 # relying on a case-insensitive flag) so that upper-case
                 # labels are recognized and only characters the
                 # classification below handles can match.
                 when s.scan(/([a-zA-Z]|\d+)\. +(\S)/) then
                   # FIXME if tab(s), the column will be wrong
                   # either support tabs everywhere by first expanding them to
                   # spaces, or assume that they will have been replaced
                   # before (and provide a check for that at least in debug
                   # mode)
                   list_label = s[1]
                   s.pos -= s[2].bytesize # unget \S
                   list_type =
                     case list_label
                     when /[a-z]/ then :LALPHA
                     when /[A-Z]/ then :UALPHA
                     when /\d/    then :NUMBER
                     else
                       raise ParseError, "BUG token #{list_label}"
                     end
                   [list_type, list_label, *token_pos(pos)]
                 # [text] followed by spaces or end of line => :LABEL
                 when s.scan(/\[(.*?)\]( +|$)/) then
                   [:LABEL, s[1], *token_pos(pos)]
                 # text:: followed by spaces or end of line => :NOTE
                 when s.scan(/(.*?)::( +|$)/) then
                   [:NOTE, s[1], *token_pos(pos)]
                 # anything else: :TEXT
                 else
                   s.scan(/.*/)
                   # drop the CR of a CRLF line ending; the LF is tokenized
                   # separately as :NEWLINE
                   [:TEXT, s.matched.sub(/\r$/, ''), *token_pos(pos)]
                 end
    end

    self
  end
tokenize

Related methods