require 'lib/grammar'
module Abnf

  # Lexical analyzer for ABNF (RFC 5234) grammar text.
  # Splits an input string into a flat array of Mapper::Token instances.
  class Tokenizer

    # Build the ordered rule table. Each rule is a pair [regexp, action]:
    # * action is a Symbol  -> emit one Mapper::Token of that type, with the
    #   regexp's first capture group (if any) as the token's data;
    # * action is an Array  -> the matched text is re-tokenized recursively
    #   against that nested rule table (used for comments and %b/%d/%x values).
    # Rule order matters: longer/more specific patterns (e.g. CRLF, VCHAR)
    # must precede their prefixes (CR, CHAR), and core-rule keywords must
    # precede the generic :symbol rule.
    def initialize
      @rex = [
        [ /\A\r?\n/m, :newline ],
        [ /\A[ \t\f]+/m, :space ],
        # A comment runs to end-of-line; re-tokenize it into :comment + :newline.
        [ /\A(;[^\r\n]*\r?\n)/m, [
                             [ /\A;([^\r\n]*)/m, :comment ],
                             [ /\A\r?\n/m,   :newline ]
                           ]
        ],
        # RFC 5234 core rules (keywords). CRLF before CR, VCHAR before CHAR, etc.
        [ /\AALPHA/m, :_alpha ],
        [ /\ABIT/m, :_bit ],
        [ /\AVCHAR/m, :_vchar ],
        [ /\ACHAR/m, :_char ],
        [ /\ACRLF/m, :_crlf ],
        [ /\ACR/m, :_cr ],
        [ /\ALF/m, :_lf ],
        [ /\ACTL/m, :_ctl ],
        [ /\ADIGIT/m, :_digit ],
        [ /\ADQUOTE/m, :_dquote ],
        [ /\AHEXDIG/m, :_hexdig ],
        [ /\AHTAB/m, :_htab ],
        [ /\ALWSP/m, :_lwsp ],
        [ /\AWSP/m, :_wsp ],
        [ /\ASP/m, :_sp ],
        [ /\AOCTET/m, :_octet ],
        # Rule names, either bare or in angle brackets (prose form).
        [ /\A([A-Za-z][\w\-]*)/m, :symbol ],
        [ /\A<([A-Za-z][\w\-]*)>/m, :symbol ],
        [ /\A"([^"]*?)"/m, :literal ],
        # Numeric terminal values: binary / decimal / hex, each re-tokenized
        # into range (lo-hi), single entity, or dot-separator tokens.
        [ /\A%b([01][01\-\.]*)/m, [
                                   [ /\A([01]+-[01]+)/m, :range_bin ],
                                   [ /\A([01]+)/m, :entity_bin ],
                                   [ /\A\./m, :dot ]
                                  ]
        ],
        [ /\A%d([\d][\d\-\.]*)/m, [
                                   [ /\A(\d+-\d+)/m, :range_dec ],
                                   [ /\A(\d+)/m, :entity_dec ],
                                   [ /\A\./m, :dot ]
                                  ]
        ],
        [ /\A%x([a-fA-F\d][a-fA-F\d\-\.]*)/m, [
                                               [ /\A([a-fA-F\d]+-[a-fA-F\d]+)/m, :range_hex ],
                                               [ /\A([a-fA-F\d]+)/m, :entity_hex ],
                                               [ /\A\./m, :dot ]
                                              ]
        ],

        [ /\A(\d+)/m, :number ],
        [ /\A\*/m, :asterisk ],
        [ /\A=\//m, :eq_slash ],   # incremental alternative "=/" before plain "="
        [ /\A=/m, :equals ],
        [ /\A\(/m, :seq_begin ],
        [ /\A\)/m, :seq_end ],
        [ /\A\[/m, :opt_begin ],
        [ /\A\]/m, :opt_end ],

        [ /\A\//m, :slash ]
      ]
    end

    # Tokenize the ABNF source text +txt+.
    # Returns an Array of Mapper::Token, always terminated by an :eof token.
    # Raises RuntimeError if the input contains an unrecognized character.
    def tokenize( txt )
      tokens = tokenize_internal( txt, @rex )
      tokens.push Mapper::Token.new( :eof )
    end

    protected

    # Core scanner: repeatedly try the rules in +rex+ against the head of the
    # text, consuming the first match. Array-valued actions recurse into the
    # matched substring with the nested rule table. Returns the token Array.
    def tokenize_internal( txt, rex )
      # Strip the leading indentation level from every line before scanning,
      # so indented grammar bodies tokenize the same as flush-left ones.
      indentation = /^(\s*)/.match(txt).to_s.size
      text = txt.gsub( /^[ \t\f]{0,#{indentation}}/, '' )
      tokens = []
      matched = nil  # declared here so the post-loop check can see it
      until text.empty?
        rex.each do |rule|
          expr, action = rule
          matched = expr.match( text )
          next if matched.nil?
          consumed = matched[0].to_s
          # First capture group (if present) becomes the token's payload.
          data = (matched.size > 1) ? matched[1].to_s : nil
          if action.is_a?( Array )
            # Nested rule table: re-tokenize the captured text recursively.
            tokens.concat tokenize_internal( data, action )
          else
            tokens.push Mapper::Token.new( action, data )
          end
          text = text[ consumed.size...text.size ]
          break
        end
        raise "Tokenizer: unexpected tokens near '#{text.slice(0..10)}'" if matched.nil?
      end

      tokens
    end
  end
end