Recursive Descent Parser in Ruby

嗯嗯嗯… 用 Ruby 來寫 recursive descent parser 真是妙，因為所有的 grammar 都可以用 Ruby 的語法來寫，換句話說，就是把 code 當成 data 來使用，例如：

start :expr do
  match(:expr, '+', :term) {|a, op, b| a + b}
  match(:expr, '-', :term) {|a, op, b| a - b}
  match(:term)
end

要增刪一條 rule 非常地直覺與簡單，同樣的方式要用 C 來寫就麻煩多了。本來是想自己寫一份新的，但是看了 Dennis Ranke 的方法後，不知不覺就跟著他的方法一直抄了，我覺得不太可能寫得比他的版本更好了。下面是我的版本：

class Parser
  class Lexer
    TokenMatch = Struct.new(:name, :pattern, :eval)

    attr_accessor :pos
    attr_reader :matches

    def initialize
      @matches = []
    end

    def add_match(name, pattern, eval)
      @matches << TokenMatch.new(name, pattern, eval)
    end

    def next_token
      @pos += 1
      return @tokens[@pos - 1]
    end

    def lex(code)
      @tokens = []
      code = code.gsub(/\s/, "")
      loop do
        break unless extract_token(code) do |token, remaining|
          @tokens << token
          code = remaining
        end
      end
      @pos = 0
    end

    def extract_token(code)
      @matches.each do |match|
        if match.pattern =~ code
          token = match.eval.call($~[0])
          yield(token, $~.post_match) if block_given?
          return token
        end
      end
      nil
    end
  end

  class Production

    DerivationMatch = Struct.new(:pattern, :eval)

    def initialize(name, parser)
      @name = name
      @parser = parser
      @matches = []
      @lrmatches = []
    end

    def match(*pattern, &eval)
      match = DerivationMatch.new(pattern, eval)
      if pattern[0] == @name
        pattern.shift
        @lrmatches << match
      else
        @matches << match
      end
    end

    def parse(indent = 0)
      # print " " * indent, "parse #{@name}\n"
      match_result = try_matches(@matches, nil, indent + 1)
      return nil unless match_result

      loop do
        result = try_matches(@lrmatches, match_result, indent + 1)
        unless result
          # print " " * indent, "result is #{match_result}\n"
          return match_result
        end
        match_result = result
      end
      return match_result
    end

    def try_matches(matches, pre_result, indent)
      start = @parser.lexer.pos
      matches.each do |match|
        results = pre_result ? [pre_result] : []
        # print " " * indent, "try #{match.pattern.inspect}\n"
        match.pattern.each do |expected|
          if Symbol === expected
            result = @parser.productions[expected].parse(indent + 1)
            results << result
            unless results.last
              results.clear
              break
            end
          else
            result = @parser.lexer.next_token
            if not expected === result
              # print " " * (indent + 1), "'#{expected}' rather than '#{result}' is expected. backtrack!\n"
              results = []
              break
            else
              results << result
            end
          end
        end

        if results.empty?
          @parser.lexer.pos = start
        elsif match.eval.nil?
          return results.first
        else
          # print " " * indent, "eval #{results.inspect}\n"
          return match.eval.call(*results)
        end
      end
      nil
    end

  end

  attr_reader :lexer
  attr_reader :productions

  def initialize(&block)
    @lexer = Lexer.new
    @productions = {}
    instance_eval(&block)
  end

  def token(name, pattern, &eval)
    @lexer.add_match(name, pattern, eval)
  end

  def production(name, &block)
    prod = Production.new(name, self)
    prod.instance_eval(&block)
    @productions[name] = prod
  end

  def start(name, &block)
    @start = production(name, &block)
  end

  def parse(code)
    @lexer.lex(code)
    @start.parse(0)
  end

end

這邊是測試碼：

class TestCompiler < Test::Unit::TestCase

  def setup
    @parser = Parser.new do
      # lexer here
      token(:literal, /^\d+/) { |l| l.to_i }
      token(:op, /^(\*\*|[\+\-\*\/\%\(|)])/) { |op| op }

      # grammar here
      start :expr do
        match(:expr, '+', :term) {|a, op, b| a + b}
        match(:expr, '-', :term) {|a, op, b| a - b}
        match(:term)
      end

      production :term do
        match(:term, '*', :pow) {|a, op, b| a * b}
        match(:term, '/', :pow) {|a, op, b| a / b}
        match(:term, '%', :pow) {|a, op, b| a % b}
        match(:pow)
      end

      production :pow do
        match(:pow, '**', :primary)  {|a, op, b| a ** b}
        match(:primary)
      end

      production :primary do
        match(Integer)
        match('-', Integer) {|minus, i| 0 - i}
        match('(', :expr, ')') {|lp, expr, rp| expr}
      end
    end
  end

  def test_01
    assert_equal 2+2, @parser.parse('2+2')
    assert_equal 2-2, @parser.parse('2-2')
    assert_equal 2*2, @parser.parse('2*2')
    assert_equal 2**2, @parser.parse('2**2')
    assert_equal 2/2, @parser.parse('2/2')
    assert_equal 2%2, @parser.parse('2%2')
    assert_equal 3%2, @parser.parse('3%2')
  end

  def test_02
    assert_equal 2+2+2, @parser.parse('2+2+2')
    assert_equal 2-2-2, @parser.parse('2-2-2')
    assert_equal 2*2*2, @parser.parse('2*2*2')
    assert_equal 2**2**2, @parser.parse('2**2**2')
    assert_equal 4/2/2, @parser.parse('4/2/2')
    assert_equal 7%2%1, @parser.parse('7%2%1')
  end

  def test_03
    assert_equal 2+2-2, @parser.parse('2+2-2')
    assert_equal 2-2+2, @parser.parse('2-2+2')
    assert_equal 2*2+2, @parser.parse('2*2+2')
    assert_equal 2**2+2, @parser.parse('2**2+2')
    assert_equal 4/2+2, @parser.parse('4/2+2')
    assert_equal 7%2+1, @parser.parse('7%2+1')
  end

  def test_04
    assert_equal 2+(2-2), @parser.parse('2+(2-2)')
    assert_equal 2-(2+2), @parser.parse('2-(2+2)')
    assert_equal 2+(2*2), @parser.parse('2+(2*2)')
    assert_equal 2*(2+2), @parser.parse('2*(2+2)')
    assert_equal 2**(2+2), @parser.parse('2**(2+2)')
    assert_equal 4/(2+2), @parser.parse('4/(2+2)')
    assert_equal 7%(2+1), @parser.parse('7%(2+1)')
  end

  def test_05
    assert_equal -2+(2-2), @parser.parse('-2+(2-2)')
    assert_equal 2-(-2+2), @parser.parse('2-(-2+2)')
    assert_equal 2+(2*-2), @parser.parse('2+(2*-2)')
  end

  def test_06
    assert_equal (3/3)+(8-2), @parser.parse('(3/3)+(8-2)')
    assert_equal (1+3)/(2/2)*(10-8), @parser.parse('(1+3)/(2/2)*(10-8)')
    assert_equal (1*3)*4*(5*6), @parser.parse('(1*3)*4*(5*6)')
    assert_equal (10%3)*(2+2), @parser.parse('(10%3)*(2+2)')
    assert_equal 2**(2+(3/2)**2), @parser.parse('2**(2+(3/2)**2)')
    assert_equal (10/(2+3)*4), @parser.parse('(10/(2+3)*4)')
    assert_equal 5+((5*4)%(2+1)), @parser.parse('5+((5*4)%(2+1))')
  end

end

“A language that doesn’t affect the way you think about programming is not worth knowing” — Alan Perlis

Ruby 正在改變我對 programming 的想法。