Lua Balanced

lua-users home
wiki

The following code provides some functions that match delimited snippets of Lua code in a string. For example, it can match a Lua string, Lua comment, or Lua expression. It is useful in particular for source filters or parsing Lua snippets embedded in another language.

It is inspired by Damian Conway's Text::Balanced[1] in Perl.

This code is used by ListComprehensions.

The unique feature of this implementation is that it does not rigorously lex and parse the Lua grammar. It doesn't need to. It assumes during the parse that the Lua code is syntactically correct (which can be verified later using loadstring). By assuming this, extraction of delimited sequences is significantly simplified yet can still be robust, and it also supports supersets of the Lua grammar. The code, which is written entirely in Lua, is just under 200 lines of Lua code (compare to Yueliang used in MetaLua, where the lexer alone is a few hundred lines).

Examples

local lb = require "luabalanced"

-- Extract the Lua expression starting at position 4 (just after "if ").
-- match_expression returns the matched text and the position at which
-- matching stopped; print shows both, separated by a tab.
print(lb.match_expression("if x^2 + x > 5 then print(x) end", 4))
--> x^2 + x > 5     16

-- Extract the Lua string starting at (default) position 1.
-- Escaped quotes inside the string do not end the match.
print(lb.match_string([["test\"123" .. "more"]]))
--> "test\"123"     12

-- Break Lua code into typed snippets.
-- The callback receives the snippet type <u> ('e' = other code,
-- 'c' = comment, 's' = string) and the snippet text <s>.  It returns
-- nothing here, so each snippet is kept as-is; the string returned by
-- lb.gsub is discarded and only the printed trace is of interest.
lb.gsub([[
  local x = 1  -- test
  print("x=", x)
]], function(u, s)
  print(u .. '[' .. s .. ']')
end)
--[[output:
e[  local x = 1  ]
c[-- test
]
e[  print(]
s["x="]
e[, x)
]
]]

Tests/Examples

-- luabalanced_test.lua
-- test for luabalanced.lua 

local lb = require "luabalanced"
local tuple = require "tuple"

-- Test helper: raise an error unless <a> equals <b> (compared with ==,
-- so __eq metamethods are honoured).  The error is reported at the
-- caller's position (level 2) so failures point at the test line.
local function asserteq(a, b)
  if a == b then return end
  local lhs, rhs = tostring(a), tostring(b)
  error(lhs .. ' == ' .. rhs .. ' failed', 2)
end

-- Test helper: wrap matcher <f> so that it can be compared with asserteq.
-- The returned function calls f(s, pos) in protected mode and
--   * returns the plain string 'error' if <f> raised an error
--     (the error message itself is discarded), or
--   * returns all of <f>'s results packed into a tuple otherwise.
local function wrap2(f)
  -- Lua 5.1 has the global unpack; Lua 5.2+ only has table.unpack.
  local unpack = table.unpack or unpack
  return function(s, pos)
    -- res[1] is pcall's status flag; res[2..res.n] are f's results.
    local res = tuple(pcall(f, s, pos))
    if not res[1] then
      return 'error'
    end
    -- Pass res.n explicitly so trailing/embedded nils are preserved.
    return tuple(unpack(res, 2, res.n))
  end
end

--## match_bracketed tests
--
-- Each assertion compares the wrapped matcher's result with either a
-- tuple(<match>, <next position>) or the string 'error' (the matcher
-- raised an error).

-- test wrapper for lb.match_bracketed
local mb = wrap2(lb.match_bracketed)

-- trivial tests: input not starting with an opening bracket gives
-- (nil, pos); unbalanced or mismatched brackets raise an error.
asserteq(mb'', tuple(nil, 1))
asserteq(mb'a', tuple(nil, 1))
asserteq(mb'{', 'error')
asserteq(mb'}', tuple(nil, 1))
asserteq(mb'{[}]', 'error')
asserteq(mb('[{}]'), tuple('[{}]', 5))

-- test with pos: matching starts at the given position, not at 1.
asserteq(mb('[][a(a)a].', 3), tuple('[a(a)a]', 10))

-- test with strings: brackets inside quoted or long strings must not
-- affect balancing, and a leading long string is matched as a whole.
asserteq(mb('[ "]" ]'), tuple('[ "]" ]', 8))
asserteq(mb("[ '[' ]"), tuple("[ '[' ]", 8))
asserteq(mb("[ [=[ ]=] ]"), tuple("[ [=[ ]=] ]", 12))
asserteq(mb("[[ ] ]]"), tuple("[[ ] ]]", 8))
asserteq(mb("[=[ [ ]=]"), tuple("[=[ [ ]=]", 10))

--## match_expression tests
--
-- Each expected value is tuple(<matched text>, <position after match>).
-- Note that trailing whitespace before the terminating token is part
-- of the matched text.

-- test wrapper for lb.match_expression
local me = wrap2(lb.match_expression)
-- an expression ends where a new identifier follows a complete operand,
-- unless that identifier is a word operator (and/or/not).
asserteq(me'a', tuple('a', 2))
asserteq(me'a b=c', tuple('a ', 3))
asserteq(me'a and b', tuple('a and b', 8))
asserteq(me'a and b ', tuple('a and b ', 9))
asserteq(me'a and b c', tuple('a and b ', 9))
asserteq(me'a+b', tuple('a+b', 4))
asserteq(me'a+b b=c', tuple('a+b ', 5))
-- operands ending in a bracket or a quote also complete an expression.
asserteq(me'{function()end}+b c', tuple('{function()end}+b ', 19))
asserteq(me'{} e', tuple('{} ', 4))
asserteq(me'() e', tuple('() ', 4))
asserteq(me'"" e', tuple('"" ', 4))
asserteq(me"'' e", tuple("'' ", 4))
asserteq(me'a[1] e', tuple('a[1] ', 6))
asserteq(me'ab.cd e', tuple('ab.cd ', 7))
asserteq(me'ab:cd() e', tuple('ab:cd() ', 9))
asserteq(me'(x) (y) z', tuple('(x) (y) ', 9))
-- '>=' must not be mistaken for an assignment '='.
asserteq(me'x >= y', tuple('x >= y', 7))

-- numbers (including exponents with a sign)
asserteq(me'1e2 a', tuple('1e2 ', 5))
asserteq(me'1e+2 a', tuple('1e+2 ', 6))
asserteq(me'1.2e+2 a', tuple('1.2e+2 ', 8))
asserteq(me'.2e+2 a', tuple('.2e+2 ', 7))

-- comments: line and long comments are skipped; a comment between an
-- operand and the next identifier does not join them into one expression.
asserteq(me'a+ -- b\nc', tuple('a+ -- b\nc', 10))
asserteq(me'a --[[]] b', tuple('a --[[]] ', 10))
asserteq(me'a+ --[[]] b', tuple('a+ --[[]] b', 12))
asserteq(me'a --[[]] + b', tuple('a --[[]] + b', 13))
asserteq(me'a+ --[[]] --[=[]=] b', tuple('a+ --[[]] --[=[]=] b', 21))
asserteq(me'a+ -- b\n -- b\n b c', tuple('a+ -- b\n -- b\n b ', 18))

-- check for exceptions giving lots of possibly not syntactically
-- correct data: run the matcher from every position of this test file.
-- NOTE(review): wrap2 returns the bare string 'error' on failure and
-- drops the message, so res[1]/res[2] below index a string in that case
-- and the condition appears never to be true -- confirm whether this
-- check was meant to inspect the pcall message directly.
-- NOTE(review): the file handle returned by io.open is never closed and
-- a failed open is not checked.
local text = io.open'luabalanced_test.lua':read'*a'
for i=1,#text do
  local res = me(text,i)
  if res[1] == 'error' and not res[2]:match('syntax error') then
    error(res[2])
  end
end

--## match_explist tests

-- test wrapper for lb.match_explist: runs the protected matcher and
-- replaces the returned list of expressions (first result) by the
-- expressions joined with '|', so it can be compared as a string.
local function ml(...)
  local protected = wrap2(lb.match_explist)
  local result = protected(...)
  local items = result[1]
  result[1] = table.concat(items, '|')
  return result
end
asserteq(ml(' d'), tuple(' ', 2))
asserteq(ml('a+b,b*c d'), tuple('a+b|b*c ', 9))

--## match_namelist tests

-- test wrapper for lb.match_namelist: runs the protected matcher and
-- replaces the returned list of names (first result) by the names
-- joined with '|', so it can be compared as a string.
local function ml(...)
  local protected = wrap2(lb.match_namelist)
  local result = protected(...)
  local names = result[1]
  result[1] = table.concat(names, '|')
  return result
end
asserteq(ml(' '), tuple('', 1))
asserteq(ml('a b'), tuple('a', 3))
asserteq(ml('a,b d'), tuple('a|b', 5))
asserteq(ml('a,b+d'), tuple('a|b', 4))


--## gsub tests

local ls = lb.gsub

-- Replacement callback used by the tests below: renders every snippet
-- as "[<type>:<text>]" so the segmentation done by gsub is visible in
-- the result string.
local function f(u, s)
  return table.concat({'[', u, ':', s, ']'})
end

-- empty input yields empty output; plain code is a single 'e' snippet.
asserteq(ls('', f), '')
asserteq(ls(' ', f), '[e: ]')
-- strings ('s') and comments ('c') are split out of surrounding code.
asserteq(ls(' "z" ;', f), '[e: ][s:"z"][e: ;]')
asserteq(ls(' --[[z]] ;', f), '[e: ][c:--[[z]]][e: ;]')
asserteq(ls(' --z\n ;', f), '[e: ][c:--z\n][e: ;]')
asserteq(ls(' --z', f), '[e: ][c:--z]')
-- '[' that does not open a long string stays in the code snippet.
asserteq(ls('[][=[ ] ]=] ;', f), '[e:[]][s:[=[ ] ]=]][e: ;]')
-- a single '-' is an operator; '--' inside a string is not a comment.
asserteq(ls('a - b --[[d]] .. "--"', f), '[e:a - b ][c:--[[d]]][e: .. ][s:"--"]')

print 'DONE'

Implementation

-- luabalanced.lua
-- Extracts delimited Lua sequences from strings.[1]
-- Inspired by Damian Conway's Text::Balanced[2] in Perl.
--
--   [1] http://lua-users.org/wiki/LuaBalanced
--   [2] http://search.cpan.org/dist/Text-Balanced/lib/Text/Balanced.pm
--
-- (c) 2008, David Manura, Licensed under the same terms as Lua (MIT license).
--

local M = {}

local assert = assert
local table_concat = table.concat

-- Bracket lookup tables:
--   ends[opening]   --> matching closing bracket
--   begins[closing] --> matching opening bracket (inverse of ends)
local ends = { ['('] = ')', ['{'] = '}', ['['] = ']' }
local begins = {}
for opener, closer in pairs(ends) do
  begins[closer] = opener
end


-- Match Lua string in string <s> starting at position <pos> (default 1).
-- Returns <string>, <posnew>, where <string> is the matched
-- string including its delimiters (or nil on no match) and <posnew> is
-- the position of the character following the match (or <pos> on no
-- match).
-- Supports all Lua string syntax: "...", '...', [[...]], [=[...]=], etc.
-- Raises an error whose message starts with 'syntax error' if a string
-- is opened at <pos> but never terminated.
local function match_string(s, pos)
  pos = pos or 1
  local posa = pos  -- start of the string, kept for the final s:sub
  local c = s:sub(pos,pos)

  if c == '"' or c == "'" then
    -- Quoted string: hop from one "interesting" character (the quote
    -- itself or a backslash) to the next.
    pos = pos + 1
    while true do
      pos = assert(s:find("[" .. c .. "\\]", pos), 'syntax error')
      if s:sub(pos,pos) == c then
        return s:sub(posa, pos), pos + 1
      end
      -- Backslash: skip it together with the character it escapes.
      pos = pos + 2
    end
  end

  -- Long string: "[", zero or more "=", "[".
  local sc = s:match("^%[(=*)%[", pos)
  if not sc then
    return nil, pos
  end
  -- The closing bracket must carry the same number of "=" signs; a
  -- plain (non-pattern) search is enough since the text is literal.
  local _, pose = s:find("]" .. sc .. "]", pos, true)
  -- Report the failure like every other matcher does, rather than
  -- with a bare "assertion failed!".
  assert(pose, 'syntax error: long string not terminated')
  return s:sub(posa, pose), pose + 1
end
M.match_string = match_string


-- Match a bracketed Lua sequence at position <pos> (default 1) of <s>:
-- "(...)", "{...}", "[...]", or a long string "[[...]]", "[=[...]=]".
-- Returns <text>, <posnew> like match_string: the bracketed text with
-- its delimiters and the position following it, or nil, <pos> when the
-- character at <pos> is not an opening bracket.
-- Brackets inside quoted or long strings are ignored while balancing.
-- Raises a 'syntax error: ...' error on unbalanced/mismatched brackets
-- or an unterminated long string.
local function match_bracketed(s, pos)
  local start = pos or 1
  if not ends[s:sub(start, start)] then
    return nil, start
  end

  local open = {}   -- stack of bracket characters currently open
  local cur = start
  while true do
    -- Jump to the next bracket or quote character.
    cur = s:find('[%(%{%[%)%}%]\"\']', cur)
    assert(cur, 'syntax error: unbalanced')
    local ch = s:sub(cur, cur)
    local closer = ends[ch]

    if ch == '"' or ch == "'" then
      -- Quoted string: skip it wholesale.
      local str
      str, cur = match_string(s, cur)
      assert(str)
    elseif closer then
      -- Opening bracket; "[" may actually start a long string.
      local level, body
      if ch == '[' then level, body = s:match('^%[(=*)%[()', cur) end
      if not level then
        open[#open+1] = ch
        cur = cur + 1
      else
        cur = s:match('%]' .. level .. '%]()', body)
        assert(cur, 'syntax error: long string not terminated')
        -- A long string found with nothing open is the whole match.
        if #open == 0 then
          return s:sub(start, cur-1), cur
        end
      end
    else
      -- Closing bracket: must pair with the innermost open bracket.
      assert(open[#open] == assert(begins[ch]), 'syntax error: unbalanced')
      open[#open] = nil
      if #open == 0 then
        return s:sub(start, cur), cur+1
      end
      cur = cur + 1
    end
  end
end
M.match_bracketed = match_bracketed


-- Match Lua comment at position <pos> (default 1) of <s>, e.g.
-- "--...\n", "--[[...]]", "--[=[...]=]", etc.
-- Returns <comment>, <posnew>: the comment text including the leading
-- "--" (and, for a line comment, the terminating newline if present)
-- and the position following it, or nil, <pos> when no comment starts
-- at <pos>.
-- Raises 'syntax error: long comment not terminated' if a long comment
-- is opened but never closed.
local function match_comment(s, pos)
  pos = pos or 1
  if s:sub(pos, pos+1) ~= '--' then
    return nil, pos
  end
  local posa = pos  -- start of the comment, including the "--"
  pos = pos + 2

  -- Only a long bracket ("[", zero or more "=", "[") directly after the
  -- "--" starts a long comment.  Quote characters must NOT be treated
  -- as string openers here: `--"a" b` is an ordinary line comment.
  local level = s:match('^%[(=*)%[', pos)
  if level then
    local _, pose = s:find(']' .. level .. ']', pos, true)
    assert(pose, 'syntax error: long comment not terminated')
    return s:sub(posa, pose), pose + 1
  end

  -- Line comment: runs to the end of the line (newline included) or to
  -- the end of <s>.
  local part; part, pos = s:match('^([^\n]*\n?)()', pos)
  return '--' .. part, pos
end


-- Match Lua expression, e.g. "a + b * c[e]", in <s> starting at
-- position <pos> (default 1).
-- Function interface is similar to match_string: returns <text>,
-- <posnew>.  The scan skips strings, comments and bracketed groups and
-- stops at the first token that cannot continue the expression:
--   * a closing bracket, ';', ',' or a lone '=' (not part of '==',
--     '>=', '<=' or '~='), or
--   * an identifier/number that directly follows a completed operand
--     (an identifier that is not and/or/not, a closing bracket or a
--     closing quote).
-- <text> is everything from <pos> up to (not including) that token and
-- <posnew> is the token's position.  If the end of <s> is reached,
-- <text> is the rest of <s> and <posnew> is #s+1.
-- The input is assumed to be syntactically valid; errors raised by the
-- string/comment/bracket matchers propagate to the caller.

-- keyword operators: an identifier after one of these continues the
-- expression, and these words themselves never end it.
local wordop = {['and']=true, ['or']=true, ['not']=true}
-- characters that, placed before '=', form a comparison operator.
local is_compare = {['>']=true, ['<']=true, ['~']=true}
local function match_expression(s, pos)
  pos = pos or 1
  local posa = pos      -- start of the expression
  local lastident       -- most recent identifier/number token seen
  local poscs, posce    -- start of last comment run / position after it
  while pos do
    local c = s:sub(pos,pos)
    if c == '"' or c == "'" or c == '[' and s:find('^[=%[]', pos+1) then
      -- quoted string, or '[' followed by '=' or '[' (long string)
      local part; part, pos = match_string(s, pos)
      assert(part, 'syntax error')
    elseif c == '-' and s:sub(pos+1,pos+1) == '-' then
      -- note: handle adjacent comments in loop to properly support
      -- backtracking (poscs/posce).
      poscs = pos
      while s:sub(pos,pos+1) == '--' do
        local part; part, pos = match_comment(s, pos)
        assert(part)
        -- skip whitespace after the comment; posce records where the
        -- comment run (plus trailing whitespace) ends.
        pos = s:match('^%s*()', pos)
        posce = pos
      end
    elseif c == '(' or c == '{' or c == '[' then
      -- bracketed group: skip to the position after its closing bracket
      local part; part, pos = match_bracketed(s, pos)
    elseif c == '=' and s:sub(pos+1,pos+1) == '=' then
      pos = pos + 2  -- skip over two-char op containing '='
    elseif c == '=' and is_compare[s:sub(pos-1,pos-1)] then
      pos = pos + 1  -- skip over two-char op containing '='
    elseif c:match'^[%)%}%];,=]' then
      -- terminator: the expression ends just before this character
      local part = s:sub(posa, pos-1)
      return part, pos
    elseif c:match'^[%w_]' then
      -- identifier, keyword or number fragment
      local newident,newpos = s:match('^([%w_]+)()', pos)
      if pos ~= posa and not wordop[newident] then -- non-first ident
        -- Look at the last non-space character before this token.  If
        -- a comment run ends exactly here, look before that run instead.
        local pose = ((posce == pos) and poscs or pos) - 1
        while s:match('^%s', pose) do pose = pose - 1 end
        local ce = s:sub(pose,pose)
        -- The previous token completed an operand if it ended with a
        -- closing bracket/quote, or was an identifier other than a
        -- word operator; then this token starts something new.
        if ce:match'[%)%}\'\"%]]' or
           ce:match'[%w_]' and not wordop[lastident]
        then
          local part = s:sub(posa, pos-1)
          return part, pos
        end
      end
      lastident, pos = newident, newpos
    else
      pos = pos + 1
    end
    -- jump to the next character that needs inspection (nil at the end
    -- of <s>, which terminates the loop)
    pos = s:find('[%(%{%[%)%}%]\"\';,=%w_%-]', pos)
  end
  -- reached the end of <s>: the expression is everything from posa on
  local part = s:sub(posa, #s)
  return part, #s+1
end
M.match_expression = match_expression


-- Match name list (zero or more comma-separated names), e.g. "a,b,c",
-- in <s> starting at position <pos> (default 1).
-- Function interface is similar to match_string, but the match is
-- returned as an array of names.  The second result is the position
-- following the last name and any whitespace after it, or <pos> when no
-- name was found.  Whitespace before the first name is not skipped.
local function match_namelist(s, pos)
  local cursor = pos or 1
  local names = {}
  local name_pat = '([%a_][%w_]*)%s*()'

  -- The first name must start right at the cursor; every further name
  -- must be introduced by a comma (optionally surrounded by spaces).
  local name, after = s:match('^' .. name_pat, cursor)
  while name do
    names[#names+1] = name
    cursor = after
    name, after = s:match('^%s*,%s*' .. name_pat, cursor)
  end
  return names, cursor
end
M.match_namelist = match_namelist


-- Match expression list (comma-separated expressions), e.g. "a+b,b*c",
-- in <s> starting at position <pos> (default 1).
-- Function interface is similar to match_string, but the match is
-- returned as an array of expression strings.  match_expression is
-- always called once, so the array holds at least one entry.
local function match_explist(s, pos)
  local cursor = pos or 1
  local exprs = {}
  repeat
    local expr
    expr, cursor = match_expression(s, cursor)
    exprs[#exprs+1] = assert(expr, 'syntax error')
    -- Continue only if a comma follows; step over it and its spacing.
    local after_comma = s:match('^%s*,%s*()', cursor)
    if after_comma then cursor = after_comma end
  until not after_comma
  return exprs, cursor
end
M.match_explist = match_explist


-- Replace snippets of code in Lua code string <s>
-- using replacement function f(u,sin) --> sout.
-- <u> is the type of snippet ('c' = comment, 's' = string,
-- 'e' = any other code).
-- Snippet is replaced with <sout> (unless <sout> is nil or false, in
-- which case the original snippet is kept).  <sout> should be a string
-- (or number), since the pieces are joined with table.concat.
-- Empty 'e' snippets are never passed to <f>.
-- Returns the string built from all (possibly replaced) snippets.
-- This is somewhat analogous to string.gsub .
local function gsub(s, f)
  local pos = 1
  local posa = 1   -- start of the code snippet not yet emitted
  -- Pieces are collected in a table and joined once at the end, to
  -- avoid rebuilding an ever-growing result string for every snippet.
  local out = {}

  -- Run snippet <sin> of type <u> through <f> and queue the result.
  local function emit(u, sin)
    out[#out+1] = f(u, sin) or sin
  end

  -- Emit the pending code snippet s[posa..last], if it is non-empty.
  local function emit_code(last)
    local exp = s:sub(posa, last)
    if #exp > 0 then emit('e', exp) end
  end

  while true do
    -- Next character that could start a comment or a string.
    pos = s:find('[%-\'\"%[]', pos)
    if not pos then break end
    if s:match('^%-%-', pos) then
      emit_code(pos-1)
      local comment; comment, pos = match_comment(s, pos)
      emit('c', assert(comment))
      posa = pos
    else
      -- posb is nil for a single '-' (an operator, not a comment).
      local posb = s:find('^[\'\"%[]', pos)
      local str
      if posb then str, pos = match_string(s, posb) end
      if str then
        emit_code(posb-1)
        emit('s', str)
        posa = pos
      else
        -- '-' operator or '[' that is not a long string: plain code.
        pos = pos + 1
      end
    end
  end
  emit_code(-1)  -- trailing code up to the end of <s>
  return table_concat(out)
end
M.gsub = gsub


return M

The following file is used by the test suite:

-- tuple.lua
-- Simple tuple implementation using tables.
-- (c) 2008, David Manura, Licensed under the same terms as Lua (MIT license).

local select = select
local tostring = tostring
local setmetatable = setmetatable
local table_concat = table.concat

-- Metatable shared by all tuples (provides tostring and ==).
local mt = {}

-- Create a tuple holding the given values.  The values are stored at
-- indices 1..n and the count in field <n>, so nil values (including
-- trailing ones) are preserved.
local function tuple(...)
  local t = {n = select('#', ...), ...}
  return setmetatable(t, mt)
end

-- Render as "tuple(v1,v2,...)"; string elements are quoted with %q,
-- all other elements go through tostring.
mt.__tostring = function(self)
  local parts = {}
  for i = 1, self.n do
    local v = self[i]
    if type(v) == 'string' then
      parts[i] = string.format('%q', v)
    else
      parts[i] = tostring(v)
    end
  end
  return 'tuple(' .. table_concat(parts, ',') .. ')'
end

-- Two tuples are equal when they have the same length and all
-- corresponding elements compare equal.
mt.__eq = function(a, b)
  if a.n == b.n then
    for i = 1, a.n do
      if a[i] ~= b[i] then return false end
    end
    return true
  end
  return false
end

return tuple
--DavidManura

Status

This module is new and likely still has some bugs.


FindPage · RecentChanges · preferences
edit · history
Last edited September 10, 2008 4:45 pm GMT (diff)