diff options
| author | unwox <me@unwox.com> | 2024-09-26 17:46:38 +0600 |
|---|---|---|
| committer | unwox <me@unwox.com> | 2024-09-26 17:46:38 +0600 |
| commit | 9b82db238f9e2e02a76f95c793f8d6ef2387ecfd (patch) | |
| tree | cdb2a16d01f09553b560ab1034d53392d07bae42 /vendor/lpeglj/lpcode.lua | |
init
Diffstat (limited to 'vendor/lpeglj/lpcode.lua')
| -rw-r--r-- | vendor/lpeglj/lpcode.lua | 1057 |
1 files changed, 1057 insertions, 0 deletions
diff --git a/vendor/lpeglj/lpcode.lua b/vendor/lpeglj/lpcode.lua new file mode 100644 index 0000000..365d80b --- /dev/null +++ b/vendor/lpeglj/lpcode.lua @@ -0,0 +1,1057 @@ +--[[ +LPEGLJ +lpcode.lua +Generating code from tree +Copyright (C) 2014 Rostislav Sacek. +based on LPeg v1.0 - PEG pattern matching for Lua +Lua.org & PUC-Rio written by Roberto Ierusalimschy +http://www.inf.puc-rio.br/~roberto/lpeg/ + +** Permission is hereby granted, free of charge, to any person obtaining +** a copy of this software and associated documentation files (the +** "Software"), to deal in the Software without restriction, including +** without limitation the rights to use, copy, modify, merge, publish, +** distribute, sublicense, and/or sell copies of the Software, and to +** permit persons to whom the Software is furnished to do so, subject to +** the following conditions: +** +** The above copyright notice and this permission notice shall be +** included in all copies or substantial portions of the Software. +** +** THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +** EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +** MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +** IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +** CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +** SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +** +** [ MIT license: http://www.opensource.org/licenses/mit-license.php ] +--]] +local ffi = require "ffi" +require "lpvm" + +local band, bor, bnot, rshift, lshift = bit.band, bit.bor, bit.bnot, bit.rshift, bit.lshift + +local TChar = 0 +local TSet = 1 +local TAny = 2 -- standard PEG elements +local TTrue = 3 +local TFalse = 4 +local TRep = 5 +local TSeq = 6 +local TChoice = 7 +local TNot = 8 +local TAnd = 9 +local TCall = 10 +local TOpenCall = 11 +local TRule = 12 -- sib1 is rule's pattern, sib2 is 'next' rule +local TGrammar = 13 -- sib1 is initial (and first) rule +local TBehind = 14 -- match behind +local TCapture = 15 -- regular capture +local TRunTime = 16 -- run-time capture + + +local IAny = 0 -- if no char, fail +local IChar = 1 -- if char != val, fail +local ISet = 2 -- if char not in val, fail +local ITestAny = 3 -- in no char, jump to 'offset' +local ITestChar = 4 -- if char != val, jump to 'offset' +local ITestSet = 5 -- if char not in val, jump to 'offset' +local ISpan = 6 -- read a span of chars in val +local IBehind = 7 -- walk back 'val' characters (fail if not possible) +local IRet = 8 -- return from a rule +local IEnd = 9 -- end of pattern +local IChoice = 10 -- stack a choice; next fail will jump to 'offset' +local IJmp = 11 -- jump to 'offset' +local ICall = 12 -- call rule at 'offset' +local IOpenCall = 13 -- call rule number 'offset' (must be closed to a ICall) +local ICommit = 14 -- pop choice and jump to 'offset' +local IPartialCommit = 15 -- update top choice to current position and jump +local IBackCommit = 16 -- "fails" but jump to its own 'offset' +local IFailTwice = 17 -- pop one choice and then fail +local IFail = 18 -- go back to saved state on choice and jump to saved offset +local IGiveup = 19 -- internal use +local IFullCapture = 20 -- complete capture of last 'off' chars +local IOpenCapture = 21 -- start a capture +local ICloseCapture = 22 +local ICloseRunTime = 23 + + +local Cclose = 0 +local Cposition = 1 +local Cconst = 2 +local Cbackref = 3 +local Carg = 4 +local Csimple = 5 +local Ctable = 6 +local Cfunction = 7 +local Cquery = 8 +local Cstring = 9 +local Cnum = 10 +local Csubst = 11 +local Cfold = 12 +local Cruntime = 13 +local Cgroup = 14 + + +local PEnullable = 0 +local PEnofail = 1 +local RuleLR = 0x10000 +local NOINST = -2 + + +local MAXBEHINDPREDICATE = 255 +local MAXRULES = 200 +local MAXOFF = 0xF + +-- number of siblings for each tree +local numsiblings = { + 0, 0, 0, -- char, set, any + 0, 0, -- true, false + 1, -- rep + 2, 2, -- seq, choice + 1, 1, -- not, and + 0, 0, 2, 1, -- call, opencall, rule, grammar + 1, -- behind + 1, 1 -- capture, runtime capture +} + + +local patternelement = ffi.typeof('PATTERN_ELEMENT') +local pattern = ffi.typeof('PATTERN') +local settype = ffi.typeof('int32_t[8]') +local fullset = settype(-1, -1, -1, -1, -1, -1, -1, -1) + +-- {====================================================== +-- Analysis and some optimizations +-- ======================================================= + +local codegen + + +-- Check whether a charset is empty (IFail), singleton (IChar), +-- full (IAny), or none of those (ISet). + +local function charsettype(cs) + local count = 0; + local candidate = -1; -- candidate position for a char + for i = 0, 8 - 1 do + local b = cs[i]; + if b == 0 then + if count > 1 then + return ISet; -- else set is still empty + end + elseif b == -1 then + if count < (i * 32) then + return ISet; + else + count = count + 32; -- set is still full + end + -- byte has only one bit? + elseif band(b, (b - 1)) == 0 then + if count > 0 then + return ISet; -- set is neither full nor empty + -- set has only one char till now; track it + else + count = count + 1; + candidate = i; + end + else + return ISet; -- byte is neither empty, full, nor singleton + end + end + if count == 0 then + return IFail, 0 -- empty set + -- singleton; find character bit inside byte + elseif count == 1 then + local b = cs[candidate]; + local c = candidate * 32; + for i = 1, 32 do + if b == 1 then + c = c + i - 1 + break + end + b = rshift(b, 1) + end + return IChar, c + elseif count == 256 then + return IAny, 0 -- full set + else + assert(false) -- should have returned by now + end +end + + +-- A few basic operations on Charsets + +local function cs_complement(cs) + for i = 0, 8 - 1 do + cs[i] = bnot(cs[i]) + end +end + + +local function cs_equal(cs1, cs2) + for i = 0, 8 - 1 do + if cs1[i] ~= cs2[i] then + return + end + end + return true +end + + +-- computes whether sets st1 and st2 are disjoint + +local function cs_disjoint(st1, st2) + for i = 0, 8 - 1 do + if band(st1[i], st2[i]) ~= 0 then + return + end + end + return true +end + + +-- Convert a 'char' pattern (TSet, TChar, TAny) to a charset + +local function tocharset(tree, index, valuetable) + local val = settype() + if tree.p[index].tag == TSet then + ffi.copy(val, valuetable[tree.p[index].val], ffi.sizeof(val)) + return val + elseif tree.p[index].tag == TChar then + local b = tree.p[index].val + -- only one char + -- add that one + val[rshift(b, 5)] = lshift(1, band(b, 31)) + return val + elseif tree.p[index].tag == TAny then + ffi.fill(val, ffi.sizeof(val), 0xff) + return val + end +end + + +-- checks whether a pattern has captures + +local function hascaptures(tree, index) + if tree.p[index].tag == TCapture or tree.p[index].tag == TRunTime then + return true + elseif tree.p[index].tag == TCall then + return hascaptures(tree, index + tree.p[index].ps) + else + local ns = numsiblings[tree.p[index].tag + 1] + if ns == 0 then + return + elseif ns == 1 then + return hascaptures(tree, index + 1) + elseif ns == 2 then + if hascaptures(tree, index + 1) then + return true + elseif tree.p[index].tag ~= TRule then + return hascaptures(tree, index + tree.p[index].ps) + end + else + assert(false) + end + end +end + + +-- Checks how a pattern behaves regarding the empty string, +-- in one of two different ways: +-- A pattern is *nullable* if it can match without consuming any character; +-- A pattern is *nofail* if it never fails for any string +-- (including the empty string). +-- The difference is only for predicates; for patterns without +-- predicates, the two properties are equivalent. +-- (With predicates, &'a' is nullable but not nofail. Of course, +-- nofail => nullable.) +-- These functions are all convervative in the following way: +-- p is nullable => nullable(p) +-- nofail(p) => p cannot fail +-- (The function assumes that TOpenCall and TRunTime are not nullable: +-- TOpenCall must be checked again when the grammar is fixed; +-- TRunTime is an arbitrary choice.) + +local function checkaux(tree, pred, index, lrcall) + lrcall = lrcall or {} + local tag = tree.p[index].tag + if tag == TChar or tag == TSet or tag == TAny or + tag == TFalse or tag == TOpenCall then + return -- not nullable + elseif tag == TRep or tag == TTrue then + return true -- no fail + elseif tag == TNot or tag == TBehind then + -- can match empty, but may fail + if pred == PEnofail then + return + else + return true -- PEnullable + end + elseif tag == TAnd then + -- can match empty; fail iff body does + if pred == PEnullable then + return true + else + return checkaux(tree, pred, index + 1, lrcall) + end + -- can fail; match empty iff body does + elseif tag == TRunTime then + if pred == PEnofail then + return + else + return checkaux(tree, pred, index + 1, lrcall) + end + elseif tag == TSeq then + if not checkaux(tree, pred, index + 1, lrcall) then + return + else + return checkaux(tree, pred, index + tree.p[index].ps, lrcall) + end + elseif tag == TChoice then + if checkaux(tree, pred, index + tree.p[index].ps, lrcall) then + return true + else + return checkaux(tree, pred, index + 1, lrcall) + end + elseif tag == TCapture or tag == TGrammar or tag == TRule then + return checkaux(tree, pred, index + 1, lrcall) + elseif tag == TCall then + --left recursive rule + if bit.band(tree.p[index].cap, 0xffff) ~= 0 then + local lr = index + tree.p[index].ps + if lrcall[lr] then + return + end + lrcall[lr] = true + end + return checkaux(tree, pred, index + tree.p[index].ps, lrcall) + else + assert(false) + end +end + + +-- number of characters to match a pattern (or -1 if variable) +-- ('count' avoids infinite loops for grammars) + +local function fixedlenx(tree, count, len, index) + local tag = tree.p[index].tag + if tag == TChar or tag == TSet or tag == TAny then + return len + 1; + elseif tag == TFalse or tag == TTrue or tag == TNot or tag == TAnd or tag == TBehind then + return len; + elseif tag == TRep or tag == TRunTime or tag == TOpenCall then + return -1; + elseif tag == TCapture or tag == TRule or tag == TGrammar then + return fixedlenx(tree, count, len, index + 1) + elseif tag == TCall then + if count >= MAXRULES then + return -1; -- may be a loop + else + return fixedlenx(tree, count + 1, len, index + tree.p[index].ps) + end + elseif tag == TSeq then + len = fixedlenx(tree, count, len, index + 1) + if (len < 0) then + return -1; + else + return fixedlenx(tree, count, len, index + tree.p[index].ps) + end + elseif tag == TChoice then + local n1 = fixedlenx(tree, count, len, index + 1) + if n1 < 0 then return -1 end + local n2 = fixedlenx(tree, count, len, index + tree.p[index].ps) + if n1 == n2 then + return n1 + else + return -1 + end + else + assert(false) + end +end + + +-- Computes the 'first set' of a pattern. +-- The result is a conservative aproximation: +-- match p ax -> x' for some x ==> a in first(p). +-- match p '' -> '' ==> returns 1. +-- The set 'follow' is the first set of what follows the +-- pattern (full set if nothing follows it) + +local function getfirst(tree, follow, index, valuetable, lrcall) + lrcall = lrcall or {} + local tag = tree.p[index].tag + if tag == TChar or tag == TSet or tag == TAny then + local firstset = tocharset(tree, index, valuetable) + return 0, firstset + elseif tag == TTrue then + local firstset = settype() + ffi.copy(firstset, follow, ffi.sizeof(firstset)) + return 1, firstset + elseif tag == TFalse then + local firstset = settype() + return 0, firstset + elseif tag == TChoice then + local e1, firstset = getfirst(tree, follow, index + 1, valuetable, lrcall) + local e2, csaux = getfirst(tree, follow, index + tree.p[index].ps, valuetable, lrcall) + for i = 0, 8 - 1 do + firstset[i] = bor(firstset[i], csaux[i]) + end + return bor(e1, e2), firstset + elseif tag == TSeq then + if not checkaux(tree, PEnullable, index + 1) then + return getfirst(tree, fullset, index + 1, valuetable, lrcall) + -- FIRST(p1 p2, fl) = FIRST(p1, FIRST(p2, fl)) + else + local e2, csaux = getfirst(tree, follow, index + tree.p[index].ps, valuetable, lrcall) + local e1, firstset = getfirst(tree, csaux, index + 1, valuetable, lrcall) + if e1 == 0 then -- 'e1' ensures that first can be used + return 0, firstset + -- one of the children has a matchtime? + elseif band(bor(e1, e2), 2) == 2 then + return 2, firstset -- pattern has a matchtime capture + else + return e2, firstset -- else depends on 'e2' + end + end + elseif tag == TRep then + local _, firstset = getfirst(tree, follow, index + 1, valuetable, lrcall) + for i = 0, 8 - 1 do + firstset[i] = bor(firstset[i], follow[i]) + end + return 1, firstset -- accept the empty string + elseif tag == TCapture or tag == TGrammar or tag == TRule then + return getfirst(tree, follow, index + 1, valuetable, lrcall) + -- function invalidates any follow info. + elseif tag == TRunTime then + local e, firstset = getfirst(tree, fullset, index + 1, valuetable, lrcall) + if e ~= 0 then + return 2, firstset -- function is not "protected"? + else + return 0, firstset -- pattern inside capture ensures first can be used + end + elseif tag == TCall then + -- left recursive rule + if bit.band(tree.p[index].cap, 0xffff) ~= 0 then + local lr = index + tree.p[index].ps + if lrcall[lr] then + return 0, settype() + else + lrcall[lr] = true + end + end + return getfirst(tree, follow, index + tree.p[index].ps, valuetable, lrcall) + elseif tag == TAnd then + local e, firstset = getfirst(tree, follow, index + 1, valuetable, lrcall) + for i = 0, 8 - 1 do + firstset[i] = band(firstset[i], follow[i]) + end + return e, firstset + elseif tag == TNot then + local firstset = tocharset(tree, index + 1, valuetable) + if firstset then + cs_complement(firstset) + return 1, firstset + end + local e, firstset = getfirst(tree, follow, index + 1, valuetable, lrcall) + ffi.copy(firstset, follow, ffi.sizeof(firstset)) + return bor(e, 1), firstset -- always can accept the empty string + -- instruction gives no new information + elseif tag == TBehind then + -- call 'getfirst' to check for math-time captures + local e, firstset = getfirst(tree, follow, index + 1, valuetable, lrcall) + ffi.copy(firstset, follow, ffi.sizeof(firstset)) + return bor(e, 1), firstset -- always can accept the empty string + else + assert(false) + end +end + + +-- If it returns true, then pattern can fail only depending on the next +-- character of the subject + +local function headfail(tree, index, lrcall) + lrcall = lrcall or {} + local tag = tree.p[index].tag + if tag == TChar or tag == TSet or tag == TAny or tag == TFalse then + return true + elseif tag == TTrue or tag == TRep or tag == TRunTime or tag == TNot or tag == TBehind then + return + elseif tag == TCapture or tag == TGrammar or tag == TRule or tag == TAnd then + return headfail(tree, index + 1, lrcall) + elseif tag == TCall then + -- left recursive rule + if bit.band(tree.p[index].cap, 0xffff) ~= 0 then + local lr = index + tree.p[index].ps + if lrcall[lr] then + return true + else + lrcall[lr] = true + end + end + return headfail(tree, index + tree.p[index].ps, lrcall) + elseif tag == TSeq then + if not checkaux(tree, PEnofail, index + tree.p[index].ps) then + return + else + return headfail(tree, index + 1, lrcall) + end + elseif tag == TChoice then + if not headfail(tree, index + 1, lrcall) then + return + else + return headfail(tree, index + tree.p[index].ps, lrcall) + end + else + assert(false) + end +end + + +-- Check whether the code generation for the given tree can benefit +-- from a follow set (to avoid computing the follow set when it is +-- not needed) + +local function needfollow(tree, index) + local tag = tree.p[index].tag + if tag == TChar or tag == TSet or tag == TAny or tag == TFalse or tag == TTrue or tag == TAnd or tag == TNot or + tag == TRunTime or tag == TGrammar or tag == TCall or tag == TBehind then + return + elseif tag == TChoice or tag == TRep then + return true + elseif tag == TCapture then + return needfollow(tree, index + 1) + elseif tag == TSeq then + return needfollow(tree, index + tree.p[index].ps) + else + assert(false) + end +end + +-- ====================================================== + + +-- {====================================================== +-- Code generation +-- ======================================================= + + +-- code generation is recursive; 'opt' indicates that the code is +-- being generated under a 'IChoice' operator jumping to its end. +-- 'tt' points to a previous test protecting this code. 'fl' is +-- the follow set of the pattern. + + +local function addinstruction(code, op, val) + local size = code.size + if size >= code.allocsize then + code:doublesize() + end + code.p[size].code = op + code.p[size].val = val + code.size = size + 1 + return size +end + + +local function setoffset(code, instruction, offset) + code.p[instruction].offset = offset; +end + + +-- Add a capture instruction: +-- 'op' is the capture instruction; 'cap' the capture kind; +-- 'key' the key into ktable; 'aux' is optional offset + +local function addinstcap(code, op, cap, key, aux) + local i = addinstruction(code, op, bor(cap, lshift(aux, 4))) + setoffset(code, i, key) + return i +end + + +local function jumptothere(code, instruction, target) + if instruction >= 0 then + setoffset(code, instruction, target - instruction) + end +end + + +local function jumptohere(code, instruction) + jumptothere(code, instruction, code.size) +end + + +-- Code an IChar instruction, or IAny if there is an equivalent +-- test dominating it + +local function codechar(code, c, tt) + assert(tt ~= -1) + if tt >= 0 and code.p[tt].code == ITestChar and + code.p[tt].val == c then + addinstruction(code, IAny, 0) + else + addinstruction(code, IChar, c) + end +end + + +-- Code an ISet instruction + +local function coderealcharset(code, cs, valuetable) + local ind = #valuetable + 1 + valuetable[ind] = cs + return addinstruction(code, ISet, ind) +end + + +-- code a char set, optimizing unit sets for IChar, "complete" +-- sets for IAny, and empty sets for IFail; also use an IAny +-- when instruction is dominated by an equivalent test. + +local function codecharset(code, cs, tt, valuetable) + local op, c = charsettype(cs) + if op == IChar then + codechar(code, c, tt) + elseif op == ISet then + assert(tt ~= -1) + if tt >= 0 and code.p[tt].code == ITestSet and + cs_equal(cs, valuetable[code.p[tt].val]) then + addinstruction(code, IAny, 0) + else + coderealcharset(code, cs, valuetable) + end + else + addinstruction(code, op, c) + end +end + + +-- code a test set, optimizing unit sets for ITestChar, "complete" +-- sets for ITestAny, and empty sets for IJmp (always fails). +-- 'e' is true iff test should accept the empty string. (Test +-- instructions in the current VM never accept the empty string.) + +local function codetestset(code, cs, e, valuetable) + if e ~= 0 then + return NOINST -- no test + else + local pos = code.size + codecharset(code, cs, NOINST, valuetable) + local inst = code.p[pos] + local code = inst.code + if code == IFail then + inst.code = IJmp -- always jump + elseif code == IAny then + inst.code = ITestAny + elseif code == IChar then + inst.code = ITestChar + elseif code == ISet then + inst.code = ITestSet + else + assert(false) + end + return pos + end +end + + +-- Find the final destination of a sequence of jumps + +local function finaltarget(code, i) + while code.p[i].code == IJmp do + i = i + code.p[i].offset + end + return i +end + + +-- final label (after traversing any jumps) + +local function finallabel(code, i) + return finaltarget(code, i + code.p[i].offset) +end + +-- <behind(p)> == behind n; <p> (where n = fixedlen(p)) + +local function codebehind(code, tree, index, valuetable) + if tree.p[index].val > 0 then + addinstruction(code, IBehind, tree.p[index].val) + end + codegen(code, tree, fullset, false, NOINST, index + 1, valuetable) -- NOINST +end + + +-- Choice; optimizations: +-- - when p1 is headfail +-- - when first(p1) and first(p2) are disjoint; than +-- a character not in first(p1) cannot go to p1, and a character +-- in first(p1) cannot go to p2 (at it is not in first(p2)). +-- (The optimization is not valid if p1 accepts the empty string, +-- as then there is no character at all...) +-- - when p2 is empty and opt is true; a IPartialCommit can resuse +-- the Choice already active in the stack. + +local function codechoice(code, tree, fl, opt, p1, p2, valuetable) + local emptyp2 = tree.p[p2].tag == TTrue + local e1, st1 = getfirst(tree, fullset, p1, valuetable) + local _, st2 = getfirst(tree, fl, p2, valuetable) + if headfail(tree, p1) or (e1 == 0 and cs_disjoint(st1, st2)) then + -- <p1 / p2> == test (fail(p1)) -> L1 ; p1 ; jmp L2; L1: p2; L2: + local test = codetestset(code, st1, 0, valuetable) + local jmp = NOINST; + codegen(code, tree, fl, false, test, p1, valuetable) + if not emptyp2 then + jmp = addinstruction(code, IJmp, 0) + end + jumptohere(code, test) + codegen(code, tree, fl, opt, NOINST, p2, valuetable) + jumptohere(code, jmp) + elseif opt and emptyp2 then + -- p1? == IPartialCommit; p1 + jumptohere(code, addinstruction(code, IPartialCommit, 0)) + codegen(code, tree, fullset, true, NOINST, p1, valuetable) + else + -- <p1 / p2> == + -- test(fail(p1)) -> L1; choice L1; <p1>; commit L2; L1: <p2>; L2: + local test = codetestset(code, st1, e1, valuetable) + local pchoice = addinstruction(code, IChoice, 0) + codegen(code, tree, fullset, emptyp2, test, p1, valuetable) + local pcommit = addinstruction(code, ICommit, 0) + jumptohere(code, pchoice) + jumptohere(code, test) + codegen(code, tree, fl, opt, NOINST, p2, valuetable) + jumptohere(code, pcommit) + end +end + + +-- And predicate +-- optimization: fixedlen(p) = n ==> <&p> == <p>; behind n +-- (valid only when 'p' has no captures) + +local function codeand(code, tree, tt, index, valuetable) + local n = fixedlenx(tree, 0, 0, index) + if n >= 0 and n <= MAXBEHINDPREDICATE and not hascaptures(tree, index) then + codegen(code, tree, fullset, false, tt, index, valuetable) + if n > 0 then + addinstruction(code, IBehind, n) + end + else + -- default: Choice L1; p1; BackCommit L2; L1: Fail; L2: + local pchoice = addinstruction(code, IChoice, 0) + codegen(code, tree, fullset, false, tt, index, valuetable) + local pcommit = addinstruction(code, IBackCommit, 0) + jumptohere(code, pchoice) + addinstruction(code, IFail, 0) + jumptohere(code, pcommit) + end +end + + +-- Captures: if pattern has fixed (and not too big) length, use +-- a single IFullCapture instruction after the match; otherwise, +-- enclose the pattern with OpenCapture - CloseCapture. + +local function codecapture(code, tree, fl, tt, index, valuetable) + local len = fixedlenx(tree, 0, 0, index + 1) + if len >= 0 and len <= MAXOFF and not hascaptures(tree, index + 1) then + codegen(code, tree, fl, false, tt, index + 1, valuetable) + addinstcap(code, IFullCapture, tree.p[index].cap, tree.p[index].val, len) + else + addinstcap(code, IOpenCapture, tree.p[index].cap, tree.p[index].val, 0) + codegen(code, tree, fl, false, tt, index + 1, valuetable) + addinstcap(code, ICloseCapture, Cclose, 0, 0) + end +end + + +local function coderuntime(code, tree, tt, index, valuetable) + addinstcap(code, IOpenCapture, Cgroup, tree.p[index].val, 0) + codegen(code, tree, fullset, false, tt, index + 1, valuetable) + addinstcap(code, ICloseRunTime, Cclose, 0, 0) +end + + +-- Repetion; optimizations: +-- When pattern is a charset, can use special instruction ISpan. +-- When pattern is head fail, or if it starts with characters that +-- are disjoint from what follows the repetions, a simple test +-- is enough (a fail inside the repetition would backtrack to fail +-- again in the following pattern, so there is no need for a choice). +-- When 'opt' is true, the repetion can reuse the Choice already +-- active in the stack. + +local function coderep(code, tree, opt, fl, index, valuetable) + local st = tocharset(tree, index, valuetable) + if st then + local op = coderealcharset(code, st, valuetable) + code.p[op].code = ISpan; + else + local e1, st = getfirst(tree, fullset, index, valuetable) + if headfail(tree, index) or (e1 == 0 and cs_disjoint(st, fl)) then + -- L1: test (fail(p1)) -> L2; <p>; jmp L1; L2: + local test = codetestset(code, st, 0, valuetable) + codegen(code, tree, fullset, false, test, index, valuetable) + local jmp = addinstruction(code, IJmp, 0) + jumptohere(code, test) + jumptothere(code, jmp, test) + else + -- test(fail(p1)) -> L2; choice L2; L1: <p>; partialcommit L1; L2: + -- or (if 'opt'): partialcommit L1; L1: <p>; partialcommit L1; + local test = codetestset(code, st, e1, valuetable) + local pchoice = NOINST; + if opt then + jumptohere(code, addinstruction(code, IPartialCommit, 0)) + else + pchoice = addinstruction(code, IChoice, 0) + end + local l2 = code.size + codegen(code, tree, fullset, false, NOINST, index, valuetable) + local commit = addinstruction(code, IPartialCommit, 0) + jumptothere(code, commit, l2) + jumptohere(code, pchoice) + jumptohere(code, test) + end + end +end + + +-- Not predicate; optimizations: +-- In any case, if first test fails, 'not' succeeds, so it can jump to +-- the end. If pattern is headfail, that is all (it cannot fail +-- in other parts); this case includes 'not' of simple sets. Otherwise, +-- use the default code (a choice plus a failtwice). + +local function codenot(code, tree, index, valuetable) + local e, st = getfirst(tree, fullset, index, valuetable) + local test = codetestset(code, st, e, valuetable) + -- test (fail(p1)) -> L1; fail; L1: + if headfail(tree, index) then + addinstruction(code, IFail, 0) + else + -- test(fail(p))-> L1; choice L1; <p>; failtwice; L1: + local pchoice = addinstruction(code, IChoice, 0) + codegen(code, tree, fullset, false, NOINST, index, valuetable) + addinstruction(code, IFailTwice, 0) + jumptohere(code, pchoice) + end + jumptohere(code, test) +end + + +-- change open calls to calls, using list 'positions' to find +-- correct offsets; also optimize tail calls + +local function correctcalls(code, positions, from, to) + for i = from, to - 1 do + if code.p[i].code == IOpenCall then + local n = code.p[i].offset; -- rule number + local rule = positions[n]; -- rule position + assert(rule == from or code.p[rule - 1].code == IRet) + -- call; ret ? + if bit.band(code.p[i].val, 0xffff) == 0 and code.p[finaltarget(code, i + 1)].code == IRet then + code.p[i].code = IJmp; -- tail call + else + code.p[i].code = ICall; + end + jumptothere(code, i, rule) -- call jumps to respective rule + end + end +end + + +-- Code for a grammar: +-- call L1; jmp L2; L1: rule 1; ret; rule 2; ret; ...; L2: + +local function codegrammar(code, tree, index, valuetable) + local positions = {} + local rulenumber = 1; + -- tree.p[rule].tag + local rule = index + 1 + assert(tree.p[rule].tag == TRule) + local LR = 0 + if band(RuleLR, tree.p[rule].cap) ~= 0 then LR = 1 end + local firstcall = addinstruction(code, ICall, LR) -- call initial rule + code.p[firstcall].aux = tree.p[rule].val + local jumptoend = addinstruction(code, IJmp, 0) -- jump to the end + jumptohere(code, firstcall) -- here starts the initial rule + while tree.p[rule].tag == TRule do + positions[rulenumber] = code.size -- save rule position + rulenumber = rulenumber + 1 + codegen(code, tree, fullset, false, NOINST, rule + 1, valuetable) -- code rule + addinstruction(code, IRet, 0) + rule = rule + tree.p[rule].ps + end + assert(tree.p[rule].tag == TTrue) + jumptohere(code, jumptoend) + correctcalls(code, positions, firstcall + 2, code.size) +end + + +local function codecall(code, tree, index, val) + local c = addinstruction(code, IOpenCall, tree.p[index].cap) -- to be corrected later + code.p[c].aux = val + assert(tree.p[index + tree.p[index].ps].tag == TRule) + setoffset(code, c, band(tree.p[index + tree.p[index].ps].cap, 0x7fff)) -- offset = rule number +end + + +local function codeseq(code, tree, fl, opt, tt, p1, p2, valuetable) + if needfollow(tree, p1) then + local _, fll = getfirst(tree, fl, p2, valuetable) -- p1 follow is p2 first + codegen(code, tree, fll, false, tt, p1, valuetable) + else + -- use 'fullset' as follow + codegen(code, tree, fullset, false, tt, p1, valuetable) + end + -- can p1 consume anything? + if (fixedlenx(tree, 0, 0, p1) ~= 0) then + tt = NOINST; -- invalidate test + end + return codegen(code, tree, fl, opt, tt, p2, valuetable) +end + + +-- Main code-generation function: dispatch to auxiliar functions +-- according to kind of tree + +-- code generation is recursive; 'opt' indicates that the code is being +-- generated as the last thing inside an optional pattern (so, if that +-- code is optional too, it can reuse the 'IChoice' already in place for +-- the outer pattern). 'tt' points to a previous test protecting this +-- code (or NOINST). 'fl' is the follow set of the pattern. + +function codegen(code, tree, fl, opt, tt, index, valuetable) + local tag = tree.p[index].tag + if tag == TChar then + return codechar(code, tree.p[index].val, tt) + elseif tag == TAny then + return addinstruction(code, IAny, 0) + elseif tag == TSet then + return codecharset(code, valuetable[tree.p[index].val], tt, valuetable) + elseif tag == TTrue then + elseif tag == TFalse then + return addinstruction(code, IFail, 0) + elseif tag == TSeq then + return codeseq(code, tree, fl, opt, tt, index + 1, index + tree.p[index].ps, valuetable) + elseif tag == TChoice then + return codechoice(code, tree, fl, opt, index + 1, index + tree.p[index].ps, valuetable) + elseif tag == TRep then + return coderep(code, tree, opt, fl, index + 1, valuetable) + elseif tag == TBehind then + return codebehind(code, tree, index, valuetable) + elseif tag == TNot then + return codenot(code, tree, index + 1, valuetable) + elseif tag == TAnd then + return codeand(code, tree, tt, index + 1, valuetable) + elseif tag == TCapture then + return codecapture(code, tree, fl, tt, index, valuetable) + elseif tag == TRunTime then + return coderuntime(code, tree, tt, index, valuetable) + elseif tag == TGrammar then + return codegrammar(code, tree, index, valuetable) + elseif tag == TCall then + return codecall(code, tree, index, tree.p[index].val) + else + assert(false) + end +end + + +-- Optimize jumps and other jump-like instructions. +-- * Update labels of instructions with labels to their final +-- destinations (e.g., choice L1; ... L1: jmp L2: becomes +-- choice L2) +-- * Jumps to other instructions that do jumps become those +-- instructions (e.g., jump to return becomes a return; jump +-- to commit becomes a commit) + +local function peephole(code) + local i = 0 + while i < code.size do + local tag = code.p[i].code + if tag == IChoice or tag == ICall or tag == ICommit or tag == IPartialCommit or + tag == IBackCommit or tag == ITestChar or tag == ITestSet or tag == ITestAny then + -- instructions with labels + jumptothere(code, i, finallabel(code, i)) -- optimize label + + elseif tag == IJmp then + local ft = finaltarget(code, i) + local tag = code.p[ft].code -- jumping to what? + -- instructions with unconditional implicit jumps + if tag == IRet or tag == IFail or tag == IFailTwice or tag == IEnd then + ffi.copy(code.p + i, code.p + ft, ffi.sizeof(patternelement)) -- jump becomes that instruction + elseif tag == ICommit or tag == IPartialCommit or tag == IBackCommit then + -- inst. with unconditional explicit jumps + local fft = finallabel(code, ft) + ffi.copy(code.p + i, code.p + ft, ffi.sizeof(patternelement)) -- jump becomes that instruction... + jumptothere(code, i, fft) -- but must correct its offset + i = i - 1 -- reoptimize its label + else + jumptothere(code, i, ft) -- optimize label + end + end + i = i + 1 + end +end + + +-- Compile a pattern + +local function compile(tree, index, valuetable) + local code = pattern() + codegen(code, tree, fullset, false, NOINST, index, valuetable) + addinstruction(code, IEnd, 0) + peephole(code) + ffi.C.free(tree.code) + tree.code = code +end + +local function pat_new(ct, size) + size = size or 0 + local allocsize = size + if allocsize < 10 then + allocsize = 10 + end + local pat = ffi.cast('PATTERN*', ffi.C.malloc(ffi.sizeof(pattern))) + assert(pat ~= nil) + pat.allocsize = allocsize + pat.size = size + pat.p = ffi.C.malloc(ffi.sizeof(patternelement) * allocsize) + assert(pat.p ~= nil) + ffi.fill(pat.p, ffi.sizeof(patternelement) * allocsize) + return pat +end + +local function doublesize(ct) + ct.p = ffi.C.realloc(ct.p, ffi.sizeof(patternelement) * ct.allocsize * 2) + assert(ct.p ~= nil) + ffi.fill(ct.p + ct.allocsize, ffi.sizeof(patternelement) * ct.allocsize) + ct.allocsize = ct.allocsize * 2 +end + +local pattreg = { + doublesize = doublesize, +} + +local metareg = { + ["__new"] = pat_new, + ["__index"] = pattreg +} + +ffi.metatype(pattern, metareg) + +return { + checkaux = checkaux, + tocharset = tocharset, + fixedlenx = fixedlenx, + hascaptures = hascaptures, + compile = compile, +} |
