;;;; Recursive Descent Parser Generator ;;;; Copyright 2008 Peter Goodman, all rights reserved (defmacro as-list (lst) "Return a list; if the argument isn't a list then box it in a list." `(if (listp ,lst) ,lst (list ,lst))) (defmacro push-to-end (obj lst) "Push obj onto the end of lst." `(setf ,lst (append ,lst (as-list ,obj)))) (defun map-dfs (tree &key (list-fn #'identity) (elm-fn #'identity)) "Go as deep into the tree and apply the function to any lists, then gradually bubble up to the top by applying the function to lists." (declare (type list tree) (type function list-fn elm-fn)) (if (null tree) (return-from map-dfs nil)) (funcall list-fn (mapcar #'(lambda (elm) (if (listp elm) (map-dfs elm :list-fn list-fn :elm-fn elm-fn) (funcall elm-fn elm))) tree))) (defun default-for-type (type) "Return a 'default' value for a given type." (case type (string "") (character #\ ) ; hrmm ((number fixnum) 0) (list nil))) (defun concatenate* (elm-type lst) "Concatenate all cars within a tree." (declare (type list lst)) (let ((default-val (default-for-type elm-type))) (map-dfs lst :elm-fn #'(lambda (elm) (if (typep elm elm-type) elm default-val)) :list-fn #'(lambda (ls) (apply #'concatenate elm-type ls))))) (defmacro letg (symbol-list &rest body) "Takes a list of variable names and assigns them gensyms." `(let (,@(loop for sym in symbol-list collect (list sym '(gensym)))) ,@body)) (defparameter *parser-ops* (make-hash-table :size 14) "A hashtable of program defined parser operators.") (defparameter *parser-fncs* (make-hash-table) "A hashtable of user-defined parser functions.") (defparameter *intermediate-parser-fncs* (make-hash-table :size 20) "Intermediate parser functions.") ;;; Parser State struct and abbreviation macros (defstruct parser-state "The current state that the parser is in." (buffer "" :type string) (index 0 :type fixnum) (matches nil :type list)) (defmacro $state-buffer () `(parser-state-buffer $state)) (defmacro $state-index () `(parser-state-index $state)) (defmacro $state-matches () `(parser-state-matches $state)) ;;; Ways to pass parser state out of blocks (defmacro return-match (block-id &key (sub-matches nil) (matched t) (consumed t)) "Abbreviation of 'return-from' and 'values'." `(return-from ,block-id (values ,sub-matches ,matched , consumed))) (defmacro return-failed-match (block-id) "Abbreviation for a failed match." `(return-from ,block-id (values (list nil) nil nil))) (defmacro parser-error (str) (if (not str) (setf str "")) `(error (format nil "Expression Error: ~A" ,str))) ;;;; Parser functions (defun parser-make-string-matcher (str) "Make the part of a function to match the front of the text in the buffer against a string." (declare (type string str)) (let* ((str-len (length str)) (bool-consumed (> str-len 0))) (letg (block-id start-index end-index) `(block ,block-id (let* ((,start-index ($state-index)) (,end-index (+ ,start-index ,str-len))) (if (<= (+ ,str-len ,start-index) $len) ; bounds checking (if (string= ,str ($state-buffer) :start1 0 :start2 ,start-index :end1 ,str-len :end2 ,end-index) (progn (incf ($state-index) ,str-len) (return-match ,block-id :sub-matches (subseq ($state-buffer) ,start-index ,end-index) :consumed ,bool-consumed))))) (return-failed-match ,block-id))))) (defun parser-make-find-next (ops) "Find everything up to the next instance of a string, or repeatedly test the parser state buffer against a certain sub-parser function." (declare (type list ops)) (if (or (= 0 (length ops)) (> (length ops) 1)) (parser-error "find-next expects one and only one operand.")) (let ((op (car ops))) (if (null op) (parser-error "find-next expects one and only one parameter of type expression or string.")) (cond ;; continually test a function, without returning the results of that function ((symbolp op) (letg (block-id test-fn curr-index start-index) `(block ,block-id (let ((,test-fn #'(lambda nil ,op)) ; encapsulate the block of code into a test function (,start-index ($state-index))) (do ((,curr-index ,start-index (+ ,curr-index 1))) ((> ,curr-index $len) (return-failed-match ,block-id)) (multiple-value-bind (sub-matches matched? consumed?) (funcall ,test-fn) (if (and matched? consumed?) ;; we've matched something *and* consumed part of the buffer. note: ;; operators such as 'maybe' will match but won't consume. ;; ;; at this point any changes to the state index done by the sub-parser ;; function need to be undone so that we can return the proper matched ;; text (progn (setf ($state-index) ,curr-index) (return-match ,block-id :sub-matches (subseq ($state-buffer) ,start-index (+ ,curr-index 1)) :consumed (/= ,start-index ,curr-index)))))))))) ;; find the first occurence of a string ((or (stringp op) (characterp op)) (setf op (string op)) (let* ((str-len (length op)) (bool-consumed (> str-len 0))) (letg (block-id pos start-index) `(block ,block-id (let ((,start-index ($state-index)) (,pos (search ,op ($state-buffer) :test #'string= :start2 ($state-index) :end1 ,str-len :end2 $len))) (if (numberp ,pos) ;; matched the string (progn (setf ($state-index) ,pos) (return-match ,block-id :sub-matches (subseq ($state-buffer) ,start-index ,pos) :consumed (and (> ,pos ,start-index) ,bool-consumed))) ;; no match (return-failed-match ,block-id))))))) (t (parser-error "Unrecognized argument passed to find-next."))))) (defun parser-make-lambda-list (ops) "Given a list of tokenized operators, parse out the remaining string literals as string matchers and then encapsulate each sub-parser in a lambda." (declare (type list ops)) (cons 'list (mapcar #'(lambda (op) `#'(lambda nil ,op)) (parse-for-remaining-fns ops)))) (defun parser-make-or (fns) "Return the match results of whichever parser sub-functions succeeds to match." (declare (type list fns)) (if (< (length fns) 2) (parser-error "or expects at least two operands.")) (letg (block-id fn-lst fn start-index) `(block ,block-id (let ((,fn-lst ,(parser-make-lambda-list fns)) (,start-index ($state-index))) (dolist (,fn ,fn-lst) (multiple-value-bind (sub-matches matched? consumed?) (funcall ,fn) (if matched? (return-match ,block-id ; return the first successful match :sub-matches sub-matches :consumed consumed?) (setf ($state-index) ,start-index)))) ; return to a valid parser state (return-failed-match ,block-id))))) (defun parser-make-and (fns) "Collect the matches of any sub-operations. If a single sub-operation fails then the entire operation necessarily fails and no changes to buffer index are kept." (declare (type list fns)) (if (< (length fns) 2) (parser-error "and expects at least two operands.")) (letg (fn-lst fn start-index bool-consumed matches block-id) `(block ,block-id (let ((,fn-lst ,(parser-make-lambda-list fns)) (,bool-consumed nil) (,matches nil) (,start-index ($state-index))) (dolist (,fn ,fn-lst) (multiple-value-bind (sub-matches matched? consumed?) (funcall ,fn) (if consumed? (setf ,bool-consumed t)) (if (not matched?) (progn (setf ($state-index) ,start-index) (return-failed-match ,block-id)) (push-to-end sub-matches ,matches)))) (return-match ,block-id :sub-matches ,matches :consumed ,bool-consumed))))) (defun parser-make-maybe (fns) "Make the maybe operator. The maybe operator is defined in terms of an OR with an empty string as the last function." (parser-make-or (push-to-end "" fns))) (defun parser-make-pass (&rest ops) "The pass operator, it is defined by searching for an empty string." (parser-make-string-matcher "")) (defun parser-make-group (ops) "Collect the matches of any sub-operations and then concatenate them into one single match string." (declare (type list ops)) (letg (fn-list block-id fn bool-consumed bool-matched match matches) `(block ,block-id (let* ((,bool-consumed nil) (,bool-matched nil)) (return-match ,block-id :sub-matches (mapcar #'(lambda (,fn) (multiple-value-bind (sub-matches matched? consumed?) (funcall ,fn) (if consumed? (setf ,bool-consumed t)) (if matched? (setf ,bool-matched t)) sub-matches)) ,(parser-make-lambda-list ops)) :matched ,bool-matched :consumed ,bool-consumed))))) (defun parser-make-concat (ops) "Collect the matches of any sub-operations and then concatenate them into one single match string." (declare (type list ops)) (letg (block-id) `(block ,block-id (multiple-value-bind (sub-matches matched? consumed?) ,(parser-make-group ops) (return-from ,block-id (values (concatenate* 'string sub-matches) matched? consumed?)))))) (defun parser-make-no-capture (ops) "Do everything but return matches." (declare (type list ops)) (letg (block-id) `(block ,block-id (multiple-value-bind (sub-matches matched? consumed?) ,(parser-make-group ops) (return-from ,block-id (values nil matched? consumed?)))))) (defun parser-make-if (ops) "Create a function to conditionally match sequences. Note: the matches found from the conditions of an if statement are not recorded." (declare (type list ops)) (if (or (< (length ops) 2) (> (length ops) 3)) (parser-error "if takes between 2 and 3 operands.")) (if (= (length ops) 2) (push-to-end "" ops)) (letg (block-id fns) `(block ,block-id (let ((,fns ,(parser-make-lambda-list ops))) (multiple-value-bind (sub-matches matched? consumed?) (funcall (first ,fns)) (if matched? (funcall (second ,fns)) (funcall (third ,fns)))))))) (defun make-parser-make-repeat (lst) (declare (type list lst)) (if (< (length lst) 3) (parser-error "repeat expects at least three parameters.")) (parser-make-repeat (cddr lst) :min (abs (coerce (first lst) 'fixnum)) :max (if (eql (second lst) 'nil) nil (abs (coerce (second lst) 'fixnum))))) (defun parser-make-repeat (ops &key min max) "Repeatedly test the buffer against a function composed of all of ops. The test function is defined in terms of the group result of each function in ops." (declare (type list ops)) (if (and min max (or (< max min) (> min max))) (setf max min)) (letg (block-id lower-bound upper-bound matches fn bool-consumed start-index) `(block ,block-id (let ((,fn #'(lambda nil ,(parser-make-group ops))) (,lower-bound 0) (,bool-consumed nil) (,start-index ($state-index)) (,matches nil)) (loop for ,upper-bound from 0 by 1 do (multiple-value-bind (sub-matches matched? consumed?) (funcall ,fn) (if (or (not (and matched? consumed?)) (and ,max (>= ,upper-bound ,max))) (return nil)) (if consumed? (progn (setf ,bool-consumed t) (setf ,start-index ($state-index)))) (push-to-end (car sub-matches) ,matches) ; car because it was grouped (incf ,lower-bound))) (setf ($state-index) ,start-index) ; put the index back into a valid state (if (or (not ,min) (>= ,lower-bound ,min)) (return-match ,block-id :sub-matches ,matches :consumed ,bool-consumed) (return-failed-match ,block-id)))))) (defun parser-make-range (ops) "Match the next character if it is within the range of the two characters." (declare (type list ops)) (if (/= (length ops) 2) (parser-error "range expects two parameters, each of which must be characters.")) (let ((geq (first ops)) (leq (second ops))) (if (not (and (characterp geq) (characterp leq))) (parser-error "range expects both of its parameters to be characters.")) (letg (block-id ch) `(block ,block-id (if (>= ($state-index) $len) (return-failed-match ,block-id)) (let ((,ch (char ($state-buffer) ($state-index)))) (if (and (char>= ,ch ,geq) (char<= ,ch ,leq)) (progn (incf ($state-index)) (return-match ,block-id :sub-matches (string ,ch))) (return-failed-match ,block-id))))))) (defun parser-make-any-char (ops) "Match one of any of the characters in the list of characters OR if no list of characters is provided then accept any character as a match." (declare (type list ops)) (if (= (length ops) 0) (letg (block-id) `(block ,block-id (if (>= ($state-index) $len) (return-failed-match ,block-id)) (incf ($state-index)) (return-match ,block-id :sub-matches (string (char ($state-buffer) (- ($state-index) 1)))))) (progn (if (not (stringp (first ops))) (parser-error "any-char expects either no parameters or first (and only) parameter to be string.")) (parser-make-or (loop for ch across (first ops) collect (string ch)))))) (defun parser-make-error (ops) "Allow people to define and throw parse errors from within the grammar. The proper use of an error is with with an or. For example: (or (find-next ...) (error \"could not find next\"))." (declare (type list ops)) `(progn (throw 'parse-error ,(first ops)))) (defun make-parser-replace-symbols nil "Replace various symbols deep in a function. This function allows us to use some often used variables by meaningful names and have then automatically replaced by gensyms instead of having to do the gensyms for every parser function. This also goes through and looks for symbols that refer to sub code blocks (i.e. sub parser functions) and then inserts that code in." (letg (sym-sm sym-cm? sym-c?) #'(lambda (elm) (case elm (matched? sym-cm?) (consumed? sym-c?) (sub-matches sym-sm) (otherwise (if (and (symbolp elm) (gethash elm *intermediate-parser-fncs*)) (let ((code (gethash elm *intermediate-parser-fncs*))) (remhash elm *intermediate-parser-fncs*) code) elm)))))) (defun tokenize-sub-parser (code) "Associate a gensym with a block of code for a sub-parser function so that sub-parser functions don't interfere with any sub-parser functions they may be within during expression parsing." (letg (fn-name) (setf (gethash fn-name *intermediate-parser-fncs*) code) fn-name)) (defun parser-replace-funcall (fn-name) "Given a symbol name, figure out if it looks like a call to another parsing function and then replace it with code to call that." ;; this isn't a call to another parser function (if (or (gethash fn-name *intermediate-parser-fncs*) (gethash fn-name *parser-ops*)) (return-from parser-replace-funcall fn-name)) (let ((fn-error (format nil "Expression Error: parser function doesn't exist: ~A" fn-name))) (letg (fn sub-state sub-index sub-matches) (tokenize-sub-parser `(let ((,fn (gethash ',fn-name *parser-fncs*))) (if ,fn (let ((,sub-state (make-parser-state :buffer (subseq ($state-buffer) ($state-index))))) (funcall ,fn ,sub-state (- $len ($state-index))) (let ((,sub-index (parser-state-index ,sub-state)) (,sub-matches (parser-state-matches ,sub-state))) (incf ($state-index) ,sub-index) (values (list (cons ',fn-name ,sub-matches)) ; sub-matches t ; matched? (/= 0 ,sub-index)))) ; consumed? (error ,fn-error))))))) (defun parse-for-remaining-fns (lst) "Go through and parse any strings that were assumed as parameters to operators as string matcher sub-parsers. Also, look for calls to other parser functions and link to them." (map-dfs lst :elm-fn #'(lambda (elm) (cond ((or (stringp elm) (characterp elm)) (tokenize-sub-parser (parser-make-string-matcher (string elm)))) ((and (symbolp elm)) (parser-replace-funcall elm)) (t elm))))) (defun make-sub-parser (&rest fn-lst) "Go through all ops recursively and replace instances of 'sub-matches', 'matched?', and 'consumed?' with the appropriate gensyms. No variables of sub-parser will be overwritten because the parser constructs the pattern function bottom-up." (declare (type list fn-lst)) (if (and (length fn-lst) (listp (car fn-lst))) (setf fn-lst (car fn-lst))) (tokenize-sub-parser (map-dfs fn-lst :elm-fn (make-parser-replace-symbols)))) (defun parse-operator (lst) "Handle a list of tokens either as an call to a pattern operator or a string literal. If the type of the first element of lst is neither a symbol nor a string then error." (declare (type list lst)) (let ((a (car lst)) (d (cdr lst))) (cond ((or (characterp a) (stringp a)) (tokenize-sub-parser (parser-make-string-matcher (string a)))) ((and (symbolp a) (gethash a *parser-ops*)) (make-sub-parser (funcall (gethash a *parser-ops*) (as-list d)))) (t (parser-error "Unknown type encountered."))))) (defun parse-operators (elm) "For every car in the parser function parse for operators. Note: this function is applied to every car by means of a mapcar in defun-parse." (cond ((listp elm) `(multiple-value-bind (sub-matches matched? consumed?) ,(map-dfs elm :list-fn #'(lambda (lst) (parse-operator (as-list lst)))) (if matched? (push-to-end sub-matches ($state-matches))))) ((symbolp elm) (parser-replace-funcall elm)) (t elm))) (defmacro defun-parser (name &rest toks) "Define a named parser function." (declare (type symbol name) (type list toks)) (let* ((block-id (gensym)) (parser-fnc-name (make-sub-parser (mapcar #'parse-operators toks))) ; get the parser function we just created (code (gethash parser-fnc-name *intermediate-parser-fncs*))) ; empty out the hash table for the next parser function to use (remhash parser-fnc-name *intermediate-parser-fncs*) ; the final built-up parser function `(setf (gethash ',name *parser-fncs*) #'(lambda ($state $len) ,(push 'progn code) $state)))) (defun parse (fn-name str) "Parse a string using the specified parser function." (declare (type symbol fn-name) (type string str)) (let ((parse-fn (gethash fn-name *parser-fncs*))) (if parse-fn (let ((state (make-parser-state :buffer str)) (len (length str))) (parser-state-matches (funcall parse-fn state len))) (error (format nil "The parser function '~A' does not exist." fn-name))))) (defmacro set-parser-op (&rest lst) "Register a parser operator." (if (not (eq (mod (length lst) 2) 0)) (parser-error "set-parser-op only accepts a multiple of 2 arguments.")) `(setf ,@(loop while (> (length lst) 0) collect `(gethash ',(first lst) *parser-ops*) collect (second lst) do (setf lst (subseq lst 2))))) (set-parser-op find-next #'parser-make-find-next repeat #'make-parser-make-repeat range #'parser-make-range any-char #'parser-make-any-char ;; error reporting error #'parser-make-error ;; match modifiers group #'parser-make-group concat #'parser-make-concat no-capture #'parser-make-no-capture ;; conditionals if #'parser-make-if or #'parser-make-or and #'parser-make-and maybe #'parser-make-maybe pass #'parser-make-pass) ;;;; Mini grammar for splitting and HTML document by the HTML tags (defun-parser :html-attr (no-capture (repeat 0 nil " ")) (and (concat (repeat 0 nil (range #\a #\z)) (and ":" (repeat 0 nil (concat (range #\a #\z))))) (no-capture #\= #\") (find-next #\") (no-capture #\"))) (defun-parser :html-attrs (repeat 0 nil :html-attr)) (defun-parser :html-tag-name (concat (maybe "!") (repeat 0 nil (range #\a #\z)) (and ":" (repeat 0 nil (range #\a #\z))))) (defun-parser :html-tag (and (no-capture "<") (maybe "/") :html-tag-name (no-capture (repeat 0 nil " ")) :html-attrs (no-capture (repeat 0 nil " ")) (maybe "/") (no-capture ">"))) (defun-parser :html (repeat 0 nil (and (find-next "<") ; everything before or between two tags :html-tag)) ; tag (concat (repeat 0 nil (any-char)))) ; everything after the last tag matched (print (parse ':html "prefix text
{$comment.content}
postfix text"))