;;;; Lisp Regular Expression Parser / Language ;;;; Copyright 2008 Peter Goodman, all rights reserved ;;;; http://ioreader.com (defmacro as-list (lst) "Return a list; if the argument isn't a list then box it in a list." `(if (listp ,lst) ,lst (list ,lst))) (defmacro push-to-end (obj lst) "Push obj onto the end of lst." `(setf ,lst (append ,lst (as-list ,obj)))) (defmacro ltrim-subseq (seq sub) "Trim 'sub' from the start of 'seq'." `(setf ,seq (subseq ,seq (length ,sub)))) (defmacro to-int (expr) "Coerce an expression to an integer." `(coerce ,expr 'integer)) (defun concatenate* (type &rest lst) "Concatenate all cars within a tree." (setf lst (remove nil lst)) (if (null lst) "" ; nothing to concatenate, return (let ((a (car lst)) (d (cdr lst))) (if (listp a) (setf a (concatenate* type (car a) (cdr a)))) (if (not (typep a type)) (error "Invalid type passed to concatenate*.")) (concatenate type a (concatenate* type d))))) (defparameter *reg-exp-fns* (make-hash-table) "A hashtable holding all registered regular expression pattern parsers.") (defparameter *reg-exps* (make-hash-table) "A hashtable holding regular expression all pattern functions.") (defun reg-exp-make-repeat (ops) "Parse out any quantifiers for min, max, and length from the operands and then create a repeat function. TODO: the way this parses out parameters is ugly." (let ((min 0) (max nil) (length nil)) ; remove all (min #), (max #), and (length #) from the ops list and store their values. (setf ops (reduce #'(lambda (lst elm) (if (not (listp elm)) (push-to-end elm lst) (let ((a (first elm))) (cond ((eql a 'min) (setf min (to-int (second elm)))) ((eql a 'max) (setf max (to-int (second elm)))) ((eql a 'length) (setf length (to-int (second elm)))) ((push-to-end elm lst))))) lst) ops :initial-value nil)) (reg-exp-repeat-test (parse-reg-exp ops) :min min :max max :length length :ops ops))) (defun reg-exp-repeat-test (test &key (min 0) (max nil) (length nil) (ops nil)) "Rpeatedly test the start of a string against a function." (if length (setf min length max length)) (if (and min max (< max min)) (setf max min)) #'(lambda (matches seq) ;; Repeatedly call the test function on 'seq' and record the results in ;; the 'matches' list. (let ((result nil) (sub-matches nil) (lower-bound 0)) (loop for upper-bound from 0 by 1 do (setf result (funcall test nil seq)) ; no match, break out of the loop if (or (not result) (not (third result)) (and max (>= upper-bound max)) ; check to see that we haven't gone above our max repetitions (= (length seq) (length (second result)))); stop infinite loops for (repeat (maybe ..)) or (repeat (if ..)) return nil ; we've matched something, add the result-matches into sub-matches ; and replace the seq with new-seq do (incf lower-bound) (push-to-end (first result) sub-matches) (setf seq (second result))) ; only return if min <= (the number of matches) <= max (if (or (not min) (>= lower-bound min)) (list (push-to-end sub-matches matches) seq t))))) (defun reg-exp-and (ops) "Serially execute some regular expressions. If return a failed match for all of them." (let ((fns (mapcar #'(lambda (elm) (parse-reg-exp elm)) ops))) #'(lambda (matches seq) (let ((result nil) (sub-matches nil) (num-matches 0)) ; loop until one of the functions fails to match something (loop for fn in fns do (setf result (funcall fn nil seq)) ; no match (note: failed maybe is considered a no match) if (or (not result) (not (third result))) ; **** return nil ; matched, update the seq do (setf seq (second result)) (push-to-end (first result) sub-matches) (incf num-matches)) ; and requires that we match as much as the number of pattern functions (if (= num-matches (length fns)) (list (push-to-end sub-matches matches) seq t)))))) (defun reg-exp-or (ops) "Serially execute some regular expressions until one matches." (let ((fns (mapcar #'(lambda (elm) (parse-reg-exp (as-list elm))) ops))) #'(lambda (matches seq) (let ((result nil) (matched nil)) ; loop until one of the functions matches something (loop for fn in fns do (setf result (funcall fn nil seq)) ; we've matched something if (and result (third result)) ; **** return (setf matched t)) (if matched (list (push-to-end (first result) matches) (second result) t)))))) (defun reg-exp-maybe (ops) "Attempt to match something, but if the match isn't successful then it isn't a problem. Returns a closure to force it to always appear to match." (let ((parser-fn (reg-exp-or (list (as-list ops) (list 'pass (list 'maybe ops)))))) #'(lambda (matches seq) (let ((result (funcall parser-fn nil seq))) (list (push-to-end (first result) matches) (second result) t))))) (defun reg-exp-find-next (ops) "Find everything up to the next instance of 'str'." (if (stringp (first ops)) ; find everything up to but not including the first occurrence of str ; in the sequence #'(lambda (matches seq) (let* ((str (first ops)) (pos (search str seq :test #'string=))) (if pos (let ((prefix-seq (subseq seq 0 pos))) (list (push-to-end prefix-seq matches) (ltrim-subseq seq prefix-seq) t))))) ; a sequence of patterns was passed to fin next, we will need to test for the ; pattern until we find it (let ((parser-fn (parse-reg-exp ops))) #'(lambda (matches seq) (let ((result nil) (match-length 0) (matched nil) (old-seq seq)) ; try to find a match (loop while (> (length seq) 0) do (setf result (funcall parser-fn nil seq)) ; either failed match OR successful match but nothing matched, ; so consume a character of test-seq if (or (not result) (not (third result)) (= (length seq) (length (second result)))) ; successful but nothing matched do (setf seq (subseq seq 1) match-length (+ match-length 1)) else return (setf matched t)) ; return the match (if matched (let* ((match (subseq old-seq 0 match-length))) (list (push-to-end match matches) seq t)))))))) (defun reg-exp-any-char (chars) "Test if the next character is any one of the characters in 'chars'" (setf chars (first chars)) (if (not (stringp chars)) ; no list of characters were passed, allow this to match any character #'(lambda (matches seq) (if (> (length seq) 0) (list (push-to-end (string (char seq 0)) matches) (subseq seq 1) t))) ; any-char works by creating an or of the characters in the list (reg-exp-or (mapcar #'string (coerce chars 'list))))) (defun reg-exp-string (str) "Check if the first few characters in a string are the same as 'word'" (if (not (stringp str)) (error "Invalid argument type to reg-exp-string.")) (let ((len (length str)) (search-seq str)) ; wtf, why won't 'str' carry down into the lambda (w/o the let) #'(lambda (matches seq) (if (and (<= len (length seq)) (string= seq search-seq :start1 0 :start2 0 :end1 len :end2 len)) (list (push-to-end search-seq matches) (ltrim-subseq seq search-seq) t))))) (defun reg-exp-range (str) "Test if the next character falls between two other characters." (setf str (car str)) (if (or (not (stringp str)) (< (length str) 2)) (error "A string of length 2 must be passed to :range.")) (let ((min (char str 0)) (max (char str 1))) #'(lambda (matches seq) (if (> (length seq) 0) (let* ((ch (char seq 0)) (ch->string (string ch))) ; check if the first character, ch, in seq is such that min <= ch <= max. (if (and (char>= ch min) (char<= ch max)) (list (push-to-end ch->string matches) (ltrim-subseq seq ch->string) t))))))) (defun reg-exp-no-capture (ops) "Create a function that will return the same matches as were passed in and the parsed sequence, i.e. it caputures the sequence updates but doesn't report them in the matches list." (let ((parser-fn (parse-reg-exp ops))) #'(lambda (matches seq) (let ((result (funcall parser-fn matches seq))) (list matches (second result) (third result)))))) (defun reg-exp-group (ops) "Create a function that groups matches." (let ((parser-fn (parse-reg-exp ops))) #'(lambda (matches seq) (let ((result (funcall parser-fn matches seq))) (if (and result (third result)) (list (push-to-end (list (first result)) matches) (second result) t)))))) (defun reg-exp-if (ops) "Create a function to conditionally match sequences." (let ((condition (first ops)) (yes-ops (second ops)) (no-ops (third ops))) (if (or (null condition) (null yes-ops)) (error "Not enough or incorrectly made arguments to reg-exp-if.")) (let ((cond-fn (parse-reg-exp condition)) (cond-true-fn (parse-reg-exp yes-ops)) (cond-false-fn (if (null no-ops) #'(lambda (matches seq) (list matches seq t)) (parse-reg-exp no-ops)))) #'(lambda (matches seq) (let ((result (funcall cond-fn matches seq))) (if (and result (third result)) (funcall cond-true-fn (first result) (second result)) (funcall cond-false-fn matches seq))))))) (defun reg-exp-pass (&optional ops) "Create a function that matches nothing." #'(lambda (matches seq) (list matches seq t))) (defun reg-exp-concat (ops) "Create a function that concatenates any sub-matches into a single string." (let ((parser-fn (parse-reg-exp ops))) #'(lambda (matches seq) (let ((result (funcall parser-fn matches seq))) (if (and result (third result)) ; *** (list (push-to-end (concatenate* 'string (first result)) matches) (second result) t)))))) (defun reg-exp-func-call (fn-name) "Call a regular expression pattern function." #'(lambda (matches seq) ; checking is done at runtime to make sure all defined pattern functions are ; visible to each-other (if (not (gethash fn-name *reg-exps*)) (error (format nil "Unsupport operator or pattern function: ~A" fn-name))) ; call the pattern function (funcall (gethash fn-name *reg-exps*) matches seq))) (defun parse-reg-exp (tokens) "Recursively build up a function to parse a string based on the tokens of the regular expression." ; matched everything we can, stop (if (or (null tokens) (not tokens)) (return-from parse-reg-exp #'(lambda (matches seq) (list matches seq nil)))) ; make sure we were passed a list (if (not (listp tokens)) (setf tokens (list tokens))) (let ((a (car tokens))) ; we've found a symbol, wrap it in a list to parse it (if (symbolp a) (progn (setf a tokens) (setf tokens (list tokens)))) ; figure out what to do with this list (let* ((d (cdr tokens)) ; handle the various regex operators (i (cond ((listp a) (let ((op (car a)) (operands (cdr a))) (if (listp op) (parse-reg-exp op) (let ((fn (gethash op *reg-exp-fns*))) (if fn (funcall fn operands) (reg-exp-func-call op)))))) ((or (characterp a) (stringp a)) (reg-exp-string (string a))) ((error "Unrecognized terminal type.")))) (j (if (not (null d)) (parse-reg-exp d) (reg-exp-pass)))) ; recursively call our string matchers by sequentially chaining them #'(lambda (matches seq) (let ((ret (list matches seq nil))) (if (> (length seq) 0) (let* ((result (funcall i nil seq))) ; we matched something, call the next parser in line (if (and result (third result)) (setf ret (funcall j (push-to-end (first result) matches) (second result)))))) ret))))) (defmacro defun-reg-exp (name &rest tokens) "Define a regular expression pattern function." `(setf (gethash ',name *reg-exps*) ,(parse-reg-exp (as-list tokens)))) (defmacro defmacro-reg-exp (&rest lst) "Register a regular expression pattern handler." (if (not (eq (mod (length lst) 2) 0)) (error "defmacro-reg-exp only accepts a multiple of 2 arguments.")) (let ((ret (loop while (> (length lst) 0) collect `(setf (gethash ',(first lst) *reg-exp-fns*) ,(second lst)) do (setf lst (subseq lst 2))))) (push 'progn ret))) (defmacro reg-exp-match (name str) "Match as much as possible in str given the symbolic name of a pattern function." `(car (funcall (reg-exp-func-call ,name) nil ,str))) ;; register the pattern matching function handlers (defmacro-reg-exp repeat #'reg-exp-make-repeat ;; concat #'reg-exp-concat ;; maybe #'reg-exp-maybe ;; find-next #'reg-exp-find-next ;; or #'reg-exp-or ;; and #'reg-exp-and ;; range #'reg-exp-range ;; any-char #'reg-exp-any-char ;; no-capture #'reg-exp-no-capture ;; group #'reg-exp-group ;; if #'reg-exp-if ;; pass #'reg-exp-pass) ;; ;;;; Mini grammar for simple XML tags (defun-reg-exp :html-tag (no-capture "<") (group (maybe "/") ; closing tag? (group (concat (repeat (or (range "az") (range "09")))) ; namespace / tag name (if (no-capture ":") (concat (repeat (or (range "az") (range "09")))))) ; tag name (group (repeat (group (and (no-capture (repeat (min 1) " ")) ; spaces before parameter (find-next "=") ; parameter name (no-capture "=\"") ; get and ignore the =\" (find-next "\"") ; parameter value (no-capture "\""))))) ; close the \" (no-capture (find-next (or "/>" ">"))) (maybe "/") ; non-closing tag? (no-capture ">"))) (defun-reg-exp :split-html (repeat (and (find-next "<") :html-tag)) (concat (repeat (any-char)))) (print (reg-exp-match :split-html "prefix text
{$comment.content}
postfix text"))