--- closure/src/util/clex.lisp 2006-12-31 12:38:11.000000000 +0100 +++ cxml-rng/clex.lisp 2007-04-29 20:10:41.000000000 +0200 @@ -25,9 +25,19 @@ ;;; CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, ;;; TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE ;;; SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +;;; + +;;; Changes + +;; When Who What +;; ---------------------------------------------------------------------------- +;; 2007-04-29 DFL - Represent RANGE directly to cope with character +;; set sizes typical for Unicode. +;; - Disable *full-table-p* by default. +;; - Added SBCL case to the CMUCL workarounds. (defpackage :clex - (:use :cl :glisp :runes) + (:use :cl :runes) (:export #:deflexer #:backup #:begin #:initial #:bag)) @@ -45,16 +55,67 @@ id ;numeric id of state eps-transitions) ;list of all states reached by epsilon (empty transitions) -(defun state-add-link (this char that) - "Add a transition to state `this'; reading `char' proceeds to `that'." - (cond ((eq char 'eps) +(defun destructure-range (x) + (if (listp x) + (values (car x) (cadr x)) + (values x x))) + +(defun range- (a b) + (multiple-value-bind (amin amax) (destructure-range a) + (multiple-value-bind (bmin bmax) (destructure-range b) + (incf amax) + (incf bmax) + (let ((result nil)) + (flet ((range* (min max) + (when (< min max) + (push (list min (1- max)) result)))) + (range* amin (min bmin amax)) + (range* (max amin bmax) amax)) + result)))) + +(defun ranges- (aa b) + (mapcan (lambda (a) (range- a b)) aa)) + +(defun partition-range (a pos) + (multiple-value-bind (min max) (destructure-range a) + (if (and (< min pos) (<= pos max)) + (list (list min (1- pos)) + (list pos max)) + (list a)))) + +(defun code (x) + (typecase x + (integer x) + (character (char-code x)))) + +(defun parse-range (range) + (if (listp range) + (list (code (car range)) (code (cadr range))) + (list (code range) (code range)))) + +(defun state-add-link (this range that) + "Add a transition to state `this'; reading `range' proceeds to `that'." + (cond ((eq range 'eps) (pushnew that (state-eps-transitions this))) - (t - (dolist (k (state-transitions this) - (push (cons (list char) that) (state-transitions this))) - (when (eq (cdr k) that) - (pushnew char (car k)) - (return nil))) ))) + (t + (let ((new (list (parse-range range)))) + (dolist (k (state-transitions this) + (push (cons new that) (state-transitions this))) + (when (eq (cdr k) that) + (dolist (l (car k)) ;avoid duplicates + (setf new (ranges- new l))) + (setf (car k) (append new (car k))) + (return nil))) + ;; split existing ranges to remove overlap + (dolist (k (state-transitions this)) + (flet ((doit (pos) + (setf (car k) + (mapcan (lambda (l) + (partition-range l pos)) + (car k))))) + (dolist (n new) + (doit (car n)) + (doit (1+ (cadr n)))))))))) ;;; When constructing FSA's from regular expressions we abstract by the notation ;;; of FSA's as boxen with an entry and an exit state. @@ -134,14 +195,13 @@ (setf term (regexp-expand-splicing term)) (cond ((and (atom term) (not (stringp term))) (fsa-trivial term)) + ((loose-eq (car term) 'RANGE) + (fsa-trivial (cdr term))) ((loose-eq (car term) 'AND) (regexp/and->fsa term)) ((loose-eq (car term) 'OR) (regexp/or->fsa term)) ((loose-eq (car term) '*) (fsa-iterate (regexp->fsa (cadr term)))) ((loose-eq (car term) '+) (regexp->fsa `(AND ,(cadr term) (* ,(cadr term))))) ((loose-eq (car term) '?) (regexp->fsa `(OR (AND) ,(cadr term)))) - ((loose-eq (car term) 'RANGE) - (regexp->fsa `(OR .,(loop for i from (char-code (cadr term)) to (char-code (caddr term)) - collect (code-char i))))) (t (regexp->fsa `(AND .,term))) )) @@ -251,7 +311,7 @@ (push state-set batch) new))) (add-state-set (state-set) - (let ((new-tr nil) + (let ((new-tr (make-hash-table :test 'equal)) (new-tr-real nil) (name (name-state-set state-set)) (new-final 0)) @@ -261,14 +321,13 @@ (dolist (tr (state-transitions s)) (let ((to (cdr tr))) (dolist (z (car tr)) - (let ((looked (getf new-tr z nil))) + (let ((looked (gethash z new-tr))) (if looked (fsa-epsilon-closure/set to looked) (let ((sts (make-empty-set n))) (fsa-epsilon-closure/set to sts) - (setf (getf new-tr z) sts) )))))))) - (setq new-tr (frob2 new-tr)) - (do ((q new-tr (cddr q))) + (setf (gethash z new-tr) sts))))))))) + (do ((q (frob2 new-tr) (cddr q))) ((null q)) (let ((z (car q)) (to (cadr q))) @@ -286,14 +345,15 @@ (add-state-set (pop batch)))) )))) (defun frob2 (res &aux res2) - (do ((q res (cddr q))) - ((null q) res2) - (do ((p res2 (cddr p))) - ((null p) - (setf res2 (list* (list (car q)) (cadr q) res2))) - (when (equal (cadr q) (cadr p)) - (setf (car p) (cons (car q) (car p))) - (return))))) + (maphash (lambda (z to) + (do ((p res2 (cddr p))) + ((null p) + (setf res2 (list* (list z) to res2))) + (when (equal to (cadr p)) + (setf (car p) (cons z (car p))) + (return)))) + res) + res2) ;;;; ---------------------------------------------------------------------------------------------------- ;;;; API @@ -317,11 +377,11 @@ ;;; - identifing sub-expression of regexps (ala \(..\) and \n) ;;; -#-(OR CMU GCL) +#-(OR CMU SBCL GCL) (defun loadable-states-form (starts) `',starts) -#+(OR CMU GCL) +#+(OR CMU SBCL GCL) ;; Leider ist das CMUCL so dumm, dass es scheinbar nicht faehig ist die ;; selbstbezuegliche Structur ',starts in ein FASL file zu dumpen ;-( ;; Deswegen hier dieser read-from-string Hack. @@ -332,124 +392,6 @@ (*print-pretty* nil)) (prin1-to-string starts))))) -(defmacro old/deflexer (name macro-defs &rest rule-defs) - (let ((macros nil) starts clauses (n-fin 0)) - (dolist (k macro-defs) - (push (cons (car k) (sublis macros (cadr k))) macros)) - ;;canon clauses -- each element of rule-defs becomes (start expr end action) - (setq rule-defs - (mapcar #'(lambda (x) - (cond ((and (consp (car x)) (eq (caar x) 'in)) - (list (cadar x) (sublis macros (caddar x)) (progn (incf n-fin) n-fin) (cdr x))) - ((list 'initial (sublis macros (car x)) (progn (incf n-fin) n-fin) (cdr x))))) - (reverse rule-defs))) - ;;collect all start states in alist ( . ) - (setq starts (mapcar #'(lambda (name) - (cons name (make-state))) - (remove-duplicates (mapcar #'car rule-defs)))) - ;;build the nd-fsa's - (dolist (r rule-defs) - (destructuring-bind (start expr end action) r - (let ((q0 (cdr (assoc start starts))) - (fsa (regexp->fsa `(and ,expr)))) - ;;link start state - (state-add-link q0 'eps (fsa-start fsa)) - ;;mark final state - (setf (state-final (fsa-end fsa)) end) - ;; build a clause for CASE - (push `((,end) .,action) clauses)))) - ;; hmm... we have to sort the final states after building the dfsa - ;; or introduce fixnum identifier and instead of union take the minimum - ;; above in ndfsa->dfsa. - (progn - (mapcar #'(lambda (x y) (setf (cdr x) y)) - starts (ndfsa->dfsa (mapcar #'cdr starts)))) - ;; (print (number-states starts)) - `(DEFUN ,(intern (format nil "MAKE-~A-LEXER" name)) (INPUT) - (LET* ((STARTS ,(loadable-states-form starts)) - (SUB-STATE 'INITIAL) - (STATE NIL) - (LOOK-AHEAD NIL) - (BAGG/CH (G/MAKE-STRING 100 :FILL-POINTER 0 :ADJUSTABLE T)) - (BAGG/STATE (MAKE-ARRAY 100 :FILL-POINTER 0 :ADJUSTABLE T)) - (CH NIL)) - #'(LAMBDA () - (BLOCK NIL - (LABELS ((BEGIN (X) - (SETQ SUB-STATE X)) - (BACKUP (CH) - (COND ((STRINGP CH) - (WHEN (> (LENGTH CH) 0) - (PUSH (CONS 0 CH) LOOK-AHEAD))) - (T (PUSH CH LOOK-AHEAD)))) - (PUSH* (CH STATE) - (VECTOR-PUSH-EXTEND CH BAGG/CH 10) - (VECTOR-PUSH-EXTEND STATE BAGG/STATE 10) ) - (POP*/CH () - (LET ((FP (LENGTH BAGG/CH))) - (PROG1 (AREF BAGG/CH (1- FP)) - (SETF (FILL-POINTER BAGG/STATE) (1- FP)) - (SETF (FILL-POINTER BAGG/CH) (1- FP))))) - (TOS*/STATE () - (AREF BAGG/STATE (1- (LENGTH BAGG/STATE))) ) - (EMPTY*? () - (= (LENGTH BAGG/CH) 0)) - (REWIND* () - (SETF (FILL-POINTER BAGG/CH) 0) - (SETF (FILL-POINTER BAGG/STATE) 0) ) - (STRING* () - (COPY-SEQ BAGG/CH)) - #+(OR) - (FIND-NEXT-STATE (CH STATE) - (DOLIST (K (STATE-TRANSITIONS STATE)) - (WHEN (MEMBER CH (CAR K)) - (RETURN (CDR K))))) - (GETCH () - (COND ((NULL LOOK-AHEAD) (READ-CHAR INPUT NIL NIL)) - ((CONSP (CAR LOOK-AHEAD)) - (LET ((S (CDAR LOOK-AHEAD))) - (PROG1 - (AREF S (CAAR LOOK-AHEAD)) - (INCF (CAAR LOOK-AHEAD)) - (WHEN (= (CAAR LOOK-AHEAD) (LENGTH S)) - (POP LOOK-AHEAD))))) - (T (POP LOOK-AHEAD)) ))) - (DECLARE (INLINE BACKUP GETCH)) - (TAGBODY - START (SETQ STATE (CDR (ASSOC SUB-STATE STARTS))) - (WHEN (NULL STATE) - (ERROR "Sub-state ~S is not defined." SUB-STATE)) - (REWIND*) - LOOP (SETQ CH (GETCH)) - (LET ((NEXT-STATE - (BLOCK FOO - (DOLIST (K (STATE-TRANSITIONS STATE)) - (DOLIST (Q (CAR K)) - (WHEN (EQL CH Q) - (RETURN-FROM FOO (CDR K)))))) )) - (COND ((NULL NEXT-STATE) - (BACKUP CH) - (DO () - ((OR (EMPTY*?) (NOT (EQ 0 (TOS*/STATE))))) - (BACKUP (POP*/CH))) - (COND ((AND (EMPTY*?) (NULL CH)) - (RETURN :EOF)) - ((EMPTY*?) - (ERROR "oops ~S ~S" ch (mapcar #'car (state-transitions state)))) - (T - (LET ((HALTING-STATE (TOS*/STATE))) - (LET ((BAG* NIL)) - (SYMBOL-MACROLET ((BAG (IF BAG* - BAG* - (SETF BAG* (STRING*))))) - (CASE HALTING-STATE - ,@clauses))) - (GO START))))) - (T - (PUSH* CH (STATE-FINAL NEXT-STATE)) - (SETQ STATE NEXT-STATE) - (GO LOOP)))))))))))) - ;;;; ---------------------------------------------------------------------------------------------------- ;;;; @@ -475,14 +417,16 @@ ;;;; ------------------------------------------------------------------------------------------ -(defparameter *full-table-p* t) +(defparameter *full-table-p* nil) (defun mungle-transitions (trs) (if *full-table-p* (let ((res (make-array 256 :initial-element nil))) (dolist (tr trs) - (dolist (ch (car tr)) - (setf (aref res (char-code ch)) (cdr tr)))) + (dolist (range (car tr)) + (loop + for code from (car range) to (cadr range) + do (setf (aref res code) (cdr tr))))) res) trs)) @@ -543,7 +487,8 @@ (SUB-STATE 'INITIAL) (STATE NIL) (LOOK-AHEAD NIL) - (BAGG/CH (G/MAKE-STRING 100 :FILL-POINTER 0 :ADJUSTABLE T)) + (BAGG/CH (MAKE-ARRAY 100 :FILL-POINTER 0 :ADJUSTABLE T + :ELEMENT-TYPE 'CHARACTER)) (BAGG/STATE (MAKE-ARRAY 100 :FILL-POINTER 0 :ADJUSTABLE T)) (CH NIL)) #'(LAMBDA () @@ -588,11 +533,12 @@ (SVREF (STATE-TRANSITIONS STATE) (CHAR-CODE CH)) NIL)) `(FIND-NEXT-STATE (STATE CH) - (BLOCK FOO - (DOLIST (K (STATE-TRANSITIONS STATE)) - (DOLIST (Q (CAR K)) - (WHEN (CHAR= CH Q) - (RETURN-FROM FOO (CDR K)))))))) ) + (WHEN ch + (BLOCK FOO + (DOLIST (K (STATE-TRANSITIONS STATE)) + (DOLIST (Q (CAR K)) + (WHEN (<= (CAR Q) (CHAR-CODE CH) (CADR q)) + (RETURN-FROM FOO (CDR K))))))))) ) (DECLARE (INLINE BACKUP GETCH FIND-NEXT-STATE)) (TAGBODY START (SETQ STATE (CDR (ASSOC SUB-STATE STARTS))) @@ -609,7 +555,10 @@ (COND ((AND (EMPTY*?) (NULL CH)) (RETURN :EOF)) ((EMPTY*?) - (ERROR "oops ~S ~S" ch (mapcar #'car (state-transitions state)))) + (ERROR "oops at ~A: ~S ~S" + (FILE-POSITION INPUT) + ch + (mapcar #'car (state-transitions state)))) (T (LET ((HALTING-STATE (TOS*/STATE))) (LET ((BAG* NIL))