-------------------------------------------------------------------------------- -- Partial Derivative Regular Expression Pattern Matching -- (or XHaskell light: Regular expression pattern matching -- interpreter style) -- Martin Sulzmanan and Kenny Z.M. Lu -------------------------------------------------------------------------------- module RegExpPatMatch where import Monad ------------------------------------------------------------------------- -- Step 1: An implementation of the word problem via partial derivatives -- -- (l,w) in L(r) iff w in L(pd(r,l)) -- where L(pd(r,l)) = { w' | (l,w') in L(r) } -- pd(r,l) is the partial derivative of r wrt l -- -- we can straightforwardly build a word matching algorithm -- by assuming that -- regexpressions r denote states -- each r where isEmpty r is an acceptances state -- state transitions are computed by building the partial derivative -- the regular expression language, parameterized over some alphabet data RE a where Phi :: RE a -- empty language Empty :: RE a -- empty word L :: a -> RE a -- single letter taken from alphabet a Choice :: RE a -> RE a -> RE a -- r1 + r2 Seq :: RE a -> RE a -> RE a -- (r1,r2) Star :: RE a -> RE a -- r* deriving Eq -- some combinators re_star r = Star r re_empty = Empty re_letter l = L l re_choice r1 r2 = Choice r1 r2 re_pair r1 r2 = Seq r1 r2 -- a word is a list of alphabet letters type Word a = [a] -- pretty printing of regular expressions instance Show a => Show (RE a) where show Phi = "{}" show Empty = "<>" show (L c) = show c show (Choice r1 r2) = ("(" ++ show r1 ++ "|" ++ show r2 ++ ")") show (Seq r1 r2) = ("<" ++ show r1 ++ "," ++ show r2 ++ ">") show (Star r) = (show r ++ "*") -- match r w iff the word w is in the language denoted by r matchWord :: Eq a => RE a -> Word a -> Bool matchWord r [] = isEmpty r matchWord r (l:w) = matchWord (partDeriv r l) w -- isEmpty r iff the language denoted by r contains the empty word isEmpty :: RE a -> Bool isEmpty Phi = False isEmpty Empty = True isEmpty (Choice r1 r2) = (isEmpty r1) || (isEmpty r2) isEmpty (Seq r1 r2) = (isEmpty r1) && (isEmpty r2) isEmpty (Star r) = True isEmpty (L _) = False -- partDeriv r l denotes the regular expression where the -- "leading l has been removed" partDeriv :: Eq a => RE a -> a -> RE a partDeriv Phi _ = Phi partDeriv Empty _ = Phi partDeriv (L l1) l2 | l1 == l2 = Empty | otherwise = Phi partDeriv (Choice r1 r2) l = Choice (partDeriv r1 l) (partDeriv r2 l) partDeriv (Seq r1 r2) l = if isEmpty r1 then Choice (Seq (partDeriv r1 l) r2) (partDeriv r2 l) else Seq (partDeriv r1 l) r2 partDeriv (this@(Star r)) l = Seq (partDeriv r l) this -- some examples r1 = (Choice (Star (Seq (L 'A') (L 'B'))) (L 'B')) -- ((A,B)*|B) r2 = Seq (Star (L 'A')) (Star (L 'A')) -- (A*,A*) r3 = Star (Choice (L 'A') (L 'B')) -- (A|B)* w1 = "AABBAAA" m1 = matchWord r1 w1 m2 = matchWord r2 w1 m3 = matchWord r3 w1 ------------------------------------------------------------------------- -- Step 2: A partial derivative regular expression pattern matching -- -- (l,w) <| p ~> Env iff w <| pdPat l p ~> Env -- pronounced as -- word (l,w) matches the pattern p and yields environment Env -- iff w matches the partial pattern derivative p wrt l {- pdPat l ( x : r) = x : pd(l | r) variable pattern ( x : r) says that we have already consumed w, does l fit in? We simply append l and build the partial derivative of r wrt l pdPat l = if () \in p1 then | -- (1) else if p1 is empty we need to cover the case that p1 is matched against the empty word, mkEmpty p1 replaces all empty regexp in p1 by Empty, otherwise we replace by Phi Case (1) makes pattern matching indeterministic, we compute all possible matchings. For longest match, we first match against , if the match fails we try pdPat l (p1|p2)) = (pdPat l p1) | (pdPat l p2) -} -- the pattern language, we assume that patterns are linear data Pat a where PVar :: Int -> Word a -> RE a-> Pat a -- PVar var w r -- variables var are represented by Ints -- w represents the part we have already seen -- r represents the remaining part we yet have to match PPair :: Pat a -> Pat a -> Pat a PChoice :: Pat a -> Pat a -> Pat a -- combinators pat_var n r = PVar n [] r pat_pair p1 p2 = PPair p1 p2 pat_choice p1 p2 = PChoice p1 p2 -- pretty printing instance Show a => Show (Pat a) where show (PVar i w r) = ("x"++show i ++ "::" ++ show r) show (PPair p1 p2) = ("<" ++ show p1 ++ "," ++ show p2 ++ ">") show (PChoice p1 p2) = "(" ++ show p1 ++ "|" ++ show p2 ++ ")" -- the binding of pattern variables to words type Env a = [(Int,Word a)] -- we compute all possible matchings -- indet arise due to pat choice -- pdPat yields pat choice patMatch :: Eq a => Pat a -> Word a -> [Env a] patMatch p (l:w) = patMatch (pdPat p l) w patMatch (PVar x w r) [] = if isEmpty r then [[(x,w)]] else [] patMatch (PChoice p1 p2) [] = (patMatch p1 []) ++ (patMatch p2 []) -- indet choice patMatch (PPair p1 p2) [] = (patMatch p1 []) `combine` (patMatch p2 []) -- build all possible combinations where combine xss yss = [ xs ++ ys | xs <- xss, ys <- yss] longPatMatch :: Eq a => Pat a -> Word a -> Maybe (Env a) longPatMatch p w = first (patMatch p w) where first (env:_) = return env first _ = Nothing shortPatMatch :: Eq a => Pat a -> Word a -> Maybe (Env a) shortPatMatch p w = last (patMatch p w) where last [env] = return env last (_:xs) = last xs last _ = Nothing -- partial derivative of patterns pdPat :: Eq a => Pat a -> a -> Pat a pdPat (PVar x w r) l = PVar x (w ++ [l]) (partDeriv r l) pdPat (PPair p1 p2) l = if (isEmpty (strip p1)) then PChoice (PPair (pdPat p1 l) p2) (PPair (mkEmpPat p1) (pdPat p2 l)) else PPair (pdPat p1 l) p2 pdPat (PChoice p1 p2) l = PChoice (pdPat p1 l) (pdPat p2 l) strip :: Pat a -> RE a strip (PVar _ w r) = r strip (PPair p1 p2) = Seq (strip p1) (strip p2) strip (PChoice p1 p2) = Choice (strip p1) (strip p2) -- replace all ( x : r) by ( x: <>) if isEmpty r -- otherwise yield ( x: Phi) mkEmpPat :: Pat a -> Pat a mkEmpPat (PVar x w r) | isEmpty r = PVar x w Empty | otherwise = PVar x w Phi mkEmpPat (PPair p1 p2) = PPair (mkEmpPat p1) (mkEmpPat p2) mkEmpPat (PChoice p1 p2) = PChoice (mkEmpPat p1) (mkEmpPat p2) -- examples pvar n = (PVar n [] (Star (L 'A'))) -- Burak's example, r = (Choice (Star (Seq (L 'A') (L 'B'))) (L 'B')) -- ((A,B)*|B) p = PPair (pvar 1) (PVar 2 [] r) -- pm = longPatMatch p "AB" pm' = patMatch p "AB" p2 = PPair (pvar 1) (pvar 2) -- p3 = PPair (PVar 1 [] r3) (PVar 2 [] r3) -- -- p4 = << x : (A|), y : (>|A) >, z : (|C) > p4 = PPair (PPair p_x p_y) p_z where p_x = PVar 1 [] (Choice (L 'A') (Seq (L 'A') (L 'B'))) p_y = PVar 2 [] (Choice (Seq (L 'B') (Seq (L 'A') (L 'A'))) (L 'A')) p_z = PVar 3 [] (Choice (Seq (L 'A') (L 'C')) (L 'C')) pm4 = longPatMatch p4 "ABAAC" p5 = PPair p_x (PPair p_y p_z) where p_x = PVar 1 [] (Choice (L 'A') (Seq (L 'A') (L 'B'))) p_y = PVar 2 [] (Choice (Seq (L 'B') (Seq (L 'A') (L 'A'))) (L 'A')) p_z = PVar 3 [] (Choice (Seq (L 'A') (L 'C')) (L 'C')) p6 = PPair p_x (PPair p_y p_z) where p_x = PVar 1 [] (L 'A') p_y = PVar 2 [] (L 'B') p_z = PVar 3 [] (L 'C') p7 = PPair (PPair p_x p_y) p_z where p_x = PVar 1 [] (L 'A') p_y = PVar 2 [] (L 'B') p_z = PVar 3 [] (L 'C') p7A = pdPat p7 'A' p7AB = pdPat p7A 'B' p7ABC = pdPat p7AB 'C' -- a possible integration into Haskell class View a b | a -> b, b -> a where view :: a -> Word b reverse_view :: Word b -> a instance View String Char where view = id reverse_view = id -- (y:(A|B)*, z:(A|B)*) -> (y,z) f1 x = case longPatMatch pat (view x) of Just env -> let Just y' = (lookup y_p env) Just z' = (lookup z_p env) y = reverse_view y' z = reverse_view z' in (y,z) where regexp = re_star ((re_letter 'A') `re_choice` (re_letter 'B')) -- (A|B)* y_p = 1::Int z_p = 2::Int pat = pat_pair (pat_var y_p regexp) (pat_var z_p regexp) {- lastWord :: Word Char -> Nothing Char lastWord (x1::Empty) = Nothing lastWord (x2::Char) = Just x2 lastWord (x3::Char, x4:Char*) = Just (lastWord x4) -} lastWord :: Word Char -> Maybe Char lastWord x = case longPatMatch p1 x of Just env1 -> Nothing _ -> case longPatMatch p2 x of Just env2 -> do { x <- lookup x2 env2 ; return (cast x) } _ -> case longPatMatch p3 x of Just env3 -> do { xs <- lookup x4 env3 ; lastWord xs } where x1 = 1::Int x2 = 2::Int x3 = 3::Int x4 = 4::Int p1 = pat_var x1 Empty -- (x1::Empty) p2 = pat_var x2 (L 'A') -- (x2::'A') p3 = pat_pair (pat_var x3 (L 'A')) (pat_var x4 (Star (L 'A'))) -- (x3::'A',x4::'A'*) cast :: Word Char -> Char cast [x] = x