// F# program to recognize non-context free pattern: implements a FSM ////////// The Story: // DNA sequences are represented using the symbols A, C, G and T. As part // of a multi-million dollar research project, a group of geneticists // collected numerous DNA samples from tigers, lions, panthers and other // members of the feline family. After years of research they discovered, // to their shock, that all DNA samples contained a certain pattern: a // number of C's, followed by the same number of A's, followed by the same // number of T's. For example, small felines had sequences like CCCAAATTT, // while large ones (lions and tigers) had DNA that looked more like // CCCCCCCCCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAAAAAAAAATTTTTTTTTTTTTTTTTTTTTTTT. // To confirm their incredible finding, they need a computer program to // automatically detect the presence of this pattern. To their disappointment, // they found that none of the available, off-the-shelf pattern detection // programs can help them. While our geneticists are well-trained in biology, // none of them are computer scientists, so they couldn't possibly know about // the **context free pumping lemma**. This theoretical result implies that // the pattern C^nA^nT^n cannot be described by a *context free grammar*. // They will have to write a specific program to do the job. // Thinking of you as a well-educated computer science student, they have // turned to you for help. Whether you think their hypothesis is ludicrous // is beside the point. You have to help them by implementing a finite state // machine (a finite automaton with memory and actions). open System; type state = Start | Cstate | Astate | Tstate | Accept;; // states of the FSM Console.Write("Enter DNA sequence: "); let instring = Console.ReadLine() + "."; // add . at end to mark end of input let mutable i = 0; // current input index let si = ref -1; // size of pattern found, using reference for illustration let mutable currentState = Start; // current state // global counters let mutable cc = 0; // counts number of C's let mutable ac = 0; // counts number of A's and T's let reset(c,a) = cc<- c ac <- a;; // one to one match between arrows and pattern cases: // given current state, input, action, return next state: // Since currentState is external, don't need to pass it to fsm: let fsm input = match (currentState,input) with | (Start,'C') -> (cc <- 1; Cstate) | (Start,_) -> Start | (Cstate,'C') -> (cc <- cc+1; Cstate) | (Cstate,'A') -> (ac <- 1; Astate) | (Cstate,_) -> reset(0,0) Start | (Astate,'C') -> (reset(1,0); Cstate) | (Astate,'A') -> (ac <- ac+1; Astate) | (Astate,'T') when ac=1 -> (ac<-0; si := 3; Accept) | (Astate,'T') when (ac<=cc && ac>1) -> si := ac*3 // record size of match before ac changes ac <- ac-1 Tstate | (Astate,_) -> (reset(0,0); Start) | (Tstate,'T') when ac>1 -> (ac <- ac-1; Tstate) | (Tstate,'T') when ac=1 -> (ac<-0; Accept) | (Tstate,'C') -> (reset(1,0); Cstate) | (Tstate,_) -> (reset(0,0); Start) | (Accept,_) -> raise(Exception("can't leave accept state"));; /// for tracing let printstate () = Console.Write("current state: "+string(currentState)); Console.WriteLine(", i:chr = "+string(i)+":"+string(instring.[i]));; /// process input string one char at a time: while (currentState <> Accept) && (i< instring.Length) do printstate(); // trace currentState <- fsm(instring.[i]); i <- i+1;; //// Report result: match currentState with | Accept -> printf "Pattern found starting at position %d, size %d\n" (i - !si) (!si) | _ -> printf "Pattern not found\n";;