# This script must be embedded for the arguments to work # $0 = this program # $1 = file to review # $2 = phrase to find # Read "file" document, compute and print its least match penalty for # the given phrase ;(reader-event (fn (x) (write-line (string x)) x)) # Map a UTF8 text into its bigram sequence (define (bigrams TEXT) (let ((i -1) (N (utf8len TEXT))) (map (fn (s) (replace "([?.*])" s (string "\\" $1) 0)) (clean (fn (x) (< (length x) 2)) (collect (when (< (inc i) N) (i 2 TEXT))))))) # Replace newlines and successions of spaces with single spaces (define (read-text FILE) (replace "\\s\\s+" (replace "\n" (read-file FILE) " " 0) " " 0)) ; Determine the placements for bigram P in DATA after index i. This ; returns the placment options in ascending order. (define (indexes P DATA (i -1)) (collect (setf i (find P DATA 1 (inc i))))) ; Find the first of IL placement lists after i, prepend i to that and ; return. Returns nil if all IL starts at or before i. This is used ; for finding the "best" placement for an antecedent bigram at i, wrt ; the alternative successor bigram "best" placements. (define (last-before i IL) (if (find i IL (fn (x y) (< x (y 0)))) (cons i (IL $it)) (list i))) ; Combine the placement options PL of bigram P relative to best ; placements options of successor bigrams IL. This appends each ; placement option to its "best" successor placement list. The choice ; of dropping this bigram also yields all IL successor placements ; options. The resulting placment options is sorted by ascending first ; placement. (define (combine P PL IL) (if (null? PL) IL IL (sort (append (map (fn (i) (last-before i IL)) PL) IL)) (map list PL))) ; Process the bigrams PL to be placed into the text DATA. Returns a ; list of placement lists to indicate alternative ways of placing the ; given bigrams. (define (chicks PL DATA) (if (null? PL) (list) (combine (PL 0) (indexes (PL 0) DATA) (chicks (1 PL) DATA)))) ; Compute the "gap sum" for the given placement list, which tells how ; much bigrams are separated from each other in the placement list. ; Returns a triplet of 1) the number of discarded bigrams, 2) the ; accumulated separation count and 3) a placment description of where ; and which bigrams are placed. (define (gapsum L) (let ((N (- (length PHRASE) (length L)))) (list N (- (apply + (map - L (cons (L 0) L))) (length L) -1)))) ;;; Load arguments (setf PHRASE (bigrams (main-args 1)) CLIP (/ (* 45 (length PHRASE)) 100) ) (dolist (FILE (2 (main-args))) (when (file? FILE) (setf LINES '() COUNT 0) (dolist (DATA (parse (read-file FILE) "\n")) (let ((BEST (if (sort (map gapsum (chicks PHRASE DATA))) (first $it)))) (inc COUNT) (when (and BEST (< (BEST 0) CLIP)) (push (list (append BEST (list COUNT)) DATA) LINES -1)))) (map (fn (x) (println FILE ":" (x -1))) (sort LINES)))) (exit 0)