1 # This script must be embedded for the arguments to work
6 # Read "file" document, compute and print its least match penalty for
9 ;(reader-event (fn (x) (write-line (string x)) x))
11 # Map a UTF8 text into its bigram sequence
12 (define (bigrams TEXT)
13 (let ((i -1) (N (utf8len TEXT)))
14 (map (fn (s) (replace "([?.*])" s (string "\\" $1) 0))
15 (clean (fn (x) (< (length x) 2))
16 (collect (when (< (inc i) N) (i 2 TEXT)))))))
18 # Replace newlines and successions of spaces with single spaces
19 (define (read-text FILE)
20 (replace "\\s\\s+" (replace "\n" (read-file FILE) " " 0) " " 0))
22 ; Determine the placements for bigram P in DATA after index i. This
23 ; returns the placment options in ascending order.
24 (define (indexes P DATA (i -1))
25 (collect (setf i (find P DATA 1 (inc i)))))
27 ; Find the first of IL placement lists after i, prepend i to that and
28 ; return. Returns nil if all IL starts at or before i. This is used
29 ; for finding the "best" placement for an antecedent bigram at i, wrt
30 ; the alternative successor bigram "best" placements.
31 (define (last-before i IL)
32 (if (find i IL (fn (x y) (< x (y 0)))) (cons i (IL $it)) (list i)))
34 ; Combine the placement options PL of bigram P relative to best
35 ; placements options of successor bigrams IL. This appends each
36 ; placement option to its "best" successor placement list. The choice
37 ; of dropping this bigram also yields all IL successor placements
38 ; options. The resulting placment options is sorted by ascending first
40 (define (combine P PL IL)
42 (sort (append (map (fn (i) (last-before i IL)) PL) IL))
45 ; Process the bigrams PL to be placed into the text DATA. Returns a
46 ; list of placement lists to indicate alternative ways of placing the
48 (define (chicks PL DATA)
50 (combine (PL 0) (indexes (PL 0) DATA) (chicks (1 PL) DATA))))
52 ; Compute the "gap sum" for the given placement list, which tells how
53 ; much bigrams are separated from each other in the placement list.
54 ; Returns a triplet of 1) the number of discarded bigrams, 2) the
55 ; accumulated separation count and 3) a placment description of where
56 ; and which bigrams are placed.
58 (let ((N (- (length PHRASE) (length L))))
59 (list N (- (apply + (map - L (cons (L 0) L))) (length L) -1))))
63 PHRASE (bigrams (main-args 1))
64 CLIP (/ (* 45 (length PHRASE)) 100)
67 (dolist (FILE (2 (main-args)))
69 (setf LINES '() COUNT 0)
70 (dolist (DATA (parse (read-file FILE) "\n"))
71 (let ((BEST (if (sort (map gapsum (chicks PHRASE DATA))) (first $it))))
73 (when (and BEST (< (BEST 0) CLIP))
74 (push (list (append BEST (list COUNT)) DATA) LINES -1))))
75 (map (fn (x) (println FILE ":" (x -1))) (sort LINES))))