[plt-scheme] (fast) reading of data files into a hash-table - how?
Jens Axel Søgaard wrote:
> For fun I decided to try inserting 14000 lists of 6 strings
> of length 10 into the finite-map data structure provided by
> Galore. And then do 10000 random lookups.
And here is the same test with a trie.
;;; trie.scm -- Jens Axel Søgaard
; A quick implementation of tries inspired by page 163-165 in
; Chris Okasaki's "Purely Functional Data Structures".
; A trie is a finite map which maps keys in form of lists over
; a base type to values.
; We represent as a triple:
(define-struct trie (end value map) (make-inspector))
; where
; end is a boolean
; value is of the base type
; map is a finite map from the base type to tries
; A trie containing "ca", "car" and "o" can be drawn as
; a tree, where end and value is drawn at the nodes, and
; the map contains the labels of the subtrees.
; #f,-
; /\
; c / \ o
; / \
; #f,- #t,3
; /
; a /
; /
; #t,1
; |
; r |
; |
; #t,2
; End indicates that a certain key sequence has an associated value.
; In the above trie, the sequence (list #\c) has no associated value,
; where as (list #\c #\a) is associated to a 1.
; Here is one way to represent the finite map:
#;
(begin
(require (prefix fm:
(planet "finite-map.scm" ("soegaard" "galore.plt"))))
(define (fm:lookup k m)
(cond
[(fm:get k m) => cdr]
[else #f]))
(define (fm:bind k v m)
(fm:insert k v m))
)
; and here is another (fast with few branches):
(begin
(define (fm:empty)
'())
(define (fm:empty? m)
(null? m))
(define (fm:lookup k m)
(cond
[(assoc k m) => cdr]
[else #f]))
(define (fm:bind k v m)
(cons (cons k v) m))
)
; empty : -> trie
; return an empty trie
(define empty
(let ([e (make-trie #f #f (fm:empty))])
(lambda () e)))
; empty? : trie -> boolean
; determine whether the trie is empty
(define (empty? t)
(and (not (trie-end t))
(fm:empty? (trie-map t))))
(define (handle-not-found t)
(if (eq? t #f)
(empty)
t))
; bind : (list base) object trie -> trie
; extend the trie t with a binding of the key
; sequence ks to the value x
(define (bind ks x t)
(cond
[(null? ks)
(make-trie #t x (trie-map t))]
[else
(let ([k (car ks)]
[ks (cdr ks)]
[m (trie-map t)]
[v (trie-value t)]
[e (trie-end t)])
(let* ([t (handle-not-found (fm:lookup k m))]
[t1 (bind ks x t)])
(make-trie e v (fm:bind k t1 m))))]))
; lookup : (list base) trie -> (union value #f)
; return either the value associated to the key sequence ks,
; or return #f if no association is found
(define (lookup ks t)
(cond
[(and (null? ks) (not (trie-end t)))
#f] ; not found
[(null? ks)
(trie-value t)]
[else
(lookup (cdr ks)
(handle-not-found (fm:lookup (car ks) (trie-map t))))]))
;;; TEST
(define words '(("car" 1)
("cart" 2)
("cat" 3)
("dog" 4)))
(print-struct #t)
; insert all keys with associated values into the trie t
(define t (foldl (lambda (word trie)
(bind (string->list (car word)) (cadr word) trie))
(empty)
words))
; lookup all the inserted words
(map (lambda (word)
(lookup (string->list (car word)) t))
words)
; check #f is returned from unassociated words
(lookup (string->list "carton") t)
(lookup (string->list "ca") t)
(lookup (string->list "") t)
;;;
;;;
;;;
;;; The test from yesterday
(require (lib "67.ss" "srfi")
(lib "42.ss" "srfi")
(lib "27.ss" "srfi"))
; random-char : -> char
; return a random char
(define random-char
(let* ([alphabet "abcdeghijklmnopqrstuvwxyz"]
[len (string-length alphabet)])
(lambda ()
(string-ref alphabet (random len)))))
; random-string : natural -> string
; return a random string of length n
(define (random-string n)
(string-ec (: i n) (random-char)))
; random-strings : natural -> (list string)
; return a random list of length m of strings of length n
(define (random-strings m n)
(list-ec (: i m) (random-string n)))
; 14000 lists of 6 strings of length 10
(define lists-of-strings
(list-ec (: i 14000) (random-strings 6 1)))
; 10000 random keys for lookup timing
(define random-keys
(list-ec (: i 10000)
(list-ref lists-of-strings
(random (length lists-of-strings)))))
;;; TRIE TEST
'TRIE
(collect-garbage)
(define trie
; make a finite map, with the above random keys,
; use (random 10) to generate the associated values
(time (foldl (lambda (s t)
(bind s (random 10) t))
(empty)
lists-of-strings)))
(collect-garbage)
; time 10000 random lookups
(time (for-each (lambda (key)
(lookup key trie))
random-keys))
;;; TEST FINITE MAP
'FINITE-MAP
(require (planet "finite-map.scm" ("soegaard" "galore.plt")))
(collect-garbage)
(define finite-map
; make a finite map, with the above random keys,
; use (random 10) to generate the associated values
(time (foldl (lambda (s fm)
(insert s (random 10) fm))
(empty)
lists-of-strings)))
(collect-garbage)
; time 10000 random lookups
(time (for-each (lambda (key)
(get key finite-map))
random-keys))
--
Jens Axel Søgaard