[plt-scheme] (fast) reading of data files into a hash-table - how?

From: Jens Axel Søgaard (jensaxel at soegaard.net)
Date: Sun Jan 1 17:50:10 EST 2006

Jens Axel Søgaard wrote:
> For fun I decided to try inserting 14000 lists of 6 strings
> of length 10 into the finite-map data structure provided by
> Galore. And then do 10000 random lookups. 

And here is the same test with a trie.

;;; trie.scm  --  Jens Axel Søgaard

; A quick implementation of tries inspired by page 163-165 in
; Chris Okasaki's "Purely Functional Data Structures".

; A trie is a finite map which maps keys in form of lists over
; a base type to values.

; We represent as a triple:

(define-struct trie (end value map) (make-inspector))

; where
;   end    is a boolean
;   value  is of the base type
;   map    is a finite map from the base type to tries

; A trie containing "ca", "car" and "o" can be drawn as
; a tree, where end and value is drawn at the nodes, and
; the map contains the labels of the subtrees.

;         #f,-
;          /\
;      c  /  \ o
;        /    \
;      #f,-  #t,3
;      /
;   a /
;    /
;  #t,1
;   |
; r |
;   |
;  #t,2

; End indicates that a certain key sequence has an associated value.
; In the above trie, the sequence (list #\c) has no associated value,
; where as (list #\c #\a) is associated to a 1.

; Here is one way to represent the finite map:

#;
(begin
(require (prefix fm:
                  (planet "finite-map.scm" ("soegaard" "galore.plt"))))

(define (fm:lookup k m)
   (cond
     [(fm:get k m) => cdr]
     [else            #f]))

(define (fm:bind k v m)
   (fm:insert k v m))
)

; and here is another (fast with few branches):

(begin
(define (fm:empty)
   '())
(define (fm:empty? m)
   (null? m))
(define (fm:lookup k m)
   (cond
     [(assoc k m) => cdr]
     [else           #f]))
(define (fm:bind k v m)
   (cons (cons k v) m))
)


; empty : -> trie
;   return an empty trie
(define empty
   (let ([e (make-trie #f #f (fm:empty))])
     (lambda () e)))

; empty? : trie -> boolean
;   determine whether the trie is empty
(define (empty? t)
   (and (not (trie-end t))
        (fm:empty? (trie-map t))))

(define (handle-not-found t)
   (if (eq? t #f)
       (empty)
       t))

; bind : (list base) object trie -> trie
;   extend the trie t with a binding of the key
;   sequence ks to the value x
(define (bind ks x t)
   (cond
     [(null? ks)
      (make-trie #t x (trie-map t))]
     [else
      (let ([k  (car ks)]
            [ks (cdr ks)]
            [m  (trie-map t)]
            [v  (trie-value t)]
            [e  (trie-end t)])
        (let* ([t  (handle-not-found (fm:lookup k m))]
               [t1 (bind ks x t)])
          (make-trie e v (fm:bind k t1 m))))]))

; lookup : (list base) trie -> (union value #f)
;   return either the value associated to the key sequence ks,
;   or return #f if no association is found
(define (lookup ks t)
   (cond
     [(and (null? ks) (not (trie-end t)))
      #f] ; not found
     [(null? ks)
      (trie-value t)]
     [else
      (lookup (cdr ks)
              (handle-not-found (fm:lookup (car ks) (trie-map t))))]))

;;; TEST

(define words '(("car" 1)
                 ("cart" 2)
                 ("cat" 3)
                 ("dog" 4)))

(print-struct #t)

; insert all keys with associated values into the trie t
(define t (foldl (lambda (word trie)
                    (bind (string->list (car word)) (cadr word) trie))
                  (empty)
                  words))

; lookup all the inserted words
(map (lambda (word)
        (lookup (string->list (car word)) t))
      words)

; check #f is returned from unassociated words
(lookup (string->list "carton") t)
(lookup (string->list "ca") t)
(lookup (string->list "") t)


;;;
;;;
;;;

;;; The test from yesterday
(require (lib "67.ss" "srfi")
          (lib "42.ss" "srfi")
          (lib "27.ss" "srfi"))

; random-char : -> char
;   return a random char
(define random-char
  (let* ([alphabet "abcdeghijklmnopqrstuvwxyz"]
         [len      (string-length alphabet)])
    (lambda ()
      (string-ref alphabet (random len)))))

; random-string : natural -> string
;   return a random string of length n
(define (random-string n)
  (string-ec (: i n) (random-char)))

; random-strings : natural -> (list string)
;   return a random list of length m of strings of length n
(define (random-strings m n)
  (list-ec (: i m) (random-string n)))

; 14000 lists of 6 strings of length 10
(define lists-of-strings
  (list-ec (: i 14000) (random-strings 6 1)))

; 10000 random keys for lookup timing
(define random-keys
   (list-ec (: i 10000)
            (list-ref lists-of-strings
                      (random (length lists-of-strings)))))


;;; TRIE TEST
'TRIE

(collect-garbage)
(define trie
  ; make a finite map, with the above random keys,
  ; use (random 10) to generate the associated values
   (time (foldl (lambda (s t)
                  (bind s (random 10) t))
                (empty)
                lists-of-strings)))

(collect-garbage)
; time 10000 random lookups
(time (for-each (lambda (key)
                  (lookup key trie))
        random-keys))


;;; TEST FINITE MAP
'FINITE-MAP


(require (planet "finite-map.scm" ("soegaard" "galore.plt")))
(collect-garbage)
(define finite-map
  ; make a finite map, with the above random keys,
  ; use (random 10) to generate the associated values
  (time (foldl (lambda (s fm)
                 (insert s (random 10) fm))
               (empty)
               lists-of-strings)))

(collect-garbage)
; time 10000 random lookups
(time (for-each (lambda (key)
                  (get key finite-map))
        random-keys))


-- 
Jens Axel Søgaard




Posted on the users mailing list.