[plt-scheme] help on how to write a frequency-counting function in a more functional way

From: Matthias Felleisen (matthias at ccs.neu.edu)
Date: Sun Apr 19 18:01:08 EDT 2009

So I ran the thing for four different sample sizes:

1000 @ vector: cpu time: 14 real time: 14 gc time: 0
1000 @ a list: cpu time: 14 real time: 14 gc time: 0
1000 @ bst   : cpu time: 21 real time: 21 gc time: 0
1000 @ hash  : cpu time: 14 real time: 15 gc time: 0

10000 @ vector: cpu time: 82 real time: 86 gc time: 0
10000 @ a list: cpu time: 81 real time: 82 gc time: 0
10000 @ bst   : cpu time: 181 real time: 184 gc time: 0
10000 @ hash  : cpu time: 93 real time: 94 gc time: 0

100000 @ vector: cpu time: 440 real time: 448 gc time: 0
100000 @ a list: cpu time: 543 real time: 553 gc time: 0
100000 @ bst   : cpu time: 1645 real time: 1671 gc time: 23
100000 @ hash  : cpu time: 815 real time: 836 gc time: 38

1000000 @ vector: cpu time: 3603 real time: 3669 gc time: 0
1000000 @ a list: cpu time: 8015 real time: 8346 gc time: 1539
1000000 @ bst   : cpu time: 15353 real time: 15605 gc time: 349
1000000 @ hash  : cpu time: 7632 real time: 7807 gc time: 455

As soon as you have 100,000 frequencies in an approximate range of  
20,000 steps, the functional solutions don't look that good compared  
to vectors. With 1,000,000 it's unquestionable. I bet you can repeat  
this experiment in C and get similar results. In a sparse world, the  
results are indifferent.

I will leave it to Sigrid to play with imperative hashes and balanced  
trees and who knows what.

The code is attached. It should be easy to play with other parameters  
of the experiment

-- Matthias

#lang scheme

(require mzlib/etc)

(define HIGH 22000)
(define LOW     80)
;; Frequency is in [LOW ... HIGH)

(define SIZE 1000000) ;; sample size for file

(define F "~s: ~s\n") ;; output format for result lines

;; Nat -> Void
(define (experiment SIZE)
   ;;  
------------------------------------------------------------------------ 
-----
   ;; creating the sample

   ;; String Nat -> Void
   ;; generate file f with n frequencies between 80 and 22000
   (define (gen f n)
     (define (freq) (+ LOW (random (- HIGH LOW))))
     (with-output-to-file f
       (lambda () (for ((i (in-range n))) (printf "~s " (freq))))
       #:exists 'truncate))

   (define create_output (gen "tmp1" SIZE))

   (define (test str f t)
     (collect-garbage)
     (printf "~a @ ~a " SIZE str)
     (time (f "tmp1" t))
     (void))

   (test "vector:" cnt-vec  "tmp-vec")
   (test "a list:" cnt-alst "tmp-alst")
   (test "bst   :" cnt-BST  "tmp-bst")
   (test "hash  :" cnt-ht   "tmp-hash"))

;;  
------------------------------------------------------------------------ 
-----
;; functions for gathering statistics
;; read file f and count how many times each frequency occurs in a  
vector
;; write result to file g in ascending order of frequencies

;; String String -> Void
;; imperative vector update via [Vectorof Nat]
(define (cnt-vec f g)
   (define a (make-vector (- HIGH LOW)))
   (define (up freq)
     (define i (- freq LOW))
     (vector-set! a i (+ (vector-ref a i) 1)))
   (with-input-from-file f
     (rec loop
       (lambda ()
         (define nxt (read))
         (unless (eof-object? nxt) (up nxt) (loop)))))
   ;; ---
   (with-output-to-file g
     (lambda ()
       (for ((i (in-range (- HIGH LOW))))
            (define v (vector-ref a i))
            (unless (= v 0) (printf F (+ 80 i) v))))
     #:exists 'truncate))

;; String -> Void
;; sort, then create association list via [Listof [List Nat Nat]]
(define (cnt-alst f g)
   (define l:in
     (with-input-from-file f
       (rec L
         (lambda ()
           (define nxt (read))
           (if (eof-object? nxt) '() (cons nxt (L)))))))
   (define l:st (sort l:in <))
   (define res
     (let L ([l (cdr l:st)][p (car l:st)][c 1])
       (if (null? l) '()
           (let ([a (car l)])
             (if (= a p)
                 (L (cdr l) p (+ c 1))
                 (cons (list p c) (L (cdr l) (car l) 1)))))))
   (out-al g res))

;; String String (-> Assoc) (Assoc Nat -> Assoc) (Assoc -> Void) -> Void
;; create association list on the fly
(define (cnt-fp f g nu up out)
   (out (with-input-from-file f
          (lambda ()
            (let L ([a (nu)])
              (define nxt (read))
              (if (eof-object? nxt) a (L (up a nxt))))))))

(define (cnt-AL f g)
   (cnt-fp f g
           (lambda () '())
           (lambda (al freq)
             (let L ((al al))
               (if (null? al)
                   (list (list freq 1))
                   (let* ([a (car al)]
                          [key (car a)])
                     (if (= key freq)
                         (cons (list key (+ (cadr a) 1)) (cdr al))
                         (cons a (L (cdr al))))))))
           (lambda (al) (out-al g al))))

(define (cnt-BST f g)
   (define-struct node (lft info count rgt))
   ;; A BST is one of:
   ;; -- '()
   ;; -- (make-node BST Frequency Nat BST)
   (cnt-fp f g
           (lambda () '())
           (lambda (a freq)
             (let L ([bst a])
               (if (null? bst)
                   (make-node '() freq 1 '())
                   (let* ([a (node-info bst)]
                          [lft (node-lft bst)]
                          [rgt (node-rgt bst)]
                          [cnt (node-count bst)])
                     (cond
                       [(< freq a) (make-node (L lft) a cnt rgt)]
                       [(= freq a) (make-node lft a (+ cnt 1) rgt)]
                       [else (make-node lft a cnt (L rgt))])))))
           (lambda (a)
             (out g
                  (lambda ()
                    (let L ((bst a))
                      (unless (null? bst)
                        (let* ([a (node-info bst)]
                               [lft (node-lft bst)]
                               [rgt (node-rgt bst)]
                               [cnt (node-count bst)])
                          (if (and (null? lft) (null? rgt))
                              (printf F a cnt)
                              (begin
                                (L lft)
                                (printf F a cnt)
                                (L rgt)))))))))))

(define (cnt-ht f g)
   (cnt-fp
    f g
    (lambda () #hash())
    (lambda (ht freq)
      (hash-update ht freq add1 0))
    (lambda (H)
      (out
       g (lambda () (hash-for-each H (lambda (k v) (printf "~s: ~s\n"  
k v))))))))

;;  
------------------------------------------------------------------------ 
-----
;; auxiliaries

;; String (-> Void) -> Void
;; create file f from th
(define (out f th) (with-output-to-file f th #:exists 'truncate))

;; String [Listof (List X Y)] -> Void
(define (out-al g res)
   (out g (lambda () (for-each (lambda (i)  (printf F (car i) (cadr  
i))) res))))

;;  
------------------------------------------------------------------------ 
-----
(let L ((i 1000)) (unless (> i SIZE) (experiment i) (L (* 10 i))))



Posted on the users mailing list.