;;; bit-io.scm ; This file consists of a PLT port of Oleg's bit-reader ; ; Jens Axel Søgaard is to blame for the bit-writer. (module bit-io mzscheme (provide make-bit-reader make-bit-writer) ; Binary parsing ;---------------------------------------- ; Apologia ; ; Binary parsing and unparsing are transformations between primitive or ; composite Scheme values and their external binary representations. ; ; Examples include reading and writing JPEG, TIFF, MP3, ELF file ; formats, communicating with DNS, Kerberos, LDAP, SLP internet ; services, participating in Sun RPC and CORBA/IIOP distributed systems, ; storing and retrieving (arrays of) floating-point numbers in a ; portable and efficient way. This project will propose a set of low- and ; intermediate- level procedures that make binary parsing possible. ; Scheme is a good language to do research in text compression. Text ; compression involves a great deal of building and traversing ; dictionaries, trees and similar data structures, where Scheme ; excels. Performance doesn't matter in research, but the size of ; compressed files does (to figure out the bpc for the common ; benchmarks). Variable-bit i/o is a necessity. It is implemented ; in the present file. ; ASN.1 corresponds to a higher-level parsing (LR parser ; vs. lexer). Information in LDAP responses and X.509 certificates is ; structural and recursive, and so lends itself to be processed in ; Scheme. Variable bit i/o is necessary, and so is a binary lexer for ; a LR parser. Parsing of ASN.1 is a highly profitable enterprise ;---------------------------------------- ; The outline of the project ; ; Primitives and streams ; ; - read-byte ; - read-u8vector (cf. read-string) ; - with-input-from-u8vector, with-input-from-encoded-u8vector 'base64,... ; building binary i/o streams from a sequence of bytes. Streams over ; u8vector, u16vector, etc. provide a serial access to memory. See SRFI-4 ; ; - read-bit, read-bits via overlayed streams given read-byte ; implemented in the present file. ; ; - mmap-u8vector, munmap-u8vector ; ; Conversions ; - u8vector->integer u8vector endianness, ; u8vector->sinteger u8vector endianness ; These conversion procedures turn a sequence of bytes to an unsigned or ; signed integer, minding the byte order. The u8vector in question can ; have size 1,2,4,8, 3 etc. bytes. These two functions therefore can be ; used to read shorts, longs, extra longs, etc. numbers. ; - u8vector-reverse and other useful u8vector operations ; ; - modf, frexp, ldexp ; The above primitives can be emulated in R5RS, yet they are quite handy ; (for portable FP manipulation) and can be executed very efficiently by ; an FPU. ; ; Higher-level parsing and combinators ; These are combinators that can compose primitives above for more ; complex (possibly iterative) actions. ; ; - skip-bits, next-u8token,... ; - IIOP, RPC/XDR, RMI ; - binary lexer for existing LR/LL-parsers ; ; The composition of primitives and combinators will represent binary ; parsing language in a _full_ notation. This is similar to XPath ; expressions in full notation. Later we need to find out the ; most-frequently used patterns of the binary parsing language and ; design an abbreviated notation. The latter will need a special ; "interpreter". The abbreviated notation may turn out to look like ; Olin's regular expressions. ;;======================================================================== ;; Configuration section ;; ; Performance is very important for binary parsing. We have to get all ; help from a particular Scheme system we can get. If a Scheme function ; can support the following primitives faster, we should take ; advantage of that fact. ;; Configuration for PLT (define-syntax << (syntax-rules () [(_ x n) (arithmetic-shift x n)])) (define-syntax >> (syntax-rules () [(_ x n) (arithmetic-shift x (- n))])) (define-syntax <<1 (syntax-rules () [(_ x) (arithmetic-shift x 1)])) (define-syntax >>1 (syntax-rules () [(_ x) (arithmetic-shift x -1)])) (define-syntax bit-set? (syntax-rules () [(_ x mask) (not (zero? (bitwise-and x mask)))])) ;; End configuration for PLT ; combine bytes in the MSB order. A byte may be #f (define (combine-two b1 b2) ; The result is for sure a fixnum (and b1 b2 (bitwise-ior (<< b1 8) b2))) (define (combine-three b1 b2 b3) ; The result is for sure a fixnum (and b1 b2 b3 (bitwise-ior (<< (bitwise-ior (<< b1 8) b2) 8) b3))) ; Here the result may be a BIGNUM (define (combine-bytes . bytes) (cond ((null? bytes) 0) ((not (car bytes)) #f) (else (let loop ((bytes (cdr bytes)) (result (car bytes))) (cond ((null? bytes) result) ((not (car bytes)) #f) (else (loop (cdr bytes) (+ (car bytes) (* 256 result))))))))) ;======================================================================== ; Reading a byte ; Read-byte is a fundamental primitive; it needs to be ; added to the standard. Most of the other functions are library ; procedures. The following is an approximation, which clearly doesn't ; hold if the port is a Unicode (especially UTF-8) character stream. ; The mzscheme read-byte is used. ; Return a byte as an exact integer [0,255], or the EOF object #;(define (read-byte port) (let ((c (read-char port))) (if (eof-object? c) c (char->integer c)))) ; The same as above, but returns #f on EOF. (define (read-byte-f port) (let ([b (read-byte port)]) (and (not (eof-object? b)) b))) ;======================================================================== ; Bit stream ; -- Function: make-bit-reader BYTE-READER ; Given a BYTE-READER (a thunk), construct and return a function ; bit-reader N ; ; that reads N bits from a byte-stream represented by the BYTE-READER. ; The BYTE-READER is a function that takes no arguments and returns ; the current byte as an exact integer [0-255]. The byte reader ; should return #f on EOF. ; The bit reader returns N bits as an exact unsigned integer, ; 0 -... (no limit). N must be a positive integer, otherwise the bit reader ; returns #f. There is no upper limit on N -- other than the size of the ; input stream itself and the amount of (virtual) memory an OS is willing ; to give to your process. If you want to read 1M of _bits_, go ahead. ; ; It is assumed that the bit order is the most-significant bit first. ; ; Note the bit reader keeps the following condition true at all times: ; (= current-inport-pos (ceiling (/ no-bits-read 8))) ; That is, no byte is read until the very moment we really need (some of) ; its bits. The bit reader does _not_ "byte read ahead". ; Therefore, it can be used to handle a concatenation of different ; bit/byte streams *STRICTLY* sequentially, _without_ 'backing up a char', ; 'unreading-char' etc. tricks. ; For example, make-bit-reader has been used to read GRIB files of ; meteorological data, which made of several bitstreams with headers and ; tags. ; Thus careful attention to byte-buffering and optimization are the ; features of this bit reader. ; ; Usage example: ; (define bit-reader (make-bit-reader (lambda () #b11000101))) ; (bit-reader 3) ==> 6 ; (bit-reader 4) ==> 2 ; The test driver below is another example. ; ; Notes on the algorithm. ; The function recognizes and handles the following special cases: ; - the buffer is empty and 8, 16, 24 bits are to be read ; - reading all bits which are currently in the byte-buffer ; (and then maybe more) ; - reading only one bit ; Since the bit reader is going to be called many times, optimization is ; critical. We need all the help from the compiler/interpreter ; we can get. (define (make-bit-reader byte-reader) (let ((buffer 0) (mask 0) ; mask = 128 means that the buffer is full and ; the msb bit is the current (yet unread) bit (bits-in-buffer 0)) ; read the byte into the buffer and set up the counters. ; return #f on eof (define (set-buffer) (set! buffer (byte-reader)) (and buffer (begin (set! bits-in-buffer 8) (set! mask 128) #t))) ; Read fewer bits than there are in the buffer (define (read-few-bits n) (let ((value (bitwise-and buffer ; all bits in buffer (sub1 (<<1 mask))))) (set! bits-in-buffer (- bits-in-buffer n)) (set! mask (>> mask n)) (>> value bits-in-buffer))) ; remove extra bits ; read n bits given an empty buffer, and append them to value, n>=8 (define (add-more-bits value n) (let loop ((value value) (n n)) (cond ((zero? n) value) ((< n 8) (let ((rest (read-n-bits n))) (and rest (+ (* value (<< 1 n)) rest)))) (else (let ((b (byte-reader))) (and b (loop (+ (* value 256) b) (- n 8)))))))) ; The main module (define (read-n-bits n) ; Check the most common cases first (cond ((not (positive? n)) #f) ((zero? bits-in-buffer) ; the bit-buffer is empty (case n ((8) (byte-reader)) ((16) (let ((b (byte-reader))) (combine-two b (byte-reader)))) ((24) (let* ((b1 (byte-reader)) (b2 (byte-reader))) (combine-three b1 b2 (byte-reader)))) (else (cond ((< n 8) (and (set-buffer) (read-few-bits n))) ((< n 16) (let ((b (byte-reader))) (and (set-buffer) (bitwise-ior (<< b (- n 8)) (read-few-bits (- n 8)))))) (else (let ((b (byte-reader))) (and b (add-more-bits b (- n 8))))))))) ((= n 1) ; read one bit (let ((value (if (bit-set? buffer mask) 1 0))) (set! mask (>>1 mask)) (set! bits-in-buffer (sub1 bits-in-buffer)) value)) ((>= n bits-in-buffer) ; will empty the buffer (let ((n-rem (- n bits-in-buffer)) (value (bitwise-and buffer ; for mask=64, it'll be &63 (sub1 (<<1 mask))))) (set! bits-in-buffer 0) (cond ((zero? n-rem) value) ((<= n-rem 16) (let ((rest (read-n-bits n-rem))) (and rest (bitwise-ior (<< value n-rem) rest)))) (else (add-more-bits value n-rem))))) (else (read-few-bits n)))) read-n-bits) ) ; -- Function: make-bit-writer BYTE-WRITER ; Given a BYTE-WRITER (function of one argument), construct and return a function ; bit-writer N B ; ; that writes N bits represented by the integer B to a byte-stream represented ; by the BYTE-WRITER. ; The BYTE-WRITER is a function that takes one argument and writes ; the given byte as an exact integer [0-255]. ; It is assumed that the bit order is the most-significant bit first. ; ; Note the bit writer will output bytes as soon as possible. That is ; the maximum number of waiting bits are 7. Call bit-writer with a ; non-number as argument to flush the remainin bits. (define (make-bit-writer byte-writer) (let ((buffer 0) (bits-in-buffer 0)) (define (empty-buffer!) (set! buffer 0) (set! bits-in-buffer 0)) (define (extend-buffer! n b) (set! buffer (bitwise-ior (<< buffer n) b)) (set! bits-in-buffer (+ bits-in-buffer n))) (define (set-buffer! n b) (set! buffer b) (set! bits-in-buffer n)) (define (write-n-bits n b) (cond ((not (number? n)) ; flush the buffer append 0-bits (byte-writer (<< buffer (- 8 bits-in-buffer)))) ((not (positive? n)) #f) ((zero? bits-in-buffer) ; the bit-buffer is empty (case n ((8) (byte-writer b)) ((16) (byte-writer (>> (bitwise-and #b1111111100000000 b) 8)) (byte-writer (bitwise-and #b11111111 b))) (else (let ([r (remainder n 8)]) (cond ((zero? r) (for-each byte-writer (let loop ([n n] [b b] [l '()]) (if (= n 0) l (loop (- n 8) (>> b 8) (cons (bitwise-and #b11111111 b) l)))))) ((< n 8) (set-buffer! n b)) ((< n 16) (byte-writer (>> b (- n 8))) (set-buffer! (- n 8) (bitwise-and b (>> #b11111111 (- 16 n))))) (else (let ([bits-to-buffer (remainder n 8)]) ; output all whole bytes, and buffer the rest (write-n-bits (- n bits-to-buffer) (>> b bits-to-buffer)) (set-buffer! bits-to-buffer (bitwise-and b (vector-ref #(0 1 3 7 15 31 63 127 255) r)))))))))) ((< n (- 8 bits-in-buffer)) ; everything goes to the buffer (extend-buffer! n b)) (else (let ([m (- 8 bits-in-buffer)]) ; the buffer and the initial bits make a byte (flush-output (current-error-port)) (byte-writer (bitwise-ior (<< buffer m) (>> b (- n m)))) (empty-buffer!) ; write the rest (write-n-bits (- n m) (bitwise-xor b (<< (>> b (- n m)) (- n m)))))))) write-n-bits)) ;;; TEST (define (test) (define (naturals n) (do ([i 0 (+ i 1)] [l '() (cons i l)]) [(= i n) l])) ; write the numbers 999 ... 1 to "tmp" and read them again (with-output-to-file "tmp" (lambda () (let ([w (make-bit-writer write-byte)]) (for-each (lambda (n) (w n n)) (naturals 1000)) (w 'flush 0))) 'replace) (with-input-from-file "tmp" (lambda () (let ([r (make-bit-reader read-byte)]) (for-each (lambda (n) (display (r n)) (display " ")) (naturals 1000)))))) )