;; Mike Burns 2004-08-22 mike@mike-burns.com ;; Mike Burns 2005-12-10 mike@mike-burns.com ;; Copyright 2004, 2005 Mike Burns ;; Parse emails #lang scheme (require ;mzscheme mzlib/contract ;(lib "contract.ss") mzlib/etc ; (lib "etc.ss") mzlib/pregexp ;(lib "pregexp.ss") ;(lib "list.ss") net/mime ; (lib "mime.ss" "net") ) ;; An email is a (make-email Assoc (listof (listof String))) ;; The headers currently do not include the MIME info, due to net/mime.ss . ;; The messages are all the messages in the email. If there are no ;; attachments, then there is only one message in the list. A message is a list ;; of string, each string representing one line. (define-struct email (headers messages) #:inspector #f) ;(make-inspector)) (define-struct (exn:malformed-email exn) () #:inspector #f) ;(make-inspector)) (provide/contract (struct email ((headers (listof (cons/c symbol? string?))) (messages (listof (listof string?))))) (parse-emails (() (input-port?) . opt-> . (listof email?))) (parse-email (() (input-port?) . opt-> . email?))) (provide (struct-out exn:malformed-email)) ;; Parse a stream of emails. Emails are separated by lines that begin with ;; "From " (note the lack of ":"). (define parse-emails ;(opt- (lambda ((ip (current-input-port))) (let loop ((line (peek-line ip))) (cond ((eof-object? line) '()) ((new-email? line) (let* ((parsed (parse-email (open-input-string (get-first-email ip)))) (parsed-rest (loop (peek-line ip)))) (cons parsed parsed-rest))) (else (raise (make-exn:malformed-email (string->immutable-string (format "~a: ~a" "Expected a \"From ...\", got" line)) (current-continuation-marks)))))))) ;; Parse an email. It either does or does not have an attachment. (define parse-email ;(opt- (lambda ((ip (current-input-port))) (let ((analysis (mime-analyze ip))) (if (multi-message? analysis) (parse-email-multi analysis) (parse-email-single analysis))))) ;; Parse an email with an attachment (define/contract parse-email-multi (message? . -> . email?) (lambda (analysis) (make-email (message-fields->assoc (message-fields analysis)) (map message->body (entity-parts (message-entity analysis)))))) ;; Parse an email with no attachment (define/contract parse-email-single (message? . -> . email?) (lambda (analysis) (make-email (message-fields->assoc (message-fields analysis)) (list (message->body analysis))))) (define (multi-message? analysis) (symbol=? (entity-type (message-entity analysis)) 'multipart)) ;; Produces a list of strings, each string representing a line in the email, ;; from a message. (define (message->body message) (entity-body->body (entity-body (message-entity message)))) ;; Uses the entity-body procedure to produce a list of strings, each string ;; representing a line in the message. (define (entity-body->body body) (let ((o (open-output-string))) ;; Print to o (body o) ;; cdr because of a leading newline (cdr (string->los (get-output-string o))))) ;; Break a string with embedded newlines into a list of strings, each string ;; representing one line. (define (string->los s) (pregexp-split "\n" s)) ;; Show the next line, without consuming anything. (define/contract peek-line (() (input-port?) . opt-> . (union eof-object? string?)) ;(opt- (lambda ((ip (current-input-port))) (let loop ((acc "") (c (peek-char ip)) (col 1)) (cond ((eof-object? c) (if (string=? acc "") c acc)) ((char=? c #\newline) acc) (else (loop (string-append acc (string c)) (peek-char ip col) (+ col 1))))))) ;; Is this line the start of a new email? (define (new-email? line) (pregexp-match "^(?mi:From|To|Envelope.*|Received|Return-Path|Date|Subject|Content\\-.*|MIME-Version|Forwarded|Message.*|From\\s)" line)) ;; Consume a list of strings of colon-separated values, and produce an assoc ;; of string, which are those values. (define/contract message-fields->assoc ((listof string?) . -> . (listof (cons/c symbol? string?))) (lambda (fields) (filter ;; Get rid of '()s pair? (map (lambda (field) (let ((n-v (pregexp-match "^(\\S+): *(.*)" field))) (if n-v (cons (string->symbol (cadr n-v)) (apply string-append (cddr n-v))) '()))) fields)))) ;; Produce the first email in a stream of emails. (define/contract get-first-email (input-port? . -> . string?) (lambda (ip) (let loop ((acc "") (line (peek-line ip)) (seen-first #f)) (cond ((eof-object? line) acc) ((new-email? line) (if seen-first acc (let* ((a (read-line ip)) (l (peek-line ip))) (loop a l #t)))) (else (let ((a (read-line ip)) (l (peek-line ip))) (loop (format "~a~n~a" acc a) l seen-first)))))))