#!/usr/bin/python
# coding: utf-8
# Copyright © 2012 Jon Dowland
# Available under the terms of the GNU General Public License (GPL), version 2
#
# Reads one e-mail message on stdin whose Subject header has the form
#     <tweet URL>;<date>;<tweet text>
# and writes a new, header-only message to stdout with From, Date, Subject
# and X-URL derived from those three fields.  Exits with status 1 when the
# Subject is missing or does not have the expected shape.
#
# twitter username:
#   [A-Za-z0-9_]{1,15}
# url allowed chars:
#   https?://twitter.com/#!/jmtd/status/132076624555417600
#     => '!#/.10325476:acedihjmopsrutw'
#     ~> '!#/.0-9:a-zA-Z'
# date format:
#   November 03, 2011 at 02:48AM
# ';' is a suitable field delimiter, because
#   it can't occur in the username, URL or date fields
#   it doesn't need escaping in a regex
#   it's very unlikely to be encoded in QP

import sys
import email
import re
import datetime
from email.header import Header
from email.message import Message

user_re = r'[A-Za-z0-9_]{1,15}'
# Dots are escaped so that only the literal host "twitter.com" is accepted.
url_re = r'https?://twitter\.com/(#!/)?(%s)/status(es)?/\d+' % user_re
# url ; date ; text -- DOTALL because the text field may span several lines.
subject_re = re.compile(r'^([^;]+);([^;]+);(.*)$', re.DOTALL)


def _decode_subject(raw_subject):
    """Return the RFC 2047-decoded Subject as text, or None if it is absent.

    Chunks declared as utf-8 (or with no declared charset) are decoded as
    UTF-8, with undecodable bytes replaced rather than raising.  A chunk in
    any other charset is replaced by a single "?".
    """
    if raw_subject is None:
        return None
    pieces = []
    for payload, charset in email.header.decode_header(raw_subject):
        if charset not in ('utf-8', None):
            pieces.append(u'?')
        elif isinstance(payload, bytes):
            # Python 2 always lands here (str is bytes); Python 3 does for
            # encoded words and for mixed encoded/plain headers.
            pieces.append(payload.decode('utf-8', 'replace'))
        else:
            # Python 3 returns plain, unencoded headers as text already.
            pieces.append(payload)
    return u''.join(pieces)


def convert(raw):
    """Build the rewritten message from the raw text of an incoming one.

    raw is the complete source of an e-mail message.  Returns an
    email.message.Message carrying From, Date, Subject and X-URL headers,
    or None when the Subject is missing, is not of the form
    "url;date;text", the URL is not a twitter.com status URL, or the date
    cannot be parsed.
    """
    incoming = email.message_from_string(raw)
    subject = _decode_subject(incoming['Subject'])
    if subject is None:
        return None

    fields = subject_re.match(subject)
    if fields is None:
        return None
    url, date, text = fields.groups()

    tweet = re.match(url_re, url)
    if tweet is None:
        # Not a tweet URL: reject instead of failing on tweet.group().
        return None
    user = tweet.group(2)

    # Imported here so that input rejected above never needs the
    # third-party dateutil package.
    from dateutil import parser

    # weirdly, they seem to return times 10 hours behind GMT (and don't
    # declare TZ), so 02:48AM above was really 12:48PM
    try:
        stamp = parser.parse(date) + datetime.timedelta(hours=10)
    except (ValueError, OverflowError):
        return None

    outgoing = Message()
    outgoing['From'] = '"%s" <>' % user
    outgoing['Date'] = stamp.strftime("%a, %d %b %Y %H:%M:%S +0000")
    outgoing['Subject'] = text
    outgoing['X-URL'] = url
    return outgoing


def main():
    """Convert the message on stdin; return the process exit status."""
    outgoing = convert(sys.stdin.read())
    if outgoing is None:
        return 1
    sys.stdout.write(str(outgoing) + "\n")
    return 0


if __name__ == '__main__':
    sys.exit(main())