#!/usr/bin/env pypy # -*- coding: utf-8 -*- ''' bbzip.py, a baby bzip-like compressor by Charlie Loyd, 2012-01-11. Usage: bbzip3.py -c file file.bbz bbzip3.py -u file.bbz file This is just a simple, no-clever-stuff implementation of the text compressor suggested in Burrows & Wheeler's famous 1994 paper,[1] which later research proved to be quite well designed from the start. This implementation is totally unrobust and doesn't do anything like breaking a file into blocks, so it will run out of memory for midsized+ files. As of the last time I updated this comment, it runs in *roughly* O(n*5) to O(n*10) time and space, which is terrible but better than the O(n^2) of a naïve implementation. Because I'm lazy, the only input we accept is files of UTF-8 strings. I insist that you use pypy instead of stock CPython; it's *intensely* faster. The code is not well commented. Some docs I found especially useful: 1. "A block-sorting lossless data compression algorithm", Burrows & Wheeler http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.html Brilliant but a bit too terse to implement well paragraph-by-paragraph. 2. "Burrows Wheeler Compression: Principles and Reflections", Fenwick http://www.cs.auckland.ac.nz/~peter-f/FTPfiles/BWreflections.pdf (preprint) Some illuminating remarks showing the ways research has advanced. 3. "Cache Friendly Burrows-Wheeler Inversion", Kärkkäinen & Puglisi http://www.cs.helsinki.fi/u/tpkarkka/publications/ccp2011.pdf Short and sweet, with a clear presentation of standard iBWT algorithms. 4. "Limitations of the compressed file format", Seward http://bzip.org/1.0.5/bzip2-manual-1.0.5.html#limits Seward points out his (relatively minor) mistakes in bzip2. Todo: * BUGs. * FIXMEs. * Comments! And general code clarity, especially better argument names. * Generality, speed, and elegance of string<->binary<->file conversions. This includes handling binary input and doing i/o in bigger chunks. The i/o stuff in general shouldn't be such an afterthought. import struct? * Clean up the binary utility functions. * Clarity and elegance of the iBWT function. This is the most interesting work here from an algorithms & data structures point of view. * Start to think about real optimization? Asymptotic complexity, but also e.g. all the array slices and splices chould probably be a lot cleaner. * Use blocks of fixed length instead of a whole file at a time. * A proper file header format with useful stuff. Be adaptive? * Improve modularity so you can painlessly swap out, say, the MTF step. * Improve performance on v. small files? Most general-purpose compressors are bad at this, but I think it might be possible if you try for it specifically. ''' from math import log from sys import argv, stderr eof = -1 # an invalid codepoint as an end-of-string marker # Rice code constant: must be an integer power of 2. # Smaller is better for steeper local character distributions. # Todo: test various corpuses # Todo: consider expressing as log2 (4 as 2, 8 as 3...) Rice_m = 4 # See BUG below in unrice. # Tedious binary utility crap. "Binary" means lists of 0/1. # FIXME: rename these and their arguments. def pad(binary, length): return [0] * (length-len(binary)) + binary def backpad(b, length): return b + [0] * (length-len(b)) def binary(n): # HURRR, I'M A FUNCTON return map(int, bin(n)[2:]) def unbinary(s): return int(''.join(map(str, s)), 2) # MTF def mtf(s): out = [] uniq = [eof] + range(max(s)+1) pos = 0 for char in s: pos += 1 offset = uniq.index(char) out.append(offset) uniq.remove(char) uniq = [char] + uniq return out def unmtf(s): out = [] uniq = [eof] + range(max(s)+1) for code in s: out.append(uniq[code]) uniq = [uniq[code]] + uniq[:code] + uniq[code+1:] return out # RLE of 0 def rle0(s): # Look at this. May as well be fuckin' assembly. out = [] rl = 0 for c in s: if c > 0: if rl > 1: out += binary(rl+1)[1:] rl = 0 elif rl == 1: out += [0] rl = 0 out.append(c+1) else: rl += 1 out += binary(rl+1)[1:] return out def unrle0(s): out = [] binary = [] for c in s: if c > 1: if len(binary) > 0: out += [0] * (unbinary([1] + binary)-1) binary = [] out.append(c-1) else: binary.append(c) if len(binary) > 0: out += [0] * (unbinary([1] + binary)-1) return out # FIXME: way too much "return out" up in this place # Rice, m = 4 def rice(s): m = Rice_m l = int(log(m, 2)) out = [] for n in s: q = n/m r = n%m out += [1]*q + [0] + pad(binary(r), l) return out def unrice(s): m = Rice_m l = int(log(m, 2)) # floor(log2 of m) out = [] # jesus offset = 0 # BUG: something boring is happening here, probably with padding 0s # at the end of strings. This line seems to work for m=4 for inputs # I've tested, but I don't know why, and it breaks for some m != 4. while offset <= len(s)-l-4: q = 0 while s[offset] == 1: q += 1 offset += 1 offset += 1 # skip the marker 0 r = 0 r = unbinary(s[offset:offset+l]) offset += l out.append(q*m + r) return out # BWT helpers for rotation comparison: def seq_cycle_nth(seq, c, n): return seq[(c+n) % len(seq)] def compare_cycles(seq, a, b): length = len(seq) for i in range(length): if i == length: return 0 # Hey, maybe the user's a dumbass. ¯\_(ツ)_/¯ ac = seq_cycle_nth(seq, a, i) bc = seq_cycle_nth(seq, b, i) if ac < bc: return -1; if ac > bc: return 1; else: continue def cmp_for(seq): # Lie back and think of Scheme. return lambda x, y: compare_cycles(seq, x, y) # BWT def bwt(seq): seq.append(eof) matrix = range(len(seq)) matrix.sort(cmp_for(seq)) # f(ôô)f return [seq_cycle_nth(seq, i, -1) for i in matrix] def unbwt(L): # This is a kind of dumbed-down implementation, I think lifted largely from # Wikipedia before I had a firm handle on what the slightly better methods # were really doing. FIXME with saws and hot tar. F = sorted(L) count = [0] * (max(L)+2) start = [-1] * (max(L)+2) out = [0] * len(L) links = [None] * len(L) for i in range(len(L)): si = L[i] links[i] = count[si] count[si] += 1 si = F[i] if start[si] == -1: start[si] = i li = L.index(eof) # blithe addition of an unnecessary O(n) operation for i in range(len(L)): c = L[li] out[len(L)-i-1] = c si = c li = start[si] + links[li] return out[:-1] # the eof would confuse chr() # File writing utilities ಠ_ಠ def binwrite(f, data): # Ugly, tedious, and slow. Yes, like my mom. for byte in range(len(data)/8 + 1): start = byte*8 end = min(start+8, len(data)) chunk = data[start:end] chunk = backpad(chunk, 8) f.write(chr(unbinary(chunk))) # God, what am I doing with my life? # Let's do this thing: if __name__ == '__main__': # I hope you like dense blocks of boring code. if argv[1] == '-c': uncompressed = open(argv[2], 'r').read().decode('utf-8') outfile = open(argv[3], 'wb') uncompressed = map(ord, uncompressed) compressed = rice(rle0(mtf(bwt(uncompressed)))) compressed = pad(compressed, len(compressed)%8 + 8) binwrite(outfile, compressed) outfile.close() elif argv[1] == '-u': data = open(argv[2], 'rb').read() compressed = [] for byte in data: compressed += pad(binary(ord(byte)), 8) # Iterating over bytes again! outfile = open(argv[3], 'w') uncompressed = unbwt(unmtf(unrle0(unrice(compressed)))) outfile.write(''.join(map(chr, uncompressed)).encode('utf-8')) else: print >> stderr, 'Usage: bbzip.py -{c,u} input output'