#!/usr/bin/env pypy
# -*- coding: utf-8 -*-

'''
bbzip.py, a baby bzip-like compressor
by Charlie Loyd, 2012-01-11.

Usage:

bbzip3.py -c file file.bbz
bbzip3.py -u file.bbz file

This is just a simple, no-clever-stuff implementation of the text compressor
suggested in Burrows & Wheeler's famous 1994 paper,[1] which later research
proved to be quite well designed from the start. This implementation is
totally unrobust and doesn't do anything like breaking a file into blocks,
so it will run out of memory for midsized+ files. As of the last time I
updated this comment, it runs in *roughly* O(n*5) to O(n*10) time and space,
which is terrible but better than the O(n^2) of a naïve implementation.

Because I'm lazy, the only input we accept is files of UTF-8 strings.

I insist that you use pypy instead of stock CPython; it's *intensely* faster.

The code is not well commented. Some docs I found especially useful:

1. "A block-sorting lossless data compression algorithm", Burrows & Wheeler
   http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.html
   Brilliant but a bit too terse to implement well paragraph-by-paragraph.

2. "Burrows Wheeler Compression: Principles and Reflections", Fenwick
   http://www.cs.auckland.ac.nz/~peter-f/FTPfiles/BWreflections.pdf (preprint)
   Some illuminating remarks showing the ways research has advanced.

3. "Cache Friendly Burrows-Wheeler Inversion", Kärkkäinen & Puglisi
   http://www.cs.helsinki.fi/u/tpkarkka/publications/ccp2011.pdf
   Short and sweet, with a clear presentation of standard iBWT algorithms.

4. "Limitations of the compressed file format", Seward
   http://bzip.org/1.0.5/bzip2-manual-1.0.5.html#limits
   Seward points out his (relatively minor) mistakes in bzip2.

Todo:

* BUGs.

* FIXMEs.

* Comments! And general code clarity, especially better argument names.

* Generality, speed, and elegance of string<->binary<->file conversions.
  This includes handling binary input and doing i/o in bigger chunks.
  The i/o stuff in general shouldn't be such an afterthought. import struct?

* Clean up the binary utility functions.

* Clarity and elegance of the iBWT function. This is the most interesting
  work here from an algorithms & data structures point of view.

* Start to think about real optimization? Asymptotic complexity, but also
  e.g. all the array slices and splices chould probably be a lot cleaner.

* Use blocks of fixed length instead of a whole file at a time.

* A proper file header format with useful stuff. Be adaptive?

* Improve modularity so you can painlessly swap out, say, the MTF step.

* Improve performance on v. small files? Most general-purpose compressors are
  bad at this, but I think it might be possible if you try for it specifically.

'''

from math import log
from sys import argv, stderr


eof = -1 # an invalid codepoint as an end-of-string marker

# Rice code constant: must be an integer power of 2.
# Smaller is better for steeper local character distributions.
# Todo: test various corpuses
# Todo: consider expressing as log2 (4 as 2, 8 as 3...)
Rice_m = 4 # See BUG below in unrice.


# Tedious binary utility crap. "Binary" means lists of 0/1.
# FIXME: rename these and their arguments.

def pad(binary, length):
	return [0] * (length-len(binary)) + binary

def backpad(b, length):
	return b + [0] * (length-len(b))

def binary(n): # HURRR, I'M A FUNCTON
	return map(int, bin(n)[2:])

def unbinary(s):
	return int(''.join(map(str, s)), 2)

# MTF

def mtf(s):
	out = []
	uniq = [eof] + range(max(s)+1)
	
	pos = 0
	for char in s:
		pos += 1
		offset = uniq.index(char)
		out.append(offset)
		uniq.remove(char)
		uniq = [char] + uniq
	return out

def unmtf(s):
	out = []
	uniq = [eof] + range(max(s)+1)
	for code in s:
		out.append(uniq[code])
		uniq = [uniq[code]] + uniq[:code] + uniq[code+1:]
	
	return out

# RLE of 0

def rle0(s): # Look at this. May as well be fuckin' assembly.
	out = []
	rl = 0
	for c in s:
		if c > 0:
			if rl > 1:
				out += binary(rl+1)[1:]
				rl = 0
			elif rl == 1:
				out += [0]
				rl = 0
			out.append(c+1)
		else:
			rl += 1
	
	out += binary(rl+1)[1:]
	return out

def unrle0(s):
	out = []
	binary = []
	for c in s:
		if c > 1:
			if len(binary) > 0:
				out += [0] * (unbinary([1] + binary)-1)
				binary = []
			out.append(c-1)
		else:
			binary.append(c)
	if len(binary) > 0:
		out += [0] * (unbinary([1] + binary)-1)
	return out # FIXME: way too much "return out" up in this place


# Rice, m = 4

def rice(s):
	m = Rice_m
	l = int(log(m, 2))
	out = []
	for n in s:
		q = n/m
		r = n%m
		out += [1]*q + [0] + pad(binary(r), l)
	return out

def unrice(s):
	m = Rice_m
	l = int(log(m, 2)) # floor(log2 of m)
	out = [] # jesus
	offset = 0
	
	# BUG: something boring is happening here, probably with padding 0s
	# at the end of strings. This line seems to work for m=4 for inputs
	# I've tested, but I don't know why, and it breaks for some m != 4.
	while offset <= len(s)-l-4:
		q = 0
		while s[offset] == 1:
			q += 1
			offset += 1

		offset += 1 # skip the marker 0
		
		r = 0
		r = unbinary(s[offset:offset+l])
		offset += l
		
		out.append(q*m + r)

	return out

# BWT helpers for rotation comparison:

def seq_cycle_nth(seq, c, n):
	return seq[(c+n) % len(seq)]

def compare_cycles(seq, a, b):
	length = len(seq)
	
	for i in range(length):
		if i == length: return 0 # Hey, maybe the user's a dumbass. ¯\_(ツ)_/¯

		ac = seq_cycle_nth(seq, a, i)
		bc = seq_cycle_nth(seq, b, i)
		
		if ac < bc: return -1;
		if ac > bc: return 1;
		else: continue

def cmp_for(seq): # Lie back and think of Scheme.
	return lambda x, y: compare_cycles(seq, x, y)


# BWT

def bwt(seq):
	seq.append(eof)
	matrix = range(len(seq))
	matrix.sort(cmp_for(seq)) # f(ôô)f

	return [seq_cycle_nth(seq, i, -1) for i in matrix]

def unbwt(L):
	# This is a kind of dumbed-down implementation, I think lifted largely from
	# Wikipedia before I had a firm handle on what the slightly better methods
	# were really doing. FIXME with saws and hot tar.
	F = sorted(L)

	count = [0] * (max(L)+2)
	start = [-1] * (max(L)+2)
	out = [0] * len(L)
	links = [None] * len(L)
	
	for i in range(len(L)):
		si = L[i]
		links[i] = count[si]
		count[si] += 1
		si = F[i]
		if start[si] == -1:
			start[si] = i
	
	li = L.index(eof) # blithe addition of an unnecessary O(n) operation
	for i in range(len(L)):
		c = L[li]
		out[len(L)-i-1] = c
		si = c
		li = start[si] + links[li]
	
	return out[:-1] # the eof would confuse chr()


# File writing utilities ಠ_ಠ

def binwrite(f, data): # Ugly, tedious, and slow. Yes, like my mom.
	for byte in range(len(data)/8 + 1):
		start = byte*8
		end = min(start+8, len(data))
		chunk = data[start:end]
		chunk = backpad(chunk, 8)
		f.write(chr(unbinary(chunk)))
		# God, what am I doing with my life?

# Let's do this thing:

if __name__ == '__main__': # I hope you like dense blocks of boring code.
	if argv[1] == '-c':
		uncompressed = open(argv[2], 'r').read().decode('utf-8')
		outfile = open(argv[3], 'wb')
		uncompressed = map(ord, uncompressed)
		compressed = rice(rle0(mtf(bwt(uncompressed))))
		compressed = pad(compressed, len(compressed)%8 + 8)
		binwrite(outfile, compressed)
		outfile.close()
		
	elif argv[1] == '-u':
		data = open(argv[2], 'rb').read()
		compressed = []
		for byte in data:
			compressed += pad(binary(ord(byte)), 8) # Iterating over bytes again!
		outfile = open(argv[3], 'w')
		uncompressed = unbwt(unmtf(unrle0(unrice(compressed))))
		outfile.write(''.join(map(chr, uncompressed)).encode('utf-8'))
		
	else:
		print >> stderr, 'Usage: bbzip.py -{c,u} input output'