Lossy Text Compression

To compress text in a lossy fashion, we simply replace each word with the
shortest alternative word from a thesaurus.

Alice in Wonderland compresses from 164K to 157K (and is still just about readable)!


import textwrap
from nltk.corpus import wordnet as wn
import nltk
import sys

def compressWord(word):
	"""Return the shortest WordNet synonym of ``word``, or ``word`` itself.

	Every synset returned by ``wn.synsets(word)`` is inspected, but only
	synsets whose first lemma equals ``word`` contribute candidates. Among
	those candidates the first one that is strictly shorter than anything
	seen so far wins, so ``word`` is returned unchanged when no shorter
	synonym exists (or when WordNet has no entry for it).

	Args:
		word: The token to shorten.

	Returns:
		The shortest candidate string found, with underscores in lemma
		names replaced by spaces.
	"""
	shortest = word

	for syn in wn.synsets(word):
		# Lemma names join multi-word expressions with underscores.
		syns = [lemma.name().replace('_', ' ') for lemma in syn.lemmas()]

		# NOTE(review): the body of this check was missing in the original
		# listing; skipping synsets not headed by the word itself is the
		# presumed intent -- confirm.
		if not syns or syns[0] != word:
			continue

		for s in syns:
			# Strict comparison keeps the earliest candidate on ties.
			if len(s) < len(shortest):
				shortest = s
	return shortest

def compressFile(filename):
	"""Return the text of ``filename`` with every token run through compressWord.

	The text is split with an NLTK ``RegexpTokenizer`` whose pattern also
	matches single whitespace and punctuation characters, so those tokens
	are carried into the output alongside the words. Each token is replaced
	by the result of ``compressWord``; if that result is ``None`` the
	original token is kept.

	Args:
		filename: Path of the text file to read.

	Returns:
		The concatenation of all (possibly replaced) tokens as one string.
	"""
	# Context manager guarantees the file handle is closed after reading.
	with open(filename) as handle:
		text = handle.read()

	# Raw string so the regex escapes (\d, \w, \s, ...) are not treated as
	# (invalid) Python string escapes. The second argument is ``gaps=False``:
	# the pattern describes the tokens, not the separators.
	tokenizer = nltk.tokenize.RegexpTokenizer(r"(?:[A-Z][.])+|\d[\d,.:\-/\d]*\d|\w+[\w\-\'.&|@:/]*\w+|\s|.|,|'|\"", False)

	pieces = []
	for w in tokenizer.tokenize(text):
		c = compressWord(w)
		# Fall back to the original token when no replacement is available.
		pieces.append(w if c is None else c)

	# Join once instead of growing a string with repeated concatenation.
	return "".join(pieces)

if __name__ == "__main__":
	# Compress the file named as the first command-line argument, falling
	# back to "pg11.txt" when none is given. The guard keeps the script from
	# printing when the module is merely imported.
	print(compressFile(sys.argv[1] if len(sys.argv) > 1 else "pg11.txt"))

Latest Source code

Leave Comment

four + = 7

Error Please check your entries!