/* PAQ3 - File archiver and compressor.
(C) 2003, Matt Mahoney, mmahoney@cs.fit.edu
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License version 2 as
published by the Free Software Foundation at
http://www.gnu.org/licenses/gpl.txt or (at your option) any later version.
This program is distributed without any warranty.
USAGE
To compress: PAQ3 archive file file... (1 or more file names), or
or (MSDOS): dir/b | PAQ3 archive (read file names from input)
or (UNIX): ls | PAQ3 archive
To decompress: PAQ3 archive
To list contents: more < archive
Compression: The files listed are compressed and stored in the archive,
which is created. The archive must not already exist. File names may
specify a path, which is stored. If there are no file names on the command
line, then PAQ3 prompts for them, reading until the first blank line or
end of file.
Decompression: No file names are specified. The archive must exist.
If a path is stored, the file is extracted to the appropriate directory,
which must exist. PAQ3 does not create directories. If the file to be
extracted already exists, it is not replaced; rather it is compared with
the archived file, and the offset of the first difference is reported.
It is not possible to add, remove, or update files in an existing archive.
If you want to do this, extract the files, delete the archive, and
create a new archive with just the files you want.
PAQ3 is an improved version of PAQ1SSE/PAQ2 by Serge Osnach, who added
SSE to my PAQ1 (both available at http://cs.fit.edu/~mmahoney/compression/ ).
PAQ3 uses an improved SSE implementation, and adds update exclusion to
the NonstationaryPPM and WordModel models.
*/
#include
#include
#include
#include
#include
#include
#include
#include
#include
using namespace std;
const int MEM=6; // 0-8 = 1M, 2M ... 256M memory model. 6 = 64M
// 8-32 bit unsigned types, adjust as appropriate
typedef unsigned char U8;
typedef unsigned short U16;
typedef unsigned long U32;
class U24 { // 24-bit unsigned int
U8 b0, b1, b2; // Low, mid, high byte
public:
explicit U24(int x=0): b0(x), b1(x>>8), b2(x>>16) {}
operator int() const {return (((b2<<8)|b1)<<8)|b0;}
};
// 32-bit random number generator based on r(i) = r(i-24) ^ r(i-55)
class Random {
U32 table[55]; // Last 55 random values
int i; // Index of current random value in table
public:
Random();
U32 operator()() { // Return 32-bit random number
if (++i==55) i=0;
if (i>=24) return table[i]^=table[i-24];
else return table[i]^=table[i+31];
}
} rnd;
Random::Random(): i(0) {
for (int j=0; j<55; ++j)
table[j]=314159265*j;
for (int j=0; j<10000; ++j)
operator()();
}
/* Model interface. A Predictor is made up of a collection of various
models, whose outputs are summed to yield a prediction. Methods:
Model.predict(int& n0, int& n1) - Adds to counts n0 and n1 such that
it predicts the next bit will be a 1 with probability n1/(n0+n1)
and confidence n0+n1.
Model.update(int y) - Appends bit y (0 or 1) to the model.
*/
class Model {
public:
virtual void predict(int& n0, int& n1) const = 0;
virtual void update(int y) = 0;
virtual ~Model() {}
};
/* Hash table element base class. It contains an 8-bit checksum to
detect collisions, and a priority() method which is used to control
replacement when full by replacing the element with the lowest priority
(0 = unused). The derived class should supply the data
to be stored and override priority(). */
class HashElement {
U8 ch; // Checksum
public:
HashElement(int c=0): ch(c) {} // Initialize the checksum, 0 = unused
int checksum() const {return ch;} // Collision if not matched
int priority() const {return ch!=0;} // Override: lowest replaced first
};
/* 3 byte counter, shown for reference only. It implements a
nonstationary pair of counters of 0s and 1s such that preference is
given to recent history by discarding old bits. */
class Counter3: public HashElement {
U8 n[2]; // n[y] is the counts of ys (0s or 1s)
public:
Counter3(int c=0): HashElement(c) {n[0]=n[1]=0;}
int get0() const {return n[0];} // Return count of 0s
int get1() const {return n[1];} // Return count of 1s
int priority() const {return get0()+get1();} // For hash replacement
void add(int y) { // Add y (0 or 1) to n[y] and age the opposite count
if (n[y]<255) ++n[y];
if (n[1-y]>2) (n[1-y]/=2)+=1;
}
};
/* Approximately equivalent 2 byte counter implementing the above.
The representable counts (n0, n1) are 0-10, 12, 14, 16, 20, 24, 28,
32, 48, 64, 128, 256, 512. Both counts are represented by a single
8-bit state. Counts larger than 10 are incremented probabilistically.
Although it uses 1/3 less memory, it is 8% slower and gives 0.05% worse
compression than the 3 byte counter. */
class Counter: public HashElement {
U8 state;
struct E { // State table entry
U16 n0, n1; // Counts represented by state
U8 s00, s01; // Next state on input 0 without/with probabilistic incr.
U8 s10, s11; // Next state on input 1
U32 p0, p1; // Probability of increment x 2^32-1 on inputs 0, 1
};
static E table[244]; // State table
public:
Counter(int c=0): HashElement(c), state(0) {}
int get0() const {return table[state].n0;}
int get1() const {return table[state].n1;}
int priority() const {return state;}
void add(int y) {
if (y) {
if (state<94 || rnd() is a hash table of 2^N elements of type T
(derived from HashElement) with linear search
of M elements in case of collision. If all elements collide,
then the one with the lowest .priority() is replaced.
Hashtable[i] returns a T& indexed by the lower bits of i whose
checksum matches the upper bits of i, creating or replacing if needed.
*/
template
class Hashtable {
private:
T* table; // Array of 2^N+M elements
Hashtable(const Hashtable&); // No copy
Hashtable& operator=(const Hashtable&); // No assignment
public:
Hashtable(): table(new Counter[(1<
inline T& Hashtable::operator[](U32 i) {
const U32 lb=i&((1<>24;
int bj=lb;
int bget=1000000000;
for (U32 j=lb; j counter0; // Counters for context lengths 0 and 1
vector counter1;
Hashtable counter2; // for lengths 2 to N-1
Counter *cp[N]; // Pointers to current counters
U32 hash[N]; // Hashes of last 0 to N-1 bytes
public:
inline void predict(int& c0, int& c1) const; // Add to counts of 0s and 1s
inline void update(int y); // Append bit y (0 or 1) to model
NonstationaryPPM();
int getc0() const {return c0;}
int getc1() const {return c1;}
int getc2() const {return c2;}
};
NonstationaryPPM::NonstationaryPPM(): c0(1), c1(0), c2(0), cn(1),
counter0(256), counter1(65536) {
for (int i=0; iget0()*wt;
n1+=cp[i]->get1()*wt;
}
}
// Add bit y (0 or 1) to model
void NonstationaryPPM::update(int y) {
// Count y by context
for (int i=N-1; i>=0; --i) {
cp[i]->add(y);
if ((cp[i]->get0()+cp[i]->get1())*i > 100) { // Update exclusion
cp[i]->add(y);
break;
}
}
// Store bit y
cn+=cn+y;
if (cn>=53) cn-=53;
c0+=c0+y;
if (c0>=256) { // Start new byte
for (int i=N-1; i>0; --i)
hash[i]=(hash[i-1]+c0)*987660757;
c2 = c1;
c1=c0-256;
c0=1;
cn=1;
}
// Set up pointers to next counters
cp[0]=&counter0[c0];
cp[1]=&counter1[c0+(c1<<8)];
for (int i=2; i buf; // Input buffer, wraps at end
vector ptr; // Hash table of pointers
U32 hash; // Hash of current context up to pos-1
int pos; // Element of buf where next bit will be stored
int bpos; // Number of bits (0-7) stored at buf[pos]
int begin; // Points to first matching byte (does not wrap)
int end; // Points to last matching byte + 1, 0 if no match
public:
MatchModel(): buf(0x10000*(1<1000)
wt=3000000;
else
wt*=wt*3;
if ((buf[end]>>(7-bpos))&1)
n1+=wt;
else
n0+=wt;
}
}
// Append bit y to buf and check that it matches at the end.
// After a byte is completed, compute a new hash and store it.
// If there is no current match, search for one.
void update(int y) {
(buf[pos]<<=1)+=y; // Store bit
++bpos;
if (end && (buf[end]>>(8-bpos))!=buf[pos]) // Does it match?
begin=end=0; // no
if (bpos==8) { // New byte
bpos=0;
hash=hash*(16*123456791)+buf[pos]+1;
if (++pos==int(buf.size()))
pos=0;
if (end)
++end;
else { // If no match, search for one
U32 h=(hash^(hash>>16))&(ptr.size()-1);
end=ptr[h];
if (end>0) {
begin=end;
int p=pos;
while (begin>0 && p>0 && begin!=p+1 && buf[begin-1]==buf[p-1]) {
--begin;
--p;
}
}
if (end==begin) // No match found
begin=end=0;
ptr[h]=U24(pos);
}
}
}
};
/* A WordModel predicts in the context of whole words (a-z, case
insensitive) using a trigram model. */
class WordModel {
private:
U32 word1, word0; // Hash of previous and current word
int ww1, ww0; // Word weights (lengths)
Hashtable t1; // Model
int c1; // Previous char, lower case
int c0; // 0-7 bits of current char with leading 1 bit
int c0h; // Small hash of c0
Counter *cp0, *cp1; // Points into t1 current context
int lettercount; // For detecting English
enum {THRESHOLD=5}; // lettercount threshold for English
public:
WordModel(): word1(0), word0(0), ww1(0), ww0(0),
c1(0), c0(1), c0h(1), cp0(&t1[0]), cp1(&t1[0]), lettercount(0) {}
void predict(int& n0, int& n1) const {
if (lettercount>=THRESHOLD) {
const int wt0=(ww0+1)*(ww0+1);
n0+=cp0->get0()*wt0;
n1+=cp0->get1()*wt0;
const int wt1=(ww0+ww1+2)*(ww0+ww1+2);
n0+=cp1->get0()*wt1;
n1+=cp1->get1()*wt1;
}
}
void update(int y) {
if (lettercount>=THRESHOLD) {
cp1->add(y);
if (cp1->get0()+cp1->get1() < 100)
cp0->add(y);
}
// Update contexts with next bit
c0+=c0+y;
c0h+=c0h+y;
if (c0h>=59) c0h-=59;
if (c0>=256) {
c0-=256;
if (lettercount>0 &&
(c0>127 || (c0<32 && c0!='\n' && c0!='\r' && c0!='\t')))
--lettercount;
if (isalpha(c0)) {
word0=(word0+tolower(c0)+1)*234577751*16;
if (++ww0>8) ww0=8;
if (lettercount=THRESHOLD) {
U32 h=word0*123456791+c1*345689647+c0h+(c0<<24);
cp0=&t1[h];
cp1=&t1[word1+h];
}
}
};
/* CyclicModel models data with fixed length records such as tables,
databases, images, audio, binary numeric data, etc. It models
runs of 0s or 1s in bit columns. A table is detected when an 8-bit
pattern occurs 4 or more times in a row spaced by the same interval. The
table ends after no repetition detection for the same number of bits
as were in the table. */
class CyclicModel: public Model {
struct E {
int p, n, r; // Position of last match, number of matches, interval
E(): p(0), n(0), r(0) {}
};
vector cpos; // Table of repeat patterns by char
int pos; // Current bit position in input
int c0; // Last 8 bits
int cycle; // Most likely number of columns
int column; // Column number, 0 to cycle-1
int size; // Number of bits before the table expires, 0 to 3*cycle
Hashtable t; // Context is last 8 bits in column
vector t1; // Context is the column number only
Counter *cp, *cp1; // Points to t, t1
public:
CyclicModel(): cpos(256), pos(0), c0(1), cycle(0), column(0), size(0),
t1(2048), cp(&t[0]), cp1(&t1[0]) {}
void predict(int& n0, int& n1) const {
if (cycle>0) {
int wt=16;
n0+=cp->get0()*wt;
n1+=cp->get1()*wt;
wt=4;
n0+=cp1->get0()*wt;
n1+=cp1->get1()*wt;
}
}
void update(int y) {
if (++column>=cycle) column=0;
if (size>0 && --size==0) cycle=0;
cp->add(y);
cp1->add(y);
c0=(c0+c0+y)&0xff;
++pos;
E& e=cpos[c0];
if (e.p+e.r==pos) {
++e.n;
if (e.n>3 && e.r>8 && e.r*e.n>size) {
size=e.r*e.n;
if (cycle!=e.r) {
cycle=e.r;
column=pos%cycle;
}
}
}
else {
e.n=1;
e.r=pos-e.p;
}
e.p=pos;
int h=column*3+c0*876546821;
cp=&t[h];
cp1=&t1[column&2047];
}
};
/* class SparseModel
TODO: find better hash functions, better weigh function,
try additional sparse context masks.
*/
class SparseModel: public Model {
mutable Hashtable counter;
int c0,c1,c2,c3,c4;
int cp, cp1, cp2;
public:
SparseModel(){c0=c1=c2=c3=c4=0;c0=1;cp=0;cp1=0;cp2=0;}
void predict(int& n0, int& n1) const {
int wt=1;
int a0=0, a1=0;
Counter * t = &counter[cp];
a0+=t->get0()*wt;
a1+=t->get1()*wt;
t = &counter[cp1];
a0+=t->get0()*wt;
a1+=t->get1()*wt;
t = &counter[cp2];
a0+=t->get0()*wt;
a1+=t->get1()*wt;
n0 += a0;
n1 += a1;
if((a0+a1<<4) < (n0+n1)){
n0 += a0;
n1 += a1;
}
}
void update(int y){
Counter * t = &counter[cp];
t->add(y);
t = &counter[cp1];
t->add(y);
t = &counter[cp2];
t->add(y);
cp = cp*553+y;
cp1 = cp1*353+y;
cp2= cp2*5257+y;
c0 += c0+y;
if(c0>=256){
c4=c3;c3=c2;c2=c1;c1=c0-256;
c0 = 1;
cp = 8826343*c2+c3*55342+0x1234;
cp1 = 8823*c3+c4+0xfe9;
cp2 = 3823*c1+17*c3+0xbe11;
}
}
};
// Secondary source encoder
struct SSEContext {
U8 c1, n; // Count of 1's, count of bits
U16 p() const {return U32(65535)*(c1*64+1)/(n*64+2);}
void update(int y) {
if (y)
++c1;
if (++n>254) {
c1/=2;
n/=2;
}
}
};
const int SSE1=256*3*3*3*3, SSE2=32; // dimensions
const int SSESCALE=1024/SSE2;
SSEContext sse[SSE1][SSE2+1]; // [context][mapped probability]
// Scale probability p (0 to 64K-1) into a context in the range 0 to 1K-1
class SSEMap {
enum {N=4096};
U16 table[N];
public:
int operator()(int p) const {return table[p/(65536/N)];}
SSEMap();
} ssemap; // global functoid
SSEMap::SSEMap() {
for (int i=0; i1023) p=1023;
if (p<0) p=0;
table[i]=p;
}
}
/* A Predictor predicts the next bit given the bits so far using a
collection of models. Methods:
p() returns probability of a 1 being the next bit, P(y = 1)
as a 16 bit number (0 to 64K-1).
update(y) updates the models with bit y (0 or 1)
*/
class Predictor {
NonstationaryPPM m1;
MatchModel m2;
WordModel m3;
CyclicModel m4;
SparseModel m5;
mutable int context, ssep;
public:
Predictor();
~Predictor();
U16 p() const {
int n0=1, n1=n0;
context=m1.getc0();
m4.predict(n0, n1); context=context*3+(n0*2>n1)+(n0>n1*2);
m2.predict(n0, n1); context=context*3+(n0*4>n1)+(n0>n1*4);
m3.predict(n0, n1); context=context*3+(n0*8>n1)+(n0>n1*8);
m5.predict(n0, n1);
m1.predict(n0, n1); context=context*3+(m1.getc1()>=64)+((m1.getc1()>=64)&&(m1.getc2()>=64));//+(n0*16>n1)+(n0>n1*16);
int n=n0+n1;
while (n>32767) {
n1/=16;
n/=16;
}
U16 pr=65535*n1/n;
ssep=ssemap(pr);
int wt=ssep%SSESCALE;
int i=ssep/SSESCALE;
return (((sse[context][i].p()*(SSESCALE-wt)+sse[context][i+1].p()*wt) /SSESCALE)*3+pr)/4;
}
void update(int y) {
m1.update(y);
m2.update(y);
m3.update(y);
m4.update(y);
m5.update(y);
sse[context][ssep/SSESCALE].update(y);
sse[context][ssep/SSESCALE+1].update(y);
}
};
Predictor::Predictor() {
int N=4096;
int oldp=SSE2+1;
for (int i=N-1; i>=0; --i) {
int p=(ssemap(i*65536/N)+SSESCALE/2)/SSESCALE;
int n=1+N*N/((i+1)*(N-i));
if (n>254) n=254;
int c1=(i*n+N/2)/N;
for (int j=oldp-1; j>=p; --j) {
for (int k=0; k=0x10000000) xmid+=(xdiff>>16)*p;
else if (xdiff>=0x1000000) xmid+=((xdiff>>12)*p)>>4;
else if (xdiff>=0x100000) xmid+=((xdiff>>8)*p)>>8;
else if (xdiff>=0x10000) xmid+=((xdiff>>4)*p)>>12;
else xmid+=(xdiff*p)>>16;
// Update the range
if (y)
x2=xmid;
else
x1=xmid+1;
predictor.update(y);
// Shift equal MSB's out
while (((x1^x2)&0xff000000)==0) {
putc(x2>>24, archive);
x1<<=8;
x2=(x2<<8)+255;
}
}
/* Decode one bit from the archive, splitting [x1, x2] as in the encoder
and returning 1 or 0 depending on which subrange the archive point x is in.
*/
inline int Encoder::decode() {
// Split the range
const U32 p=predictor.p(); // Probability P(1) * 64K rounded down
const U32 xdiff=x2-x1;
U32 xmid=x1; // = x1+p*(x2-x1) multiply without overflow, round down
if (xdiff>=0x10000000) xmid+=(xdiff>>16)*p;
else if (xdiff>=0x1000000) xmid+=((xdiff>>12)*p)>>4;
else if (xdiff>=0x100000) xmid+=((xdiff>>8)*p)>>8;
else if (xdiff>=0x10000) xmid+=((xdiff>>4)*p)>>12;
else xmid+=(xdiff*p)>>16;
// Update the range
int y=0;
if (x<=xmid) {
y=1;
x2=xmid;
}
else
x1=xmid+1;
predictor.update(y);
// Shift equal MSB's out
while ((x1>>24)==(x2>>24)) {
x1<<=8;
x2=(x2<<8)+255;
x=(x<<8)+getc(archive);
}
return y;
}
// Should be called when there is no more to compress
void Encoder::flush() {
// In COMPRESS mode, write out the remaining bytes of x, x1 < x < x2
if (mode==COMPRESS) {
while (((x1^x2)&0xff000000)==0) {
putc(x2>>24, archive);
x1<<=8;
x2=(x2<<8)+255;
}
putc(x2>>24, archive); // First unequal byte
}
}
// Read one byte from encoder e
int decompress(Encoder& e) { // Decompress 8 bits, MSB first
int c=0;
for (int i=0; i<8; ++i)
c=c+c+e.decode();
return c;
}
// Write one byte c to encoder e
void compress(Encoder& e, int c) {
for (int i=7; i>=0; --i)
e.encode((c>>i)&1);
}
// Fail if out of memory
void handler() {
printf("Out of memory\n");
exit(1);
}
// Read and return a line of input from FILE f (default stdin) up to
// first control character except tab. Skips CR in CR LF.
string getline(FILE* f=stdin) {
int c;
string result="";
while ((c=getc(f))!=EOF && (c>=32 || c=='\t'))
result+=char(c);
if (c=='\r')
(void) getc(f);
return result;
}
// User interface
int main(int argc, char** argv) {
set_new_handler(handler);
// Check arguments
if (argc<2) {
printf(
"PAQ3 file compressor/archiver, (C) 2003, Matt Mahoney, mmahoney@cs.fit.edu\n"
"This program is free software distributed without warranty under the terms\n"
"of the GNU General Public License, see http://www.gnu.org/licenses/gpl.txt\n"
"\n"
"To compress: PAQ3 archive filenames... (archive will be created)\n"
" or (MSDOS): dir/b | PAQ3 archive (reads file names from input)\n"
"To extract/compare: PAQ3 archive (does not clobber existing files)\n"
"To view contents: more < archive\n");
return 1;
}
// File names and sizes from input or archive
vector filename; // List of names
vector filesize; // Size or -1 if error
int start_time=clock();
int uncompressed_bytes=0, compressed_bytes=0; // Input, output sizes
// Extract files
FILE* archive=fopen(argv[1], "rb");
if (archive) {
if (argc>2) {
printf("File %s already exists\n", argv[1]);
return 1;
}
printf("Extracting archive %s ...\n", argv[1]);
// Read "PAQ3\r\n" at start of archive
if (getline(archive) != "PAQ3") {
printf("Archive file %s not in PAQ3 format\n", argv[1]);
return 1;
}
// Read "size filename" in "%d\t%s\r\n" format
while (true) {
string s=getline(archive);
if (s.size()>1) {
filesize.push_back(atol(s.c_str()));
string::iterator tab=find(s.begin(), s.end(), '\t');
if (tab!=s.end())
filename.push_back(string(tab+1, s.end()));
else
filename.push_back("");
}
else
break;
}
// Test end of header for "\f\0"
{
int c1=0, c2=0;
if ((c1=getc(archive))!='\f' || (c2=getc(archive))!=0) {
printf("%s: Bad PAQ3 header format %d %d\n", argv[1],
c1, c2);
return 1;
}
}
// Extract files from archive data
Encoder e(DECOMPRESS, archive);
for (int i=0; i2)
for (int i=2; i=0)
fprintf(archive, "%ld\t%s\r\n", filesize[i], filename[i].c_str());
}
putc(032, archive); // MSDOS EOF
putc('\f', archive);
putc(0, archive);
// Write data
Encoder e(COMPRESS, archive);
long file_start=ftell(archive);
for (int i=0; i=0) {
uncompressed_bytes+=size;
printf("%-23s %10ld -> ", filename[i].c_str(), size);
FILE* f=fopen(filename[i].c_str(), "rb");
int c;
for (long j=0; j0 && elapsed_time>0) {
printf(" (%1.4f bpc, %1.2f%% at %1.0f KB/s)",
compressed_bytes*8.0/uncompressed_bytes,
compressed_bytes*100.0/uncompressed_bytes,
uncompressed_bytes/(elapsed_time*1000.0));
}
printf("\n");
return 0;
}