# enumerative-entropy-coding / test_paper_examples.py
# (GitHub page scrape residue, commented out to keep this file valid Python:
#  author "osteele", commit message "Initial commit", commit 24c19d8 unverified)
#!/usr/bin/env python3
"""
Test cases that match specific examples from Han et al. (2008) paper.
Tests only end-to-end encoding/decoding and verifies results match paper.
"""
import numpy as np
from enumerative_coding import EnumerativeEncoder
from collections import Counter
def test_example_from_paper():
    """
    Test with a specific example that should match the paper's results.
    This verifies the encoding produces the expected output.
    """
    print("Testing Example from Paper")
    print("=" * 50)

    # Hand-checkable input: two 0s, two 1s, one 2.
    seq = [0, 1, 0, 1, 2]
    print(f"Input sequence: {seq}")
    counts = [seq.count(symbol) for symbol in range(3)]
    print(f"Symbol counts: {counts}")

    codec = EnumerativeEncoder()
    payload = codec.encode(seq)
    recovered = codec.decode(payload)

    # Round-trip must reproduce the input exactly.
    assert recovered == seq, f"Decoding failed: expected {seq}, got {recovered}"
    print("✓ End-to-end encoding/decoding successful")

    # Report how well the sequence compressed.
    n_symbols = len(seq)
    n_bytes = len(payload)
    print(f"Original: {n_symbols} symbols -> Compressed: {n_bytes} bytes")
    print(f"Compression ratio: {n_symbols / n_bytes:.2f}")
def test_paper_sequence_properties():
    """
    Test with sequences that have the properties discussed in the paper.
    Verify the algorithm handles different symbol distributions correctly.
    """
    print("\n\nTesting Paper Sequence Properties")
    print("=" * 50)

    # Distributions exercising the cases discussed in the paper.
    cases = [
        ("Uniform distribution", [0, 1, 2, 0, 1, 2]),
        ("Skewed distribution", [0, 0, 0, 1, 1, 2]),
        ("Single symbol", [0, 0, 0, 0]),
        ("Binary sequence", [0, 1, 1, 0, 1]),
    ]

    codec = EnumerativeEncoder()
    for label, seq in cases:
        print(f"\n{label}: {seq}")

        # Round-trip through the codec and check it is lossless.
        payload = codec.encode(seq)
        assert codec.decode(payload) == seq, f"Failed for {label}"

        # Guard against division by zero for an empty payload.
        n_bytes = len(payload)
        ratio = len(seq) / n_bytes if n_bytes > 0 else float('inf')
        print(f"  Compressed: {len(seq)} -> {n_bytes} bytes (ratio: {ratio:.2f})")
        print("  ✓ Correctly encoded and decoded")
def test_lexicographic_ordering():
    """
    Test that sequences with the same symbol counts but different orders
    produce different encodings (different lexicographic indices).

    Fix vs. original: each permutation was encoded twice (once to report
    its size, again to collect encodings for the uniqueness check); this
    version encodes each permutation once and reuses the result.
    """
    print("\n\nTesting Lexicographic Ordering")
    print("=" * 50)

    # All permutations of [0, 0, 1, 1] share symbol counts but must map
    # to distinct lexicographic indices (hence distinct encodings).
    permutations = [
        [0, 0, 1, 1],
        [0, 1, 0, 1],
        [0, 1, 1, 0],
        [1, 0, 0, 1],
        [1, 0, 1, 0],
        [1, 1, 0, 0],
    ]

    encoder = EnumerativeEncoder()
    encodings = []
    for perm in permutations:
        encoded = encoder.encode(perm)
        decoded = encoder.decode(encoded)

        # Verify end-to-end correctness for every permutation.
        assert decoded == perm, f"Decoding failed for {perm}"

        # Keep the encoding itself for the uniqueness check below.
        encodings.append(encoded)
        print(f"  {perm} -> compressed size: {len(encoded)} bytes")

    # Different permutations must produce different encodings; identical
    # encodings would mean the code is lossy and cannot decode uniquely.
    unique_encodings = len(set(encodings))
    total_permutations = len(permutations)
    print(f"  Unique encodings: {unique_encodings} out of {total_permutations} permutations")
    assert unique_encodings == total_permutations, f"CRITICAL BUG: Different permutations produced identical encodings! This means the algorithm is lossy and cannot uniquely decode sequences."
    print("  ✓ All permutations have unique encodings")
def main():
    """Run all paper example tests."""
    # Run each test in order; any failure raises AssertionError.
    for test_fn in (
        test_example_from_paper,
        test_paper_sequence_properties,
        test_lexicographic_ordering,
    ):
        test_fn()
    print("\n" + "=" * 50)
    print("All paper example tests passed! ✓")


if __name__ == "__main__":
    main()