#!/usr/bin/env python3 """Nacrith CPU -- Neural Arithmetic Compression -- Hugging Face Space Demo (CPU). Supports both text (TC01) and binary (NC05) compression. """ import gzip import time import base64 import gradio as gr from compressor import NeuralCompressor, MAGIC, MAGIC_BIN, HEADER_SIZE from utils import format_size MAX_TEXT_TOKENS = 1500 MAX_BINARY_UPLOAD = 1 * 1024 * 1024 # 1 MB MAX_NC_UPLOAD = 1 * 1024 * 1024 # 1 MB compressor = None def get_compressor(): global compressor if compressor is None: compressor = NeuralCompressor(verbose=False) return compressor # --------------------------------------------------------------------------- # Tab 1: Compress Text (TC01) # --------------------------------------------------------------------------- def compress_text(text): if not text or not text.strip(): return "Please enter some text to compress.", "", "" text = text.strip() comp = get_compressor() original_bytes = len(text.encode("utf-8")) token_ids = comp.model.tokenizer.encode(text) num_tokens = len(token_ids) if num_tokens > MAX_TEXT_TOKENS: return ( f"**Input too long.** Please use up to {MAX_TEXT_TOKENS} tokens (~4000 characters) " "for the demo. Larger files work locally -- see " "[GitHub](https://github.com/st4ck/Nacrith-CPU).", "", "", ) t0 = time.time() compressed = comp.compress(text) elapsed = time.time() - t0 compressed_size = len(compressed) ratio = compressed_size / original_bytes * 100 tokens_per_sec = num_tokens / elapsed if elapsed > 0 else 0 decompressed = comp.decompress(compressed) lossless = decompressed == text gzip_data = gzip.compress(text.encode("utf-8"), compresslevel=9) gzip_size = len(gzip_data) gzip_ratio = gzip_size / original_bytes * 100 improvement = gzip_size / compressed_size if compressed_size > 0 else 0 b64 = base64.b64encode(compressed).decode("ascii") verify = "Yes" if lossless else "FAILED" md = f"""## Compression Results | | Size | Ratio | |---|---|---| | **Original** | {format_size(original_bytes)} | 100% | | **gzip -9** | {format_size(gzip_size)} | {gzip_ratio:.1f}% | | **Nacrith CPU** | {format_size(compressed_size)} | {ratio:.1f}% | **Nacrith CPU is {improvement:.1f}x smaller than gzip** Tokens: {num_tokens} | Time: {elapsed:.1f}s ({tokens_per_sec:.0f} tok/s) | Lossless: {verify} | Space saved: {100 - ratio:.1f}% """ download_html = ( f'' f'Download compressed.nc ({format_size(compressed_size)})' ) return md, download_html, b64 # --------------------------------------------------------------------------- # Tab 2: Upload Text/Binary — auto-detects format # --------------------------------------------------------------------------- def _is_text_file(data: bytes) -> bool: """Check if data looks like a valid UTF-8 text file.""" try: data.decode("utf-8") return True except UnicodeDecodeError: return False def compress_file_b64(b64_data, filename): if not b64_data or not b64_data.strip(): return "Please upload a file using the button above.", "" try: data = base64.b64decode(b64_data.strip()) except Exception: return "**Failed to read uploaded file.**", "" if len(data) > MAX_BINARY_UPLOAD: return ( f"**File too large** ({format_size(len(data))}). " f"Max upload size is {format_size(MAX_BINARY_UPLOAD)}.", "", ) if len(data) == 0: return "**Empty file.** Please upload a file with content.", "" comp = get_compressor() original_size = len(data) is_text = _is_text_file(data) gzip_data = gzip.compress(data, compresslevel=9) gzip_size = len(gzip_data) gzip_ratio = gzip_size / original_size * 100 t0 = time.time() if is_text: text = data.decode("utf-8") compressed = comp.compress(text) else: compressed = comp.compress_bytes(data) elapsed = time.time() - t0 compressed_size = len(compressed) ratio = compressed_size / original_size * 100 improvement = gzip_size / compressed_size if compressed_size > 0 else 0 # Verify lossless round-trip decompressed = comp.decompress(compressed) if is_text: lossless = decompressed == text else: lossless = decompressed == data verify = "Yes" if lossless else "FAILED" b64_out = base64.b64encode(compressed).decode("ascii") # Derive output filename out_name = (filename.strip() + ".nc") if filename and filename.strip() else "compressed.nc" mode_label = "Text (TC01)" if is_text else "Binary (NC05)" md = f"""## Compression Results | | Size | Ratio | |---|---|---| | **Original** | {format_size(original_size)} | 100% | | **gzip -9** | {format_size(gzip_size)} | {gzip_ratio:.1f}% | | **Nacrith CPU** | {format_size(compressed_size)} | {ratio:.1f}% | **Nacrith CPU is {improvement:.1f}x smaller than gzip** Mode: {mode_label} | Time: {elapsed:.1f}s | Lossless: {verify} | Space saved: {100 - ratio:.1f}% """ download_html = ( f'' f'Download {out_name} ({format_size(compressed_size)})' ) return md, download_html # --------------------------------------------------------------------------- # Tab 3: Decompress (TC01 text or NC05 binary) # --------------------------------------------------------------------------- def _decompress_data(data): """Shared decompression logic. Returns (md, download_html, text).""" if len(data) < HEADER_SIZE: return "**Invalid data.** Too short to be a Nacrith CPU compressed file.", "", "" magic = data[:4] if magic not in (MAGIC, MAGIC_BIN): return "**Invalid data.** Not a Nacrith CPU compressed file (wrong magic bytes).", "", "" comp = get_compressor() is_binary = magic == MAGIC_BIN try: t0 = time.time() result = comp.decompress(data) elapsed = time.time() - t0 except Exception as e: return f"**Decompression failed:** {e}", "", "" if is_binary: original_size = len(result) b64_out = base64.b64encode(result).decode("ascii") md = f"""## Decompression Results (Binary) - Compressed: {format_size(len(data))} - Decompressed: {format_size(original_size)} - Time: {elapsed:.1f}s - Format: NC05 (hybrid binary) - **Lossless reconstruction successful** """ download_html = ( f'' f'Download decompressed.bin ({format_size(original_size)})' ) return md, download_html, "" else: original_bytes = len(result.encode("utf-8")) md = f"""## Decompression Results (Text) - Compressed: {format_size(len(data))} - Decompressed: {format_size(original_bytes)} - Time: {elapsed:.1f}s - Format: TC01 (text) - **Lossless reconstruction successful** """ return md, "", result def decompress_file_b64(b64_data): """Decompress from file uploaded via JS (passed as base64).""" if not b64_data or not b64_data.strip(): return "**No file.** Please upload a .nc file.", "", "" try: data = base64.b64decode(b64_data.strip()) except Exception: return "**Failed to read uploaded file.**", "", "" if len(data) > MAX_NC_UPLOAD: return ( f"**File too large** ({format_size(len(data))}). " f"Max size is {format_size(MAX_NC_UPLOAD)}.", "", "", ) return _decompress_data(data) def decompress_b64(b64_text): """Decompress from pasted base64 data.""" if not b64_text or not b64_text.strip(): return "**No data.** Paste base64 data from the Compress tab, or upload a .nc file above.", "", "" try: data = base64.b64decode(b64_text.strip()) except Exception: return "**Invalid base64 data.** Please paste the exact output from the Compress tab.", "", "" if len(data) > MAX_NC_UPLOAD: return ( f"**Data too large** ({format_size(len(data))}). " f"Max size is {format_size(MAX_NC_UPLOAD)}.", "", "", ) return _decompress_data(data) # --------------------------------------------------------------------------- # UI # --------------------------------------------------------------------------- HEADER_HTML = """
Neural Arithmetic Compression -- State-of-the-Art Lossless Compression
Website | GitHub | Trigram Tables + Arithmetic Coding | CPU-only | Supports text & binary files
Information is Already There