Spaces:

robtacconelli
/

Nacrith-GPU

Running on Zero

App Files Files Community

robtacconelli committed on Feb 24

Commit

8f1bdaa

verified ·

1 Parent(s): 5b8133e

Upload 11 files

Browse files

Files changed (1) hide show

compressor.py +199 -0

compressor.py CHANGED Viewed

@@ -17,6 +17,8 @@ by ParallelNeuralCompressor (NC05/NC06 formats).
 """
 import gc
 import struct
 import sys
@@ -30,6 +32,14 @@ from lzp_model import LZPModel
 from context_mixer import ContextMixer
 from adaptive_head import AdaptiveHead
 # ---- CDF precision ----
 # Enhanced CDF: 2^24 instead of the original 2^16.
@@ -606,3 +616,192 @@ class NeuralCompressor:
             num_models=num_mix, lr=DEFAULT_MIXER_LR,
         ) if num_mix > 1 else None

 """
 import gc
+import gzip
+import lzma
 import struct
 import sys
 from context_mixer import ContextMixer
 from adaptive_head import AdaptiveHead
+# ---- File format constants (NC05 text / NC06 hybrid binary) ----
+MAGIC = b'NC05'       # single-worker text format
+MAGIC_BIN = b'NC06'   # single-worker hybrid binary format
+# Minimum bytes needed to identify a valid header (NC05: 9B; the NC06 header is 15B)
+HEADER_SIZE = 9
+NC06_VERSION = 1
 # ---- CDF precision ----
 # Enhanced CDF: 2^24 instead of the original 2^16.
             num_models=num_mix, lr=DEFAULT_MIXER_LR,
         ) if num_mix > 1 else None
+    # ------------------------------------------------------------------
+    # Public compress / decompress (NC05 text, NC06 hybrid binary)
+    # ------------------------------------------------------------------
+    def compress(self, text: str) -> bytes:
+        """Compress text to bytes (NC05 single-chunk format)."""
+        flags = self._config_flags()
+        temp_encoded = int(round(self.temperature * 10000))
+        if not text:
+            return MAGIC + struct.pack('>BHH', flags, temp_encoded, 0)
+        self.model.reset_cache()
+        self._reset_secondary_models()
+        num_tokens, compressed_bits, stream = self._compress_text_to_stream(text)
+        header = MAGIC + struct.pack('>BHH', flags, temp_encoded, 1)
+        entry = struct.pack('>III', num_tokens, compressed_bits, len(stream))
+        return header + entry + stream
+    def compress_bytes(self, data: bytes) -> bytes:
+        """Compress raw bytes using hybrid chunked format (NC06)."""
+        chunks = _segment_chunks(data)
+        num_entries = len(chunks)
+        flags = self._config_flags()
+        temp_encoded = int(round(self.temperature * 10000))
+        file_header = MAGIC_BIN + struct.pack(
+            '>BHII', flags, temp_encoded, NC06_VERSION, num_entries,
+        )
+        if num_entries == 0:
+            return file_header
+        entry_table = []
+        binary_parts = []
+        text_indices = []
+        total_binary = 0
+        for ci, (chunk_type, offset, length) in enumerate(chunks):
+            entry_table.append(struct.pack('>BI', chunk_type, length))
+            if chunk_type == CHUNK_TYPE_BINARY:
+                binary_parts.append(data[offset:offset + length])
+                total_binary += length
+            else:
+                text_indices.append(ci)
+        if total_binary > 0:
+            binary_blob = b''.join(binary_parts)
+            if total_binary >= LZMA_THRESHOLD:
+                compressed = lzma.compress(binary_blob)
+                method = BLOB_LZMA
+            else:
+                compressed = gzip.compress(binary_blob, compresslevel=9)
+                method = BLOB_GZIP
+            if len(compressed) >= total_binary:
+                compressed = binary_blob
+                method = BLOB_RAW
+            binary_section = struct.pack('>BI', method, len(compressed)) + compressed
+        else:
+            binary_section = b''
+        # NC06 text entry: n_sub_chunks(2) + sub-chunk table + streams
+        # Single worker: always 1 sub-chunk per text entry.
+        text_sections = []
+        for ci in text_indices:
+            chunk_type, offset, length = chunks[ci]
+            text = data[offset:offset + length].decode('latin-1')
+            self.model.reset_cache()
+            self._reset_secondary_models()
+            token_count, bit_count, stream = self._compress_text_to_stream(text)
+            sub_entry = struct.pack('>III', token_count, bit_count, len(stream))
+            text_sections.append(struct.pack('>H', 1) + sub_entry + stream)
+        return (file_header
+                + b''.join(entry_table)
+                + binary_section
+                + b''.join(text_sections))
+    def decompress(self, data: bytes) -> 'str | bytes':
+        """Decompress NC05 (text) or NC06 (hybrid binary) format."""
+        if len(data) < HEADER_SIZE:
+            raise ValueError("Data too short to contain a valid header")
+        magic = data[:4]
+        if magic == MAGIC:
+            return self._decompress_nc05(data)
+        elif magic == MAGIC_BIN:
+            return self._decompress_nc06(data)
+        else:
+            raise ValueError(
+                f"Invalid magic bytes: {magic!r} "
+                f"(expected {MAGIC!r} or {MAGIC_BIN!r})"
+            )
+    def _decompress_nc05(self, data: bytes) -> str:
+        """Decompress NC05 (text) format."""
+        flags = data[4]
+        temp_encoded, n_chunks = struct.unpack('>HH', data[5:9])
+        if n_chunks == 0:
+            return ""
+        self._apply_flags(flags)
+        self.temperature = temp_encoded / 10000.0
+        pos = 9
+        entries = []
+        for _ in range(n_chunks):
+            num_tokens, comp_bits, stream_len = struct.unpack(
+                '>III', data[pos:pos + 12],
+            )
+            entries.append((num_tokens, comp_bits, stream_len))
+            pos += 12
+        texts = []
+        for num_tokens, comp_bits, stream_len in entries:
+            stream = data[pos:pos + stream_len]
+            pos += stream_len
+            self.model.reset_cache()
+            self._reset_secondary_models()
+            texts.append(self._decompress_text_stream(stream, num_tokens))
+        return ''.join(texts)
+    def _decompress_nc06(self, data: bytes) -> bytes:
+        """Decompress NC06 (hybrid binary) format."""
+        flags = data[4]
+        temp_encoded, _version, num_entries = struct.unpack('>HII', data[5:15])
+        self._apply_flags(flags)
+        self.temperature = temp_encoded / 10000.0
+        if num_entries == 0:
+            return b""
+        pos = 15
+        entries = []
+        total_binary = 0
+        for _ in range(num_entries):
+            etype, elen = struct.unpack('>BI', data[pos:pos + 5])
+            entries.append((etype, elen))
+            if etype == CHUNK_TYPE_BINARY:
+                total_binary += elen
+            pos += 5
+        binary_data = b''
+        if total_binary > 0:
+            method, comp_len = struct.unpack('>BI', data[pos:pos + 5])
+            pos += 5
+            compressed = data[pos:pos + comp_len]
+            pos += comp_len
+            if method == BLOB_RAW:
+                binary_data = compressed
+            elif method == BLOB_GZIP:
+                binary_data = gzip.decompress(compressed)
+            elif method == BLOB_LZMA:
+                binary_data = lzma.decompress(compressed)
+        binary_offset = 0
+        output_parts = []
+        for etype, elen in entries:
+            if etype == CHUNK_TYPE_BINARY:
+                output_parts.append(
+                    binary_data[binary_offset:binary_offset + elen]
+                )
+                binary_offset += elen
+            else:
+                n_sub = struct.unpack('>H', data[pos:pos + 2])[0]
+                pos += 2
+                sub_entries = []
+                for _ in range(n_sub):
+                    num_tokens, comp_bits, stream_len = struct.unpack(
+                        '>III', data[pos:pos + 12],
+                    )
+                    sub_entries.append((num_tokens, comp_bits, stream_len))
+                    pos += 12
+                texts = []
+                for num_tokens, comp_bits, stream_len in sub_entries:
+                    stream = data[pos:pos + stream_len]
+                    pos += stream_len
+                    self.model.reset_cache()
+                    self._reset_secondary_models()
+                    texts.append(
+                        self._decompress_text_stream(stream, num_tokens)
+                    )
+                output_parts.append(''.join(texts).encode('latin-1'))
+        return b''.join(output_parts)