"""
Hybrid compressor (v7.6): TRUE PARALLEL multiprocessing + accumulated XZ bucket.

1. Auto-discover all trigram tables in trigrams/ directory
2. Load all numpy arrays into shared memory ONCE (zero-copy for workers)
3. Dynamic chunk sizing: max(2048, min(65536, segment_len // 10))
4. Full-file contiguous XZ: one lzma.compress(entire file) runs in a
   background worker while per-chunk trigram/lzma entries are computed.
   At the end, emit whichever plan (full-file XZ vs individual entries)
   produces smaller output.
5. Trigram tables tested in PARALLEL via ProcessPoolExecutor

Key improvements over v7.5:
 - ProcessPoolExecutor bypasses the GIL for true CPU parallelism
 - Shared memory segments avoid duplicating 3.9GB of tables
 - All CPU cores utilized (workers = min(cpu_count, num_tables + 1))
 - Full-file contiguous XZ exploits cross-chunk repetition for better ratios
 - Dynamic chunk sizing adapts to file size

File formats:
  TC01 -- pure text (single stream, backward compat)
  NC03 -- hybrid chunked format (binary + text sub-chunks, single table)
  NC05 -- parallel multi-table format (adds table_id per trigram entry)
"""
import glob
import lzma
import os
import struct
import sys
from collections import Counter, defaultdict
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import shared_memory, cpu_count

import numpy as np
from transformers import AutoTokenizer

# Imports needed in main process for decompression
from arithmetic_coder import ArithmeticEncoder, ArithmeticDecoder

# ---- format constants ----
# Four-byte magic tags identifying the container format (see module docstring).
MAGIC_TEXT  = b'TC01'  # pure text, single stream (backward compat)
MAGIC_NC03  = b'NC03'  # hybrid chunked format, single table
MAGIC_CHUNK = b'NC05'  # parallel multi-table format (table_id per trigram entry)

# Per-entry method tags (the ASCII code of the letter in the comment).
METHOD_BINARY  = 0x42  # 'B' -- binary, always lzma
METHOD_TRIGRAM = 0x54  # 'T' -- text sub-chunk, trigram won
METHOD_LZMA   = 0x4C  # 'L' -- text sub-chunk, lzma won

# ---- chunk sizes ----
# Dynamic chunk sizing: computed per text segment in compress_bytes()
# chunk_size = max(2048, min(65536, segment_len // 10))

# ---- binary/text detection thresholds ----
# All three are byte counts consumed by _segment_chunks().
MIN_TEXT_RUN     = 64  # text runs shorter than this are reclassified as binary
MAX_BRIDGE_GAP   = 8   # binary gaps up to this size between two text runs are bridged
MIN_BINARY_CHUNK = 64  # binary runs shorter than this are absorbed into adjacent text

# Region type tags returned by _segment_chunks(); the numeric values coincide
# with METHOD_TRIGRAM / METHOD_BINARY ('T' / 'B').
CHUNK_TYPE_TEXT   = 0x54  # internal, for segmentation
CHUNK_TYPE_BINARY = 0x42

# Bytes considered text-like: printable ASCII + tab/LF/CR
TEXT_BYTES = frozenset(range(32, 127)) | {9, 10, 13}


# ==================================================================
# Binary/text segmentation (same as v7.4/v7.5)
# ==================================================================

def _segment_chunks(data: bytes) -> list[tuple[int, int, int]]:
    """Partition ``data`` into alternating text and binary regions.

    The pipeline is:
      1. split into maximal runs of text-like / non-text-like bytes,
      2. reclassify text runs shorter than MIN_TEXT_RUN as binary,
      3. bridge binary gaps of at most MAX_BRIDGE_GAP bytes that sit
         between two text runs,
      4. absorb binary runs shorter than MIN_BINARY_CHUNK into any
         neighbouring text run,
    coalescing same-type neighbours after each stage.

    Returns:
        A list of ``(chunk_type, offset, length)`` tuples in file order,
        where ``chunk_type`` is CHUNK_TYPE_TEXT or CHUNK_TYPE_BINARY.
        Empty input yields an empty list.
    """
    if not data:
        return []

    # Classification table indexed by byte value.
    kind_of = [CHUNK_TYPE_TEXT if value in TEXT_BYTES else CHUNK_TYPE_BINARY
               for value in range(256)]

    # Stage 1: maximal runs of identically classified bytes.
    runs = []
    run_kind = kind_of[data[0]]
    run_begin = 0
    for pos, value in enumerate(data):
        seen = kind_of[value]
        if seen != run_kind:
            runs.append((run_kind, run_begin, pos - run_begin))
            run_kind, run_begin = seen, pos
    runs.append((run_kind, run_begin, len(data) - run_begin))

    # Stage 2: text runs too short to be worth modelling become binary,
    # then neighbours of equal type are fused.
    demoted = []
    for kind, begin, size in runs:
        if kind == CHUNK_TYPE_TEXT and size < MIN_TEXT_RUN:
            kind = CHUNK_TYPE_BINARY
        demoted.append((kind, begin, size))
    runs = _merge_adjacent(demoted)

    # Stage 3: swallow tiny binary gaps that separate two text runs.
    if len(runs) >= 3:
        final = len(runs) - 1
        bridged = [runs[0]]
        pos = 1
        while pos < final:
            kind, begin, size = runs[pos]
            is_gap = (
                kind == CHUNK_TYPE_BINARY
                and size <= MAX_BRIDGE_GAP
                and bridged[-1][0] == CHUNK_TYPE_TEXT
                and runs[pos + 1][0] == CHUNK_TYPE_TEXT
            )
            if is_gap:
                # Fold the gap and the text run after it into the text
                # run already emitted on the left.
                _, left_begin, left_size = bridged[-1]
                right_size = runs[pos + 1][2]
                bridged[-1] = (CHUNK_TYPE_TEXT, left_begin,
                               left_size + size + right_size)
                pos += 2
            else:
                bridged.append((kind, begin, size))
                pos += 1
        if pos <= final:
            # The trailing run was not consumed by a bridge.
            bridged.append(runs[pos])
        runs = bridged

    runs = _merge_adjacent(runs)

    # Stage 4: small binary runs are absorbed by neighbouring text.
    if len(runs) >= 2:
        count = len(runs)
        absorbed = []
        pos = 0
        while pos < count:
            kind, begin, size = runs[pos]
            pos += 1  # ``pos`` now indexes the right-hand neighbour
            if kind != CHUNK_TYPE_BINARY or size >= MIN_BINARY_CHUNK:
                absorbed.append((kind, begin, size))
                continue

            text_on_left = (bool(absorbed)
                            and absorbed[-1][0] == CHUNK_TYPE_TEXT)
            text_on_right = (pos < count
                             and runs[pos][0] == CHUNK_TYPE_TEXT)

            if text_on_left:
                _, left_begin, left_size = absorbed[-1]
                grown = left_size + size
                if text_on_right:
                    # Text on both sides: fuse all three runs at once.
                    grown += runs[pos][2]
                    pos += 1
                absorbed[-1] = (CHUNK_TYPE_TEXT, left_begin, grown)
            elif text_on_right:
                # Relabel as text; the final merge joins it to the right.
                absorbed.append((CHUNK_TYPE_TEXT, begin, size))
            else:
                absorbed.append((kind, begin, size))
        runs = _merge_adjacent(absorbed)

    return runs


def _merge_adjacent(runs):
    if not runs:
        return runs
    merged = [runs[0]]
    for t, off, length in runs[1:]:
        if t == merged[-1][0]:
            prev_t, prev_off, prev_len = merged[-1]
            merged[-1] = (prev_t, prev_off, prev_len + length)
        else:
            merged.append((t, off, length))
    return merged


# ==================================================================
# Text compression helpers
# ==================================================================

def _bisect(sorted_arr, val):
    lo, hi = 0, len(sorted_arr) - 1
    while lo <= hi:
        mid = (lo + hi) >> 1
        v = int(sorted_arr[mid])
        if v == val:
            return mid
        elif v < val:
            lo = mid + 1
        else:
            hi = mid - 1
    return -1


def _encode_uniform(encoder, value, total):
    if total <= 1:
        return
    if total <= 16384:
        cdf = list(range(total + 1))
        encoder.encode_symbol(cdf, value)
    else:
        hi_total = (total + 255) // 256
        hi_val = value // 256
        lo_total = min(256, total - hi_val * 256)
        lo_val = value % 256
        if lo_val >= lo_total:
            lo_val = lo_total - 1
        cdf_hi = list(range(hi_total + 1))
        encoder.encode_symbol(cdf_hi, hi_val)
        cdf_lo = list(range(lo_total + 1))
        encoder.encode_symbol(cdf_lo, lo_val)


def _decode_uniform(decoder, total):
    if total <= 1:
        return 0
    if total <= 16384:
        cdf = list(range(total + 1))
        return decoder.decode_symbol(cdf)
    else:
        hi_total = (total + 255) // 256
        cdf_hi = list(range(hi_total + 1))
        hi_val = decoder.decode_symbol(cdf_hi)
        lo_total = min(256, total - hi_val * 256)
        cdf_lo = list(range(lo_total + 1))
        lo_val = decoder.decode_symbol(cdf_lo)
        return hi_val * 256 + lo_val


def _rank_to_token(rank, excluded_sorted):
    token_id = 0
    remaining = rank
    ex_idx = 0
    n_ex = len(excluded_sorted)
    while True:
        while ex_idx < n_ex and excluded_sorted[ex_idx] == token_id:
            token_id += 1
            ex_idx += 1
        if remaining == 0:
            return token_id
        remaining -= 1
        token_id += 1


# ==================================================================
# Standalone trigram compression/decompression functions
# (used by main process for decompression and TC01 backward compat)
# ==================================================================

def _trigram_compress_chunk(model, tokenizer, chunk_bytes):
    """Compress one chunk with a given trigram model.

    The chunk is decoded as latin-1, tokenized, and every token is
    arithmetic-coded against the model's sparse CDF for the current
    context. A token absent from the sparse candidate list is coded as
    the trailing "rest" symbol followed by its uniform rank among the
    non-candidate tokens.

    Args:
        model: Adaptive model exposing ``reset``, ``get_sparse_cdf``,
            ``get_rest_rank`` and ``update``.
        tokenizer: Tokenizer exposing ``encode(text)``.
        chunk_bytes: Raw bytes of the chunk.

    Returns (num_tokens, stream_bytes).
    """
    text = chunk_bytes.decode('latin-1')
    token_ids = tokenizer.encode(text)
    num_tokens = len(token_ids)
    if num_tokens == 0:
        return 0, b''
    model.reset()
    encoder = ArithmeticEncoder()
    context = []

    for token_id in token_ids:
        sparse_tokens, cdf = model.get_sparse_cdf(context)
        pos = _bisect(sparse_tokens, token_id)

        if pos >= 0:
            encoder.encode_symbol(cdf, pos)
        else:
            # Escape: the symbol after the last candidate means "rest".
            encoder.encode_symbol(cdf, len(sparse_tokens))
            # The sorted exclusion list is only needed on this path, so
            # it is built here rather than once per token.
            excluded_sorted = sorted(sparse_tokens.tolist())
            rest_size, rank = model.get_rest_rank(token_id, excluded_sorted)
            _encode_uniform(encoder, rank, rest_size)

        model.update(token_id)
        context.append(token_id)

    stream = encoder.finish()
    return num_tokens, stream


def _trigram_decompress_chunk(model, tokenizer, stream, num_tokens):
    """Decompress a trigram stream back to bytes using a given model.

    Mirrors ``_trigram_compress_chunk``: for each of ``num_tokens``
    positions the model's sparse CDF is rebuilt from the tokens decoded
    so far, one symbol is read, and the "rest" escape is resolved by
    reading a uniform rank among the non-candidate tokens.

    Args:
        model: Adaptive model exposing ``reset``, ``get_sparse_cdf``,
            ``update`` and ``vocab_size``.
        tokenizer: Tokenizer exposing ``decode(token_ids)``.
        stream: Arithmetic-coded payload.
        num_tokens: Number of tokens that were encoded.

    Returns:
        The reconstructed chunk, encoded as latin-1 bytes.
    """
    if num_tokens == 0:
        return b''
    model.reset()
    decoder = ArithmeticDecoder(stream)
    context = []
    token_ids = []

    for _ in range(num_tokens):
        sparse_tokens, cdf = model.get_sparse_cdf(context)
        sym = decoder.decode_symbol(cdf)

        if sym < len(sparse_tokens):
            token_id = int(sparse_tokens[sym])
        else:
            # Escape path only: build the exclusion list lazily instead
            # of sorting the candidates for every decoded token.
            excluded_sorted = sorted(sparse_tokens.tolist())
            rest_size = max(model.vocab_size - len(excluded_sorted), 1)
            rank = _decode_uniform(decoder, rest_size)
            token_id = _rank_to_token(rank, excluded_sorted)

        token_ids.append(token_id)
        model.update(token_id)
        context.append(token_id)

    text = tokenizer.decode(token_ids)
    return text.encode('latin-1')


# ==================================================================
# Table discovery
# ==================================================================

def discover_trigram_tables(trigrams_dir):
    """Auto-discover all .npz trigram tables in the given directory.

    The directory path is escaped before globbing, so directories whose
    names contain glob metacharacters (``[``, ``]``, ``*``, ``?``) are
    searched literally.

    Args:
        trigrams_dir: Directory to scan (not recursive).

    Returns:
        List of ``(table_name, table_path)`` tuples ordered by file path.
        ``table_name`` is the file stem with a leading ``trigram_``
        removed; ``table_path`` is absolute. Empty when the directory
        does not exist.
    """
    if not os.path.isdir(trigrams_dir):
        return []

    pattern = os.path.join(glob.escape(os.fspath(trigrams_dir)), "*.npz")

    tables = []
    # Ordering is by path (not display name) and is kept as-is so table
    # indices derived from this list stay the same.
    for path in sorted(glob.glob(pattern)):
        stem = os.path.splitext(os.path.basename(path))[0]
        # Remove the common prefix for cleaner display
        display_name = stem.removeprefix("trigram_")
        tables.append((display_name, os.path.abspath(path)))

    return tables


# ==================================================================
# Shared memory management for trigram tables
# ==================================================================

# Model constants (imported from trigram_model to keep behavior identical)
from trigram_model import (
    CDF_TOTAL, MIN_PROB, LAMBDA_TRI, LAMBDA_BI, LAMBDA_UNI,
    LAMBDA_BI_ONLY, LAMBDA_UNI_ONLY, MAX_ADAPTIVE_WEIGHT,
    ADAPTIVE_RAMP_TOKENS, SPARSE_TOP_K,
)


def _create_shm_for_array(arr):
    """Create a shared memory segment and copy a numpy array into it.

    Returns (shm_name, dtype_str, shape_tuple, shm_object).
    """
    nbytes = arr.nbytes
    shm = shared_memory.SharedMemory(create=True, size=max(nbytes, 1))
    # Copy array data into shared memory buffer
    shm_arr = np.ndarray(arr.shape, dtype=arr.dtype, buffer=shm.buf)
    shm_arr[:] = arr[:]
    return shm.name, str(arr.dtype), arr.shape, shm


def _load_table_to_shared_memory(table_path, table_name, verbose=True):
    """Load one .npz trigram table into shared memory segments.

    Memory-efficient: loads each array, copies to shared memory, then
    immediately frees the original numpy array to minimize peak usage.
    The .npz file handle is closed as soon as every array has been
    copied. If anything fails part-way, the segments created so far are
    closed and unlinked before the exception propagates.

    Args:
        table_path: Path to the .npz table.
        table_name: Display name recorded in the returned info dict.
        verbose: Print a progress line to stderr.

    Returns (shm_info_dict, list_of_shm_objects).
    shm_info_dict contains everything workers need to reconstruct the
    table: ``vocab_size``, ``tokenizer_name``, ``table_name`` and
    ``arrays`` (array name -> ``{"shm_name", "dtype", "shape"}``).
    """
    import gc

    if verbose:
        print(f"  Loading {table_name} into shared memory...",
              file=sys.stderr)

    # Define array name -> (npz key, target dtype or None for original)
    # Only unigram_probs is converted (to float64); every other array
    # keeps the dtype stored in the .npz file.
    array_specs = [
        ("unigram_probs",  "unigram_probs",          np.float64),
        ("bi_ctx_keys",    "bigram_context_keys",     None),
        ("bi_top_tokens",  "bigram_top_tokens",       None),
        ("bi_top_probs",   "bigram_top_probs",        None),
        ("bi_remaining",   "bigram_remaining_mass",   None),
        ("tri_ctx_keys",   "trigram_context_keys",    None),
        ("tri_top_tokens", "trigram_top_tokens",       None),
        ("tri_top_probs",  "trigram_top_probs",        None),
        ("tri_remaining",  "trigram_remaining_mass",   None),
    ]

    shm_objects = []
    try:
        # NOTE(security): allow_pickle=True lets numpy unpickle object
        # arrays, which can execute arbitrary code. Only load table
        # files from trusted sources.
        with np.load(table_path, allow_pickle=True) as data:
            shm_info = {
                "vocab_size": int(data["vocab_size"][0]),
                "tokenizer_name": str(data["tokenizer_name"][0]),
                "table_name": table_name,
                "arrays": {},
            }

            # Load each array one at a time, copy to shm, free original
            uni_top_idx = None
            for arr_name, npz_key, target_dtype in array_specs:
                arr = data[npz_key]
                if target_dtype is not None:
                    arr = arr.astype(target_dtype)
                name, dtype_str, shape, shm_obj = _create_shm_for_array(arr)
                shm_objects.append(shm_obj)
                shm_info["arrays"][arr_name] = {
                    "shm_name": name,
                    "dtype": dtype_str,
                    "shape": shape,
                }
                if arr_name == "unigram_probs":
                    # Indices of the SPARSE_TOP_K most probable unigrams,
                    # most probable first. Computed from the array that
                    # is already in hand, which holds the same values as
                    # its shared-memory copy.
                    uni_top_idx = (
                        np.argsort(arr)[::-1][:SPARSE_TOP_K]
                        .astype(np.int64))
                del arr
                gc.collect()

        name, dtype_str, shape, shm_obj = _create_shm_for_array(uni_top_idx)
        shm_objects.append(shm_obj)
        shm_info["arrays"]["uni_top_idx"] = {
            "shm_name": name,
            "dtype": dtype_str,
            "shape": shape,
        }

        del uni_top_idx
        gc.collect()
    except BaseException:
        # Do not leave orphaned segments behind on failure.
        for created in shm_objects:
            created.close()
            created.unlink()
        raise

    return shm_info, shm_objects


# ==================================================================
# Worker process: global state and initialization
# ==================================================================

# Global state in each worker process (set by _worker_init)
# All three stay None until _worker_init() has run in that process;
# _worker_compress_with_table() reads the tables and the tokenizer.
_worker_tables = None       # list of reconstructed table dicts
_worker_tokenizer = None    # tokenizer instance for this worker
_worker_shm_refs = None     # SharedMemory refs (keep alive in worker)


def _worker_init(all_table_shm_info, tokenizer_name):
    """Prepare per-process state for a compression worker.

    For every table description, attaches to each named shared memory
    segment and wraps it in a numpy array of the recorded dtype and
    shape (no copy), then derives the ``uni_top_set`` lookup set.
    Finally loads the tokenizer. Results are stored in the module-level
    ``_worker_tables``, ``_worker_shm_refs`` and ``_worker_tokenizer``.

    Args:
        all_table_shm_info: Iterable of info dicts as produced by
            ``_load_table_to_shared_memory``.
        tokenizer_name: Identifier passed to
            ``AutoTokenizer.from_pretrained``.
    """
    global _worker_tables, _worker_tokenizer, _worker_shm_refs

    # The globals are bound up front; the local aliases below fill the
    # very same list objects.
    _worker_shm_refs = segments = []
    _worker_tables = tables = []

    scalar_keys = ("vocab_size", "tokenizer_name", "table_name")
    for info in all_table_shm_info:
        table = {key: info[key] for key in scalar_keys}

        for arr_name, spec in info["arrays"].items():
            segment = shared_memory.SharedMemory(
                name=spec["shm_name"], create=False)
            # Keep the handle referenced for the life of the worker so
            # the array views below stay backed by a mapped segment.
            segments.append(segment)
            table[arr_name] = np.ndarray(
                spec["shape"],
                dtype=np.dtype(spec["dtype"]),
                buffer=segment.buf,
            )

        # Set form of the top unigram indices, built once per table.
        table["uni_top_set"] = set(table["uni_top_idx"].tolist())
        tables.append(table)

    # One tokenizer instance per worker process.
    _worker_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)


# ==================================================================
# Worker-side adaptive trigram model (stateful per chunk)
# ==================================================================

class _WorkerAdaptiveModel:
    """Lightweight adaptive trigram model for worker processes.

    Uses shared-memory numpy arrays (zero-copy views that this class only
    reads) for the static trigram tables, with per-chunk adaptive counters.
    Produces identical output to AdaptiveTrigramModel from trigram_model.py.

    NOTE: the floating-point operations in get_sparse_cdf() determine the
    integer CDF handed to the arithmetic coder. Reordering them can change
    rounding and therefore the coded stream, so treat the arithmetic as
    order-sensitive.
    """

    def __init__(self, table_dict):
        """Bind the table arrays from ``table_dict`` and reset adaptive state.

        ``table_dict`` is one entry of the per-worker table list built by
        ``_worker_init``; the arrays are referenced, not copied.
        """
        self.vocab_size = table_dict["vocab_size"]
        self.tokenizer_name = table_dict["tokenizer_name"]

        # Shared-memory arrays (views, no copy; never written here)
        self.unigram_probs = table_dict["unigram_probs"]
        self.bi_ctx_keys = table_dict["bi_ctx_keys"]
        self.bi_top_tokens = table_dict["bi_top_tokens"]
        self.bi_top_probs = table_dict["bi_top_probs"]
        self.bi_remaining = table_dict["bi_remaining"]
        self.tri_ctx_keys = table_dict["tri_ctx_keys"]
        self.tri_top_tokens = table_dict["tri_top_tokens"]
        self.tri_top_probs = table_dict["tri_top_probs"]
        self.tri_remaining = table_dict["tri_remaining"]
        self._uni_top_idx = table_dict["uni_top_idx"]
        self._uni_top_set = table_dict["uni_top_set"]

        self.reset()

    def reset(self):
        """Discard all adaptive statistics and the remembered previous tokens."""
        # context token -> Counter of following tokens
        self.adapt_bi = defaultdict(Counter)
        # (prev2, prev1) -> Counter of following tokens
        self.adapt_tri = defaultdict(Counter)
        self.tokens_seen = 0
        self._prev1 = None
        self._prev2 = None

    def update(self, token_id):
        """Record ``token_id`` as the next observed token.

        Increments the adaptive bigram/trigram counters for the contexts
        formed by the previously seen tokens, then shifts the history.
        """
        if self._prev1 is not None:
            self.adapt_bi[self._prev1][token_id] += 1
        if self._prev2 is not None and self._prev1 is not None:
            self.adapt_tri[(self._prev2, self._prev1)][token_id] += 1
        self._prev2 = self._prev1
        self._prev1 = token_id
        self.tokens_seen += 1

    def _lookup_bigram(self, prev1):
        """Return ``(top_tokens, top_probs, remaining_mass)`` for context
        ``prev1``, or None when the context is not in the static table.

        Uses searchsorted, which presumes ``bi_ctx_keys`` is ascending.
        """
        idx = np.searchsorted(self.bi_ctx_keys, prev1)
        if idx < len(self.bi_ctx_keys) and self.bi_ctx_keys[idx] == prev1:
            # Cast small per-context slices to float64 for precision
            return (self.bi_top_tokens[idx],
                    self.bi_top_probs[idx].astype(np.float64),
                    float(self.bi_remaining[idx]))
        return None

    def _lookup_trigram(self, prev2, prev1):
        """Return ``(top_tokens, top_probs, remaining_mass)`` for context
        ``(prev2, prev1)``, or None when it is not in the static table.

        Uses searchsorted, which presumes ``tri_ctx_keys`` is ascending.
        """
        # Pack the context into 32 bits: low 16 bits of prev2 in the high
        # half, low 16 bits of prev1 in the low half.
        key = np.uint32((prev2 & 0xFFFF) << 16 | (prev1 & 0xFFFF))
        idx = np.searchsorted(self.tri_ctx_keys, key)
        if idx < len(self.tri_ctx_keys) and self.tri_ctx_keys[idx] == key:
            # Cast small per-context slices to float64 for precision
            return (self.tri_top_tokens[idx],
                    self.tri_top_probs[idx].astype(np.float64),
                    float(self.tri_remaining[idx]))
        return None

    def get_sparse_cdf(self, context):
        """Return (token_ids, cdf) -- identical semantics to
        AdaptiveTrigramModel.get_sparse_cdf.

        Only the last two entries of ``context`` are consulted.

        Returns:
            token_ids: ascending int64 array of the ``n`` candidate tokens.
            cdf: list of ``n + 2`` integers starting at 0 and ending at
                CDF_TOTAL. Symbols ``0..n-1`` are the candidates in order;
                symbol ``n`` is the "rest" escape for every other token.
        """
        # --- Step 1: Collect candidate token set ---
        # Start from the top unigram tokens, then add every token the
        # static bigram/trigram entries list with non-zero probability.
        candidates = set(self._uni_top_set)

        bi_result = None
        tri_result = None

        if len(context) >= 1:
            prev1 = context[-1]
            bi_result = self._lookup_bigram(prev1)
            if bi_result is not None:
                valid = bi_result[1] > 0
                candidates.update(bi_result[0][valid].tolist())

            if len(context) >= 2:
                prev2 = context[-2]
                tri_result = self._lookup_trigram(prev2, prev1)
                if tri_result is not None:
                    valid = tri_result[1] > 0
                    candidates.update(tri_result[0][valid].tolist())

        # Adaptive tokens
        # The adaptive weight grows linearly with the number of tokens seen
        # and is capped at MAX_ADAPTIVE_WEIGHT.
        lambda_a = min(MAX_ADAPTIVE_WEIGHT,
                       self.tokens_seen / ADAPTIVE_RAMP_TOKENS)
        ada_bi_counter = None
        ada_tri_counter = None
        if lambda_a > 1e-12 and len(context) >= 1:
            prev1 = context[-1]
            # .get() avoids creating empty defaultdict entries on lookup.
            ada_bi_counter = self.adapt_bi.get(prev1)
            if ada_bi_counter:
                candidates.update(ada_bi_counter.keys())
            if len(context) >= 2:
                prev2 = context[-2]
                ada_tri_counter = self.adapt_tri.get((prev2, prev1))
                if ada_tri_counter:
                    candidates.update(ada_tri_counter.keys())

        token_ids = np.array(sorted(candidates), dtype=np.int64)
        n = len(token_ids)

        # --- Step 2: Build probability for each candidate ---
        # Each higher-order estimate starts as the lower-order estimate
        # scaled by the context's remaining mass, and is then overwritten
        # for the tokens the table lists explicitly.
        uni_probs = self.unigram_probs[token_ids]

        if (len(context) >= 2 and tri_result is not None
                and bi_result is not None):
            # Both trigram and bigram contexts found.
            bi_probs = uni_probs * bi_result[2]
            bi_tok = bi_result[0]
            bi_p = bi_result[1]
            bi_valid = bi_p > 0
            if bi_valid.any():
                _map_into(bi_probs, token_ids, bi_tok[bi_valid],
                          bi_p[bi_valid])

            tri_probs = bi_probs * tri_result[2]
            tri_tok = tri_result[0]
            tri_p = tri_result[1]
            tri_valid = tri_p > 0
            if tri_valid.any():
                _map_into(tri_probs, token_ids, tri_tok[tri_valid],
                          tri_p[tri_valid])

            static_probs = (LAMBDA_TRI * tri_probs + LAMBDA_BI * bi_probs
                            + LAMBDA_UNI * uni_probs)

        elif len(context) >= 2 and tri_result is not None:
            # Trigram context found but no bigram entry: the bigram weight
            # is given to the trigram estimate.
            tri_probs = uni_probs * tri_result[2]
            tri_tok = tri_result[0]
            tri_p = tri_result[1]
            tri_valid = tri_p > 0
            if tri_valid.any():
                _map_into(tri_probs, token_ids, tri_tok[tri_valid],
                          tri_p[tri_valid])
            static_probs = ((LAMBDA_TRI + LAMBDA_BI) * tri_probs
                            + LAMBDA_UNI * uni_probs)

        elif bi_result is not None:
            # Only a bigram entry is available.
            bi_probs = uni_probs * bi_result[2]
            bi_tok = bi_result[0]
            bi_p = bi_result[1]
            bi_valid = bi_p > 0
            if bi_valid.any():
                _map_into(bi_probs, token_ids, bi_tok[bi_valid],
                          bi_p[bi_valid])
            static_probs = (LAMBDA_BI_ONLY * bi_probs
                            + LAMBDA_UNI_ONLY * uni_probs)

        else:
            # No usable context: plain unigram probabilities.
            static_probs = uni_probs.copy()

        # --- Step 3: Adaptive mixing ---
        if lambda_a > 1e-12 and (ada_bi_counter or ada_tri_counter):
            ada_bi_dist = None
            if ada_bi_counter:
                ada_bi_dist = _build_adaptive_sparse(
                    ada_bi_counter, token_ids, static_probs)
            ada_tri_dist = None
            if ada_tri_counter:
                ada_tri_dist = _build_adaptive_sparse(
                    ada_tri_counter, token_ids, static_probs)

            if ada_tri_dist is not None and ada_bi_dist is not None:
                adaptive = 0.6 * ada_tri_dist + 0.4 * ada_bi_dist
            elif ada_tri_dist is not None:
                adaptive = ada_tri_dist
            else:
                adaptive = ada_bi_dist

            final_probs = ((1.0 - lambda_a) * static_probs
                           + lambda_a * adaptive)
        else:
            final_probs = static_probs

        # --- Step 4: Rest mass ---
        # Floor every candidate, then give whatever probability is left
        # (at least 1e-10) to the "rest" escape symbol.
        final_probs = np.maximum(final_probs, 1e-10)
        candidate_sum = final_probs.sum()
        rest_mass = max(1e-10, 1.0 - candidate_sum)

        # --- Step 5: Build integer CDF ---
        total_symbols = n + 1
        # Reserve MIN_PROB counts per symbol so none ends up with zero width.
        usable = CDF_TOTAL - total_symbols * MIN_PROB

        all_probs = np.empty(total_symbols, dtype=np.float64)
        all_probs[:n] = final_probs
        all_probs[n] = rest_mass
        all_probs /= all_probs.sum()

        counts = (all_probs * usable).astype(np.int64)
        counts = np.maximum(counts, 0) + MIN_PROB
        # Truncation leaves a residual; put it on the largest bucket so the
        # counts add up to exactly CDF_TOTAL.
        diff = CDF_TOTAL - counts.sum()
        if diff != 0:
            counts[counts.argmax()] += diff

        cdf = np.zeros(total_symbols + 1, dtype=np.int64)
        np.cumsum(counts, out=cdf[1:])
        cdf[-1] = CDF_TOTAL

        return token_ids, cdf.tolist()

    def get_rest_rank(self, token_id, excluded_sorted):
        """Return ``(rest_size, rank)`` for a token outside the candidate set.

        ``rest_size`` is the number of non-excluded token ids (at least 1).
        ``rank`` is ``token_id`` minus the number of excluded ids smaller
        than it, i.e. its position among the non-excluded ids.
        ``excluded_sorted`` must be ascending.
        """
        rest_size = self.vocab_size - len(excluded_sorted)
        if rest_size <= 0:
            rest_size = 1
        # Lower-bound binary search: lo ends as the count of excluded ids
        # strictly below token_id.
        lo, hi = 0, len(excluded_sorted)
        while lo < hi:
            mid = (lo + hi) >> 1
            if excluded_sorted[mid] < token_id:
                lo = mid + 1
            else:
                hi = mid
        rank = token_id - lo
        return rest_size, rank


def _map_into(target, target_tokens, src_tokens, src_probs):
    """Set target[i] = src_probs[j] where target_tokens[i] == src_tokens[j]."""
    idx = np.searchsorted(target_tokens, src_tokens)
    valid = (idx < len(target_tokens)) & (target_tokens[idx] == src_tokens)
    target[idx[valid]] = src_probs[valid]


def _build_adaptive_sparse(counter, token_ids, static_probs):
    """Build adaptive distribution over sparse token_ids from a Counter."""
    n = len(token_ids)
    dist = static_probs.copy()

    if not counter:
        return dist

    obs_tokens = np.array(list(counter.keys()), dtype=np.int64)
    obs_counts = np.array(list(counter.values()), dtype=np.float64)
    total = obs_counts.sum()
    denom = total + len(obs_tokens) + 1.0
    smoothed = (obs_counts + 1.0) / denom
    remaining_frac = 1.0 / denom

    dist *= remaining_frac

    idx = np.searchsorted(token_ids, obs_tokens)
    valid = (idx < n) & (token_ids[idx] == obs_tokens)
    dist[idx[valid]] = smoothed[valid]

    s = dist.sum()
    if s > 0:
        dist /= s
    return dist


# ==================================================================
# Worker-side compression functions (run in child processes)
# ==================================================================

def _worker_compress_with_table(table_idx, chunk_bytes):
    """Compress chunk_bytes using trigram table[table_idx].

    Runs in a worker process. Uses global _worker_tables and
    _worker_tokenizer initialized by _worker_init.

    The payload is a big-endian uint32 token count followed by the
    arithmetic-coded stream (just the count, 0, for an empty token list).

    Returns (table_idx, METHOD_TRIGRAM, compressed_data) or
            (table_idx, None, None) on failure.
    """
    try:
        # A freshly constructed model already starts in the reset state.
        model = _WorkerAdaptiveModel(_worker_tables[table_idx])

        text = chunk_bytes.decode('latin-1')
        token_ids_list = _worker_tokenizer.encode(text)
        num_tokens = len(token_ids_list)
        if num_tokens == 0:
            return (table_idx, METHOD_TRIGRAM, struct.pack('>I', 0))

        encoder = ArithmeticEncoder()
        context = []

        for token_id in token_ids_list:
            sparse_tokens, cdf = model.get_sparse_cdf(context)
            pos = _bisect(sparse_tokens, token_id)

            if pos >= 0:
                encoder.encode_symbol(cdf, pos)
            else:
                # Escape: the symbol after the last candidate means "rest".
                encoder.encode_symbol(cdf, len(sparse_tokens))
                # Only the escape path needs the exclusion list, so it is
                # built here instead of once per token.
                excluded_sorted = sorted(sparse_tokens.tolist())
                rest_size, rank = model.get_rest_rank(
                    token_id, excluded_sorted)
                _encode_uniform(encoder, rank, rest_size)

            model.update(token_id)
            context.append(token_id)

        stream = encoder.finish()
        tri_data = struct.pack('>I', num_tokens) + stream
        return (table_idx, METHOD_TRIGRAM, tri_data)

    except Exception:
        # Deliberate catch-all: any failure is reported through the
        # documented (table_idx, None, None) tuple rather than raised.
        return (table_idx, None, None)


def _worker_compress_with_lzma(chunk_bytes):
    """LZMA-compress ``chunk_bytes`` with the library defaults.

    Runs in a worker process. The leading -1 fills the table-index slot
    of the result tuple.

    Returns (-1, METHOD_LZMA, compressed_data).
    """
    compressed = lzma.compress(chunk_bytes)
    return (-1, METHOD_LZMA, compressed)


# ==================================================================
# Main compressor class (v7.6: true parallel multiprocessing)
# ==================================================================

class TrigramCompressor:
    """Multi-table compressor with TRUE PARALLEL multiprocessing.

    Architecture:
    1. Loads all trigram tables into shared memory (loaded ONCE, zero-copy)
    2. Creates ProcessPoolExecutor with N workers (bypasses GIL)
       N = min(cpu_count, num_tables + 1)
    3. Two competing plans built simultaneously:
       a) Individual plan: per-chunk best of trigram/lzma (dynamic chunk sizing)
       b) Full-file XZ: one contiguous lzma.compress(entire file) in background
       Winner (smallest total) is emitted
    4. Decompression uses the main process (sequential by nature)
    """

    def __init__(self, table_path=None, trigrams_dir=None, verbose=True):
        """Initialize the compressor.

        Discovers the trigram tables, loads them into shared memory, loads
        the tokenizer, builds the main-process models on top of the shared
        segments and starts (and warms up) the worker pool. If any of these
        steps fails, everything acquired so far is released through
        shutdown() before the exception propagates.

        Args:
            table_path: Path to a single trigram table (backward compat /
                        used for decompression of NC03 / TC01). Only used
                        when no table is found in trigrams_dir.
            trigrams_dir: Path to directory containing multiple .npz tables.
                          If None, defaults to trigrams/ next to this file.
            verbose: Print progress information.

        Raises:
            ValueError: If no trigram table is available, or if the loaded
                tables do not all name the same tokenizer.
        """
        self.verbose = verbose

        # Resource holders exist before anything can fail, so shutdown()
        # is always callable, even on a partially constructed instance.
        self._pool = None
        self._shm_objects = []  # owner refs; keep alive to prevent GC
        self._main_shm_refs = []  # main-process attachments; keep alive
        self._all_table_shm_info = []
        self.models = []

        # Discover tables
        if trigrams_dir is None:
            trigrams_dir = os.path.join(
                os.path.dirname(os.path.abspath(__file__)), "trigrams")

        self.table_entries = discover_trigram_tables(trigrams_dir)
        self.table_paths = [t[1] for t in self.table_entries]
        self.table_names = [t[0] for t in self.table_entries]

        # If no tables found in directory, fall back to single table
        if not self.table_entries and table_path:
            name = os.path.splitext(os.path.basename(table_path))[0]
            if name.startswith("trigram_"):
                name = name[8:]
            self.table_entries = [(name, os.path.abspath(table_path))]
            self.table_paths = [os.path.abspath(table_path)]
            self.table_names = [name]

        if not self.table_paths:
            raise ValueError(
                "No trigram tables found. Provide --table or put .npz files "
                "in trigrams/ directory.")

        try:
            # ---- Step 1: Load tables into shared memory ----
            if self.verbose:
                print(f"Loading {len(self.table_paths)} tables into shared "
                      f"memory...", file=sys.stderr)

            for i, tp in enumerate(self.table_paths):
                if self.verbose:
                    print(f"  [{i+1}/{len(self.table_paths)}] "
                          f"{self.table_names[i]} ({tp})", file=sys.stderr)
                shm_info, shm_objs = _load_table_to_shared_memory(
                    tp, self.table_names[i], verbose=verbose)
                self._all_table_shm_info.append(shm_info)
                self._shm_objects.extend(shm_objs)

            # All tables must use the same tokenizer
            tokenizer_name = self._all_table_shm_info[0]["tokenizer_name"]
            for tinfo in self._all_table_shm_info[1:]:
                if tinfo["tokenizer_name"] != tokenizer_name:
                    raise ValueError(
                        f"All tables must use same tokenizer. Got "
                        f"{tokenizer_name!r} and "
                        f"{tinfo['tokenizer_name']!r}")

            self._tokenizer_name = tokenizer_name

            # Load tokenizer in main process (for decompression)
            if self.verbose:
                print(f"Loading tokenizer: {tokenizer_name}",
                      file=sys.stderr)
            self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

            # Build main-process decompression models from shared memory
            # (NO extra copy -- reuses the same shared memory segments)
            for tinfo in self._all_table_shm_info:
                table_dict = {
                    "vocab_size": tinfo["vocab_size"],
                    "tokenizer_name": tinfo["tokenizer_name"],
                    "table_name": tinfo["table_name"],
                }
                for arr_name, arr_info in tinfo["arrays"].items():
                    shm = shared_memory.SharedMemory(
                        name=arr_info["shm_name"], create=False)
                    self._main_shm_refs.append(shm)
                    # View over the shared buffer; no data is copied.
                    table_dict[arr_name] = np.ndarray(
                        arr_info["shape"],
                        dtype=np.dtype(arr_info["dtype"]),
                        buffer=shm.buf,
                    )
                table_dict["uni_top_set"] = set(
                    table_dict["uni_top_idx"].tolist())
                self.models.append(_WorkerAdaptiveModel(table_dict))

            # ---- Step 2: Create process pool ----
            num_tables = len(self.table_paths)
            self._num_workers = min(cpu_count(), num_tables + 1)

            if self.verbose:
                print(f"Creating ProcessPoolExecutor: {self._num_workers} "
                      f"workers "
                      f"(cpu_count={cpu_count()}, tables={num_tables})",
                      file=sys.stderr)

            self._pool = ProcessPoolExecutor(
                max_workers=self._num_workers,
                initializer=_worker_init,
                initargs=(self._all_table_shm_info, tokenizer_name),
            )

            # Warm up workers: ensure all have initialized before
            # compression
            if self.verbose:
                print(f"Warming up {self._num_workers} worker processes...",
                      file=sys.stderr)
            warmup_futures = [
                self._pool.submit(_worker_compress_with_lzma, b"warmup")
                for _ in range(self._num_workers)
            ]
            for f in warmup_futures:
                f.result()

            if self.verbose:
                print(f"Ready: {num_tables} tables | "
                      f"Names: {', '.join(self.table_names)} | "
                      f"Workers: {self._num_workers} processes",
                      file=sys.stderr)

            # Table name -> index mapping for NC05
            self._table_name_to_idx = {
                name: idx for idx, name in enumerate(self.table_names)
            }
        except BaseException:
            # Construction failed part-way: do not leave shared-memory
            # segments or worker processes behind. Re-raised unchanged.
            self.shutdown()
            raise

    def shutdown(self):
        """Shutdown the process pool and clean up shared memory."""
        if self._pool is not None:
            self._pool.shutdown(wait=True)
            self._pool = None

        # Close main-process shared memory refs
        for shm in getattr(self, '_main_shm_refs', []):
            try:
                shm.close()
            except Exception:
                pass
        self._main_shm_refs = []

        # Close and unlink shared memory segments (owner refs)
        for shm in self._shm_objects:
            try:
                shm.close()
            except Exception:
                pass
            try:
                shm.unlink()
            except Exception:
                pass
        self._shm_objects = []

    # ---- per-chunk: test all trigram tables in TRUE PARALLEL ----

    def _compress_text_chunk_trigram_only(self, sub_data):
        """Run every trigram table over one text chunk and keep the best.

        One task per table is submitted to the process pool, then all of
        them are awaited. lzma is not tried here; the caller in
        compress_bytes() does that comparison.

        A trigram result is only eligible if it is no larger than the
        chunk itself; among eligible results the smallest wins, and the
        lowest table index wins ties.

        Returns (table_idx, comp_data, winner_name) for the best trigram
        result, or (None, None, None) if no table produced an eligible
        result.
        """
        # Fan out: all tables are submitted before any result is awaited.
        pending = [
            self._pool.submit(_worker_compress_with_table, table_no, sub_data)
            for table_no in range(len(self.table_names))
        ]
        outcomes = [job.result() for job in pending]

        winner_idx = None
        winner_payload = None
        size_to_beat = len(sub_data) + 1  # worse than raw

        for table_no, method, payload in outcomes:
            if method != METHOD_TRIGRAM or payload is None:
                continue  # this table failed in the worker
            if len(payload) < size_to_beat:
                winner_idx = table_no
                winner_payload = payload
                size_to_beat = len(payload)

        if winner_idx is None:
            return (None, None, None)
        return (winner_idx, winner_payload, self.table_names[winner_idx])

    # ---- public API: compress text (TC01, backward compat) ----

    def compress(self, text):
        """Encode a text string as TC01 bytes using the first table.

        Layout: magic, token count (uint32 BE), stream length in bits
        (uint32 BE), then the coded stream. The text is converted to bytes
        with latin-1 before it is handed to the trigram coder.
        """
        if not text:
            return MAGIC_TEXT + struct.pack('>II', 0, 0)

        token_count, coded = _trigram_compress_chunk(
            self.models[0], self.tokenizer, text.encode('latin-1'))
        header = struct.pack('>II', token_count, 8 * len(coded))
        return MAGIC_TEXT + header + coded

    def decompress_text(self, data):
        """Decompress TC01 format -> text string."""
        if len(data) < 12:
            raise ValueError("Data too short")
        magic = data[:4]
        if magic != MAGIC_TEXT:
            raise ValueError(f"Expected TC01, got {magic!r}")
        num_tokens, _ = struct.unpack('>II', data[4:12])
        if num_tokens == 0:
            return ""
        raw = _trigram_decompress_chunk(
            self.models[0], self.tokenizer, data[12:], num_tokens)
        return raw.decode('latin-1')

    # ---- public API: compress bytes (NC05 multi-table) ----

    def compress_bytes(self, data: bytes) -> bytes:
        """Compress raw bytes -> NC05 multi-table format.

        1. Segment into binary vs text regions
        2. Binary regions -> lzma (always)
        3. Two competing plans built simultaneously:
           a) Individual: best of trigram/lzma per text chunk, lzma per binary
           b) Full-file XZ: one contiguous lzma of entire input
           Emits whichever plan produces smaller total output. The
           comparison counts the fixed header every entry carries, so a
           plan made of many small entries is charged for its headers.

        NC05 format:
          [4B] Magic "NC05"
          [4B] Original total size (uint32 BE)
          [2B] Number of tables (uint16 BE)
          Per table:
            [2B] Name length (uint16 BE)
            [NB] Table name (UTF-8)
          [4B] Number of entries (uint32 BE)
          Per entry:
            [1B] Method: 'B' binary/lzma, 'T' trigram, 'L' text/lzma
            [1B] Table index (only meaningful for 'T', 0 otherwise)
            [4B] Original size (uint32 BE)
            [4B] Compressed size (uint32 BE)
            For 'B'/'L': raw compressed data
            For 'T': [4B] token_count (uint32 BE) + stream

        Empty input is encoded as the magic followed by eight zero bytes.

        Args:
            data: Raw input bytes.

        Returns:
            The NC05-encoded bytes.
        """
        total_size = len(data)
        if total_size == 0:
            return MAGIC_CHUNK + struct.pack('>II', 0, 0)

        # Step 1: segment binary vs text
        segments = _segment_chunks(data)

        total_binary = sum(l for t, _, l in segments if t == CHUNK_TYPE_BINARY)
        total_text = sum(l for t, _, l in segments if t == CHUNK_TYPE_TEXT)
        n_bin_segs = sum(1 for t, _, _ in segments if t == CHUNK_TYPE_BINARY)
        n_txt_segs = sum(1 for t, _, _ in segments if t == CHUNK_TYPE_TEXT)

        if self.verbose:
            print(f"Segments: {len(segments)} ({n_bin_segs} binary: "
                  f"{total_binary} bytes, {n_txt_segs} text: "
                  f"{total_text} bytes)", file=sys.stderr)
            print(f"Tables: {len(self.table_names)} "
                  f"({', '.join(self.table_names)}) | "
                  f"Workers: {self._num_workers} processes",
                  file=sys.stderr)

        # Step 2: build individual per-chunk entries AND full-file XZ
        # Strategy: compute best per-chunk compression (trigram vs lzma)
        # for each chunk. Simultaneously, compute one contiguous lzma of
        # the entire file. At the end, emit whichever is smaller.

        # Submit full-file XZ to worker pool (runs in background while
        # we process individual chunks)
        full_xz_future = self._pool.submit(lzma.compress, data)

        individual_entries = []  # (method, table_idx, orig_size, comp_data)
        individual_total_comp = 0  # payload bytes only, headers excluded
        bytes_done = 0
        trigram_wins = 0
        lzma_text_wins = 0
        lzma_bin_wins = 0
        table_win_counts = {name: 0 for name in self.table_names}

        for seg_type, offset, length in segments:
            seg_data = data[offset:offset + length]

            if seg_type == CHUNK_TYPE_BINARY:
                # Binary -> lzma for individual plan
                comp = lzma.compress(seg_data)
                individual_entries.append((METHOD_BINARY, 0, length, comp))
                individual_total_comp += len(comp)
                lzma_bin_wins += 1

                if self.verbose:
                    ratio = len(comp) / length if length > 0 else 0
                    overall = 100 * bytes_done / total_size
                    print(f"  Binary: {length} -> {len(comp)} ({ratio:.1%})"
                          f"  [total: {overall:.1f}%]", file=sys.stderr)
                bytes_done += length

            else:
                # Text -> dynamic chunk sizing, best of trigram/lzma per chunk
                chunk_size = max(2048, min(65536, length // 10))

                for sub_off in range(0, length, chunk_size):
                    sub_end = min(sub_off + chunk_size, length)
                    sub_data = seg_data[sub_off:sub_end]
                    sub_len = len(sub_data)

                    # Progress is reported as of the start of this chunk.
                    overall = 100 * bytes_done / total_size

                    # Test all trigram tables in parallel (workers)
                    tri_idx, tri_data, tri_name = \
                        self._compress_text_chunk_trigram_only(sub_data)

                    # Per-chunk lzma (main process)
                    chunk_lzma = lzma.compress(sub_data)

                    # Pick best individual compression for this chunk.
                    # Both candidates carry the same entry header, so
                    # comparing payload sizes is sufficient here.
                    if tri_data is not None and len(tri_data) <= len(chunk_lzma):
                        individual_entries.append(
                            (METHOD_TRIGRAM, tri_idx, sub_len, tri_data))
                        individual_total_comp += len(tri_data)
                        trigram_wins += 1
                        table_win_counts[tri_name] = \
                            table_win_counts.get(tri_name, 0) + 1
                        tag = f"T:{tri_name}"
                        comp_size = len(tri_data)
                    else:
                        individual_entries.append(
                            (METHOD_LZMA, 0, sub_len, chunk_lzma))
                        individual_total_comp += len(chunk_lzma)
                        lzma_text_wins += 1
                        tag = "L"
                        comp_size = len(chunk_lzma)

                    if self.verbose:
                        ratio = comp_size / sub_len if sub_len > 0 else 0
                        print(f"  Text: {sub_len} -> {comp_size} "
                              f"({tag}, {ratio:.1%})"
                              f"  [total: {overall:.1f}%]",
                              file=sys.stderr)

                    bytes_done += sub_len

        # Collect full-file contiguous XZ result
        full_xz = full_xz_future.result()

        # Emitted size of each plan = payload + one fixed-size header per
        # entry. The file header and table directory are identical for
        # both plans and therefore left out of the comparison.
        entry_header_size = struct.calcsize('>BBII')
        full_total = len(full_xz) + entry_header_size
        individual_total = (individual_total_comp
                            + entry_header_size * len(individual_entries))

        if self.verbose:
            # Sizes reported here include the entry headers.
            full_ratio = full_total / total_size
            ind_ratio = individual_total / total_size
            print(f"  Full-file XZ: {total_size} -> {full_total} "
                  f"({full_ratio:.1%})", file=sys.stderr)
            print(f"  Individual entries: {total_size} -> "
                  f"{individual_total} ({ind_ratio:.1%})",
                  file=sys.stderr)

        # Final decision: full-file XZ vs individual entries
        if full_total <= individual_total:
            entries = [(METHOD_LZMA, 0, total_size, full_xz)]
            if self.verbose:
                saved = individual_total - full_total
                print(f"  Winner: full-file XZ (saves {saved} bytes)",
                      file=sys.stderr)
        else:
            entries = individual_entries
            if self.verbose:
                saved = full_total - individual_total
                print(f"  Winner: individual entries (saves {saved} bytes)",
                      file=sys.stderr)
                print(f"  Breakdown: {lzma_bin_wins} binary(lzma), "
                      f"{trigram_wins} text(trigram), "
                      f"{lzma_text_wins} text(lzma)", file=sys.stderr)
                if trigram_wins > 0:
                    wins_str = ", ".join(
                        f"{name}={cnt}"
                        for name, cnt in table_win_counts.items()
                        if cnt > 0)
                    print(f"  Table wins: {wins_str}", file=sys.stderr)

        # Assemble NC05
        num_entries = len(entries)

        # Header
        header_parts = [MAGIC_CHUNK, struct.pack('>I', total_size)]

        # Table directory
        n_tables = len(self.table_names)
        header_parts.append(struct.pack('>H', n_tables))
        for name in self.table_names:
            name_bytes = name.encode('utf-8')
            header_parts.append(struct.pack('>H', len(name_bytes)))
            header_parts.append(name_bytes)

        # Entry count
        header_parts.append(struct.pack('>I', num_entries))

        # Entries
        for method, table_idx, orig_size, comp_data in entries:
            header_parts.append(struct.pack('>BBII', method, table_idx,
                                            orig_size, len(comp_data)))
            header_parts.append(comp_data)

        return b''.join(header_parts)

    # ---- decompression: NC05 ----

    def _decompress_nc05(self, data: bytes) -> bytes:
        """Decompress NC05 multi-table format -> raw bytes."""
        if len(data) < 10:
            raise ValueError("NC05 data too short")

        pos = 4  # skip magic
        total_size = struct.unpack('>I', data[pos:pos + 4])[0]
        pos += 4

        if total_size == 0:
            return b""

        # Read table directory
        n_tables = struct.unpack('>H', data[pos:pos + 2])[0]
        pos += 2
        file_table_names = []
        for _ in range(n_tables):
            name_len = struct.unpack('>H', data[pos:pos + 2])[0]
            pos += 2
            name = data[pos:pos + name_len].decode('utf-8')
            pos += name_len
            file_table_names.append(name)

        # Map file table indices to our loaded model indices
        table_map = {}
        for fi, fname in enumerate(file_table_names):
            if fname in self._table_name_to_idx:
                table_map[fi] = self._table_name_to_idx[fname]
            else:
                raise ValueError(
                    f"Compressed file requires table '{fname}' which is not "
                    f"loaded. Available: {', '.join(self.table_names)}")

        # Read entries
        num_entries = struct.unpack('>I', data[pos:pos + 4])[0]
        pos += 4

        output_parts = []
        bytes_done = 0

        for ci in range(num_entries):
            method, file_table_idx, orig_size, comp_size = struct.unpack(
                '>BBII', data[pos:pos + 10])
            pos += 10
            comp_data = data[pos:pos + comp_size]
            pos += comp_size

            if method == METHOD_BINARY:
                mname = "B"
            elif method == METHOD_TRIGRAM:
                tname = file_table_names[file_table_idx]
                mname = f"T:{tname}"
            else:
                mname = "L"

            if self.verbose:
                overall = 100 * bytes_done / total_size if total_size else 0
                print(f"\r  Chunk {ci+1}/{num_entries}: {comp_size} -> "
                      f"{orig_size} ({mname})  [total: {overall:.1f}%]",
                      end="", file=sys.stderr)

            if method == METHOD_BINARY or method == METHOD_LZMA:
                output_parts.append(lzma.decompress(comp_data))
            elif method == METHOD_TRIGRAM:
                model_idx = table_map[file_table_idx]
                num_tokens = struct.unpack('>I', comp_data[:4])[0]
                stream = comp_data[4:]
                output_parts.append(
                    _trigram_decompress_chunk(
                        self.models[model_idx], self.tokenizer,
                        stream, num_tokens))
            else:
                raise ValueError(f"Unknown method: {method:#x}")

            bytes_done += orig_size

        if self.verbose:
            print(f"\r  Done: {num_entries} chunks, {total_size} bytes"
                  f"  [total: 100.0%]       ", file=sys.stderr)
            print(file=sys.stderr)

        return b''.join(output_parts)

    # ---- decompression: NC03 (backward compat) ----

    def _decompress_nc03(self, data: bytes) -> bytes:
        """Decompress NC03 format -> raw bytes (backward compat)."""
        if len(data) < 12:
            raise ValueError("NC03 data too short")

        total_size, num_entries = struct.unpack('>II', data[4:12])
        if num_entries == 0:
            return b""

        pos = 12
        output_parts = []
        bytes_done = 0

        for ci in range(num_entries):
            method, orig_size, comp_size = struct.unpack(
                '>BII', data[pos:pos + 9])
            pos += 9
            comp_data = data[pos:pos + comp_size]
            pos += comp_size

            if method == METHOD_BINARY:
                mname = "B"
            elif method == METHOD_TRIGRAM:
                mname = "T"
            else:
                mname = "L"

            if self.verbose:
                overall = 100 * bytes_done / total_size if total_size else 0
                print(f"\r  Chunk {ci+1}/{num_entries}: {comp_size} -> "
                      f"{orig_size} ({mname})  [total: {overall:.1f}%]",
                      end="", file=sys.stderr)

            if method == METHOD_BINARY or method == METHOD_LZMA:
                output_parts.append(lzma.decompress(comp_data))
            elif method == METHOD_TRIGRAM:
                num_tokens = struct.unpack('>I', comp_data[:4])[0]
                stream = comp_data[4:]
                # Use first model (NC03 only had one table)
                output_parts.append(
                    _trigram_decompress_chunk(
                        self.models[0], self.tokenizer,
                        stream, num_tokens))
            else:
                raise ValueError(f"Unknown method: {method:#x}")

            bytes_done += orig_size

        if self.verbose:
            print(f"\r  Done: {num_entries} chunks, {total_size} bytes"
                  f"  [total: 100.0%]       ", file=sys.stderr)
            print(file=sys.stderr)

        return b''.join(output_parts)

    # ---- decompression: auto-detect ----

    def decompress_bytes(self, data: bytes) -> bytes:
        """Decode a chunked container (NC05 or legacy NC03) to raw bytes.

        The format is chosen from the 4-byte magic; any other magic raises
        ValueError.
        """
        magic = data[:4]
        if magic == MAGIC_CHUNK:
            return self._decompress_nc05(data)
        if magic == MAGIC_NC03:
            return self._decompress_nc03(data)
        raise ValueError(f"Expected NC03 or NC05, got {magic!r}")

    # ---- unified API ----

    def decompress(self, data: bytes):
        """Decode any supported container, chosen by its 4-byte magic.

        TC01 input yields a text string; NC05 and NC03 input yield raw
        bytes. An unrecognized magic raises ValueError.
        """
        magic = data[:4]
        if magic == MAGIC_TEXT:
            return self.decompress_text(data)
        if magic == MAGIC_CHUNK:
            return self._decompress_nc05(data)
        if magic == MAGIC_NC03:
            return self._decompress_nc03(data)
        raise ValueError(f"Unknown format magic: {magic!r}")