Spaces:

MihaiPopa-1
/

CinnabarCompress-CPU

Sleeping

App Files Files Community

MihaiPopa-1 committed 8 days ago

Commit

a64d7cf

verified ·

1 Parent(s): 826146f

Upload cinnabar_compress.py

Browse files

Files changed (1) hide show

cinnabar_compress.py +51 -35

cinnabar_compress.py CHANGED Viewed

@@ -190,42 +190,64 @@ def load_model(tag: str):
     print(f"  Loading {MODEL_NAMES[tag]} from {repo} …", flush=True)
     from transformers import AutoTokenizer, AutoModelForCausalLM
-    tok   = AutoTokenizer.from_pretrained(repo)
-    model = AutoModelForCausalLM.from_pretrained(repo, torch_dtype=torch.float32)
     model.eval()
     _model_cache[tag] = (tok, model)
     print(f"  {MODEL_NAMES[tag]} loaded.", flush=True)
     return tok, model
 # ─────────────────────────────────────────────────────────────────────────────
-# Probability helpers  (KV-cache enabled)
 # ─────────────────────────────────────────────────────────────────────────────
 SCALE = (1 << 16)  # cumulative frequency scale for arithmetic coder
-def _prefill(model, token_ids: list):
     """
-    Run the model on a prompt and return (probs, past_key_values).
-    Called once at the start to prime the KV cache.
     """
     inp = torch.tensor([token_ids], dtype=torch.long)
     with torch.no_grad():
         out = model(inp, use_cache=True)
-    probs = F.softmax(out.logits[0, -1, :], dim=-1)
     return probs, out.past_key_values
-def _get_probs_cached(model, token_id: int, past_key_values):
-    """
-    Run ONE new token through the model, reusing past_key_values.
-    Returns (probs, updated_past_key_values).
-    O(1) in context length — this is the KV-cache speedup.
-    """
     inp = torch.tensor([[token_id]], dtype=torch.long)
     with torch.no_grad():
         out = model(inp, past_key_values=past_key_values, use_cache=True)
-    probs = F.softmax(out.logits[0, -1, :], dim=-1)
     return probs, out.past_key_values
@@ -280,36 +302,31 @@ def _probs_to_cum_freqs(probs: torch.Tensor):
 def encode(text: bytes, tag: str = "4", verbose: bool = True) -> bytes:
     tok, model = load_model(tag)
-    # Tokenise input text
     token_ids = tok.encode(text.decode("utf-8", errors="replace"))
     n_tokens  = len(token_ids)
-    if verbose:
-        print(f"  Tokens: {n_tokens}")
-    # Use EOS as the stream terminator; fall back to 0 if undefined
     eos = tok.eos_token_id if tok.eos_token_id is not None else 0
     bos = tok.bos_token_id if tok.bos_token_id is not None else 0
-    enc = ArithmeticEncoder()
-    # Prefill with BOS → get P(first token | BOS)
-    probs, past = _prefill(model, [bos])
     for step, tid in enumerate(token_ids):
-        cum, total = _probs_to_cum_freqs(probs)
         lo, hi     = cum[tid]
         enc.encode_symbol(lo, hi, total)
-        probs, past = _get_probs_cached(model, tid, past)
-        if verbose and (step % 50 == 0 or step == n_tokens - 1):
-            print(f"\r  Encoding token {step+1}/{n_tokens} …", end="", flush=True)
-    # Encode EOS sentinel so the decoder knows exactly where to stop
-    cum, total = _probs_to_cum_freqs(probs)
-    lo, hi = cum[eos]
-    enc.encode_symbol(lo, hi, total)
     if verbose:
-        print()
     enc.flush()
     compressed = enc.get_bytes()
@@ -344,20 +361,19 @@ def decode(data: bytes, verbose: bool = True) -> bytes:
     dec     = ArithmeticDecoder(compressed)
     out_ids = []
-    # Prefill with BOS → get P(first token | BOS), matching encode exactly
-    probs, past = _prefill(model, [bos])
     step = 0
     while True:
         cum, total = _probs_to_cum_freqs(probs)
         sym        = dec.decode_symbol(cum, total)
-        # EOS sentinel = end of stream
         if sym == eos:
             break
         out_ids.append(sym)
-        probs, past = _get_probs_cached(model, sym, past)
         step += 1
         if verbose and (step % 50 == 0):

     print(f"  Loading {MODEL_NAMES[tag]} from {repo} …", flush=True)
     from transformers import AutoTokenizer, AutoModelForCausalLM
+    tok = AutoTokenizer.from_pretrained(repo)
+    # bfloat16 weights take half the bytes of float32, so less memory is read per
+    # forward pass; fall back to float32 if loading or the test forward pass fails
+    try:
+        model = AutoModelForCausalLM.from_pretrained(repo, torch_dtype=torch.bfloat16)
+        # Verify bfloat16 actually works with a tiny test forward pass
+        test = torch.zeros(1, 1, dtype=torch.long)
+        model(test)
+    except Exception:
+        model = AutoModelForCausalLM.from_pretrained(repo, torch_dtype=torch.float32)
     model.eval()
+    # torch.compile speeds up the repeated single-token decode loop
+    try:
+        model = torch.compile(model, mode="reduce-overhead")
+    except Exception:
+        pass
     _model_cache[tag] = (tok, model)
     print(f"  {MODEL_NAMES[tag]} loaded.", flush=True)
     return tok, model
 # ─────────────────────────────────────────────────────────────────────────────
+# Probability helpers
 # ─────────────────────────────────────────────────────────────────────────────
 SCALE = (1 << 16)  # cumulative frequency scale for arithmetic coder
+def _all_probs_batched(model, bos: int, token_ids: list) -> list:
     """
+    ENCODE fast-path: one forward pass over [BOS, t0, t1, ..., t_n-1].
+    Returns n+1 float32 probability tensors (one per position).
+    This makes a single model call instead of one model call per token.
     """
+    inp = torch.tensor([[bos] + token_ids], dtype=torch.long)
+    with torch.no_grad():
+        logits = model(inp).logits[0]          # [n+1, vocab]
+    return [F.softmax(logits[i].float(), dim=-1) for i in range(logits.shape[0])]
+def _prefill_cached(model, token_ids: list):
+    """Prime the KV cache with a prompt. Returns (last_probs, past_key_values)."""
     inp = torch.tensor([token_ids], dtype=torch.long)
     with torch.no_grad():
         out = model(inp, use_cache=True)
+    probs = F.softmax(out.logits[0, -1, :].float(), dim=-1)
     return probs, out.past_key_values
+def _step_cached(model, token_id: int, past_key_values):
+    """One autoregressive decode step: feeds only the new token and reuses the KV cache."""
     inp = torch.tensor([[token_id]], dtype=torch.long)
     with torch.no_grad():
         out = model(inp, past_key_values=past_key_values, use_cache=True)
+    probs = F.softmax(out.logits[0, -1, :].float(), dim=-1)
     return probs, out.past_key_values
 def encode(text: bytes, tag: str = "4", verbose: bool = True) -> bytes:
     tok, model = load_model(tag)
     token_ids = tok.encode(text.decode("utf-8", errors="replace"))
     n_tokens  = len(token_ids)
     eos = tok.eos_token_id if tok.eos_token_id is not None else 0
     bos = tok.bos_token_id if tok.bos_token_id is not None else 0
+    if verbose:
+        print(f"  Tokens: {n_tokens} — running single batched forward pass…", flush=True)
+    # ONE forward pass gives all n+1 probability distributions at once.
+    # probs_list[i] = P(next | BOS, t0..t_{i-1})
+    # probs_list[n] = P(next | BOS, t0..t_{n-1})  ← used to encode EOS
+    probs_list = _all_probs_batched(model, bos, token_ids)
+    enc = ArithmeticEncoder()
     for step, tid in enumerate(token_ids):
+        cum, total = _probs_to_cum_freqs(probs_list[step])
         lo, hi     = cum[tid]
         enc.encode_symbol(lo, hi, total)
+    # EOS sentinel
+    cum, total = _probs_to_cum_freqs(probs_list[n_tokens])
+    enc.encode_symbol(*cum[eos], total)
     if verbose:
+        print(f"  Done.", flush=True)
     enc.flush()
     compressed = enc.get_bytes()
     dec     = ArithmeticDecoder(compressed)
     out_ids = []
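+    # Prime KV cache with BOS → get P(first token | BOS)
+    # NOTE(review): encode takes its distributions from one batched forward pass,
+    # while decode recomputes them step by step from the KV cache. The arithmetic
+    # decoder only stays in sync if both paths yield identical cumulative
+    # frequencies — confirm this holds (especially in bfloat16 / under torch.compile).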
+    probs, past = _prefill_cached(model, [bos])
     step = 0
     while True:
         cum, total = _probs_to_cum_freqs(probs)
         sym        = dec.decode_symbol(cum, total)
         if sym == eos:
             break
         out_ids.append(sym)
+        probs, past = _step_cached(model, sym, past)
         step += 1
         if verbose and (step % 50 == 0):