cstr committed on
Commit
b19c63d
·
verified ·
1 Parent(s): b78d236

Upload export_zerank_v2.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. export_zerank_v2.py +216 -0
export_zerank_v2.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""
Re-export zerank-1-small with dynamic batch support.

Key change from v1: ZeRankScorerV2 builds the 4D causal+padding attention mask
explicitly using input_ids.shape[0] (dynamic). This makes the batch dimension
symbolic in the ONNX graph — batch > 1 works correctly.

Also bakes the Qwen3 chat template into the expected input format:
"<|im_start|>user\\nQuery: {q}\\nDocument: {d}\\nRelevant:<|im_end|>\\n<|im_start|>assistant\\n"

Tokenize the formatted string as a SINGLE sequence (not a pair) in fastembed.

Output:
/private/tmp/zerank_export/zerank_onnx_v2/model.onnx + model.onnx_data (FP16)
(INT8/INT4 re-quantization: run stream_int8.py and export_int4.py after this)
"""

# stdlib
import gc
from pathlib import Path

# third-party
import numpy as np
import torch
import torch.nn as nn

# Checkpoint to re-export, and the vocab id whose logit is read as the
# relevance score.
MODEL_ID = "zeroentropy/zerank-1-small"
# NOTE(review): presumably the id of the "Yes" token in the Qwen3 tokenizer —
# confirm against tokenizer.convert_tokens_to_ids before changing models.
YES_TOKEN_ID = 9454

# Export destination; created eagerly at import time.
OUT_DIR = Path("/private/tmp/zerank_export/zerank_onnx_v2")
OUT_MODEL = OUT_DIR / "model.onnx"
OUT_DIR.mkdir(parents=True, exist_ok=True)
class ZeRankScorerV2(nn.Module):
    """
    Wraps Qwen3ForCausalLM + last-token Yes-logit extraction.

    Difference from V1: builds the 4D causal+padding mask explicitly so the
    batch dimension is dynamic in the ONNX graph (V1 had it hardcoded to 1).

    Input:
        input_ids      [batch, seq] — pre-formatted with chat template
        attention_mask [batch, seq] — 1 for real tokens, 0 for padding

    Output:
        logits [batch, 1] — raw Yes-token logit, higher = more relevant
    """

    def __init__(self, base_model, yes_token_id: int):
        super().__init__()
        self.base = base_model
        self.yes_token_id = yes_token_id
        # Mask fill value must match the model's compute dtype (fp16 here).
        self._dtype = next(base_model.parameters()).dtype

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor):
        batch_size = input_ids.shape[0]
        seq_len = input_ids.shape[1]
        device = input_ids.device
        min_val = torch.finfo(self._dtype).min

        # A (query, key) slot is blocked when it is in the causal upper
        # triangle OR the key token is padding. Combine the two boolean masks
        # and apply a single masked_fill. This replaces the previous
        # `causal + pad` addition: wherever both masks contributed min_val,
        # the fp16 sum (-65504 + -65504) overflowed to -inf, and -inf mask
        # entries are fragile under downstream INT8/INT4 quantization.
        causal_blocked = torch.ones(
            seq_len, seq_len, dtype=torch.bool, device=device
        ).triu(diagonal=1)  # [seq, seq]
        key_is_pad = attention_mask.to(torch.bool).logical_not()  # [batch, seq]
        blocked = causal_blocked.view(1, 1, seq_len, seq_len) | key_is_pad.view(
            batch_size, 1, 1, seq_len
        )  # broadcast -> [batch, 1, seq, seq]

        full_mask = torch.zeros(
            batch_size, 1, seq_len, seq_len, dtype=self._dtype, device=device
        ).masked_fill(blocked, min_val)

        # Transformer body -> [batch, seq, hidden]
        hidden = self.base.model(
            input_ids=input_ids,
            attention_mask=full_mask,
        )[0]

        # Gather the hidden state at the last REAL token: index sum(mask) - 1.
        # NOTE(review): this indexing assumes right-padding; with a
        # left-padding tokenizer the index would point at padding — confirm
        # the tokenizer's padding_side at call sites.
        last_pos = attention_mask.sum(dim=-1) - 1  # [batch]
        idx = last_pos.view(-1, 1, 1).expand(-1, 1, hidden.shape[-1])
        last_hidden = torch.gather(hidden, 1, idx).squeeze(1)  # [batch, hidden]

        # Project only the Yes-token logit; higher = more relevant.
        yes_logit = self.base.lm_head(last_hidden)[:, self.yes_token_id]  # [batch]
        return yes_logit.unsqueeze(-1)  # [batch, 1]
88
def run_export():
    """Load the checkpoint, trace ZeRankScorerV2 on a batch-2 dummy input,
    and export FP16 ONNX with dynamic batch/sequence axes.

    Weights are then rewritten into a single external-data file
    (model.onnx_data) next to OUT_MODEL.
    """
    from transformers import Qwen3ForCausalLM, AutoTokenizer
    import torch.onnx as torch_onnx

    print(f"Loading {MODEL_ID}...")
    lm = Qwen3ForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        attn_implementation="eager",  # NOTE(review): presumably fused attention kernels do not trace — confirm
    ).eval()

    wrapper = ZeRankScorerV2(lm, YES_TOKEN_ID).eval()

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    # Dummy batch=2 — forces dynamic batch to trace correctly
    template = "<|im_start|>user\nQuery: {q}\nDocument: {d}\nRelevant:<|im_end|>\n<|im_start|>assistant\n"
    sample_pairs = [
        ("what is a panda?", "A panda is a large black-and-white bear."),
        ("what is a cat?", "A cat is a small domesticated carnivorous mammal."),
    ]
    rendered = [template.format(q=query, d=doc) for query, doc in sample_pairs]
    batch = tokenizer(rendered, padding=True, truncation=True, max_length=64, return_tensors="pt")
    sample_ids, sample_mask = batch["input_ids"], batch["attention_mask"]
    print(f" Dummy batch shape: {sample_ids.shape}")

    # Verify correct batch behaviour before exporting
    with torch.no_grad():
        scores_b2 = wrapper(sample_ids, sample_mask)
        scores_b1 = wrapper(sample_ids[:1], sample_mask[:1])
    assert abs(float(scores_b2[0, 0]) - float(scores_b1[0, 0])) < 0.01, \
        f"Batch/single mismatch: {float(scores_b2[0,0]):.3f} vs {float(scores_b1[0,0]):.3f}"
    print(f" Batch consistency check PASS: {float(scores_b2[0,0]):.3f} vs {float(scores_b1[0,0]):.3f}")

    print(f"Exporting to {OUT_MODEL} ...")
    with torch.no_grad():
        torch_onnx.export(
            wrapper,
            (sample_ids, sample_mask),
            str(OUT_MODEL),
            input_names=["input_ids", "attention_mask"],
            output_names=["logits"],
            dynamic_axes={
                "input_ids": {0: "batch_size", 1: "sequence_length"},
                "attention_mask": {0: "batch_size", 1: "sequence_length"},
                "logits": {0: "batch_size"},
            },
            opset_version=18,
            do_constant_folding=False,
        )

    import onnx
    from onnx.external_data_helper import convert_model_to_external_data
    print(" Converting to external data format...")
    exported = onnx.load(str(OUT_MODEL))
    convert_model_to_external_data(
        exported, all_tensors_to_one_file=True,
        location="model.onnx_data", size_threshold=1024,
    )
    onnx.save(exported, str(OUT_MODEL))
    print("Export complete:")
    for artifact in sorted(OUT_DIR.iterdir()):
        print(f" {artifact.name:40s} {artifact.stat().st_size / 1e6:.0f} MB")

    # Drop the large objects before the caller continues (export is memory-heavy).
    del exported, wrapper, lm, tokenizer, batch, sample_ids, sample_mask
    gc.collect()
158
def verify_batch():
    """Sanity-check the exported ONNX model.

    Batched scores must agree with one-at-a-time scores (the bug V2 fixes),
    and the relevant document must outrank the irrelevant one.
    """
    import onnxruntime as ort

    print(f"\nVerifying batch > 1...")
    sess = ort.InferenceSession(str(OUT_MODEL), providers=["CPUExecutionProvider"])

    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    template = "<|im_start|>user\nQuery: {q}\nDocument: {d}\nRelevant:<|im_end|>\n<|im_start|>assistant\n"

    query = "what is a panda?"
    documents = [
        "The giant panda is a bear species endemic to China.",
        "The sky is blue.",
        "panda is an animal",
    ]

    def _run(ids, mask):
        # One ONNX forward pass; returns the [batch, 1] logits array.
        return sess.run(["logits"], {
            "input_ids": ids.astype(np.int64),
            "attention_mask": mask.astype(np.int64),
        })[0]

    # Single inference
    single_scores = []
    for doc in documents:
        enc = tokenizer(template.format(q=query, d=doc),
                        return_tensors="np", truncation=True, max_length=256)
        single_scores.append(float(_run(enc["input_ids"], enc["attention_mask"])[0, 0]))

    # Batch inference
    enc = tokenizer([template.format(q=query, d=doc) for doc in documents],
                    return_tensors="np", truncation=True, max_length=256, padding=True)
    batched = _run(enc["input_ids"], enc["attention_mask"])
    batch_scores = [float(batched[i, 0]) for i in range(len(documents))]

    print(" Single vs batch scores:")
    for doc, s_one, s_batch in zip(documents, single_scores, batch_scores):
        gap = abs(s_one - s_batch)
        print(f" [{s_one:.3f} vs {s_batch:.3f}] diff={gap:.4f} | {doc[:50]}")
        assert gap < 0.1, f"Mismatch too large: {gap}"
    assert batch_scores[0] > batch_scores[1], "Panda should rank higher than sky"
    print(" OK — batch scores match single, correct ranking")
204
if __name__ == "__main__":
    # Export only when no model is present; delete OUT_MODEL to force re-export.
    if not OUT_MODEL.exists():
        run_export()
        gc.collect()
    else:
        print(f"Model already exists at {OUT_MODEL}, skipping export.")
        print("Delete it to re-export.")

    verify_batch()

    print("\nNext steps:")
    print(f" 1. Run stream_int8_v2.py to quantize INT8 from {OUT_MODEL}")
    print(f" 2. Upload to HF: huggingface-cli upload cstr/zerank-1-small-ONNX {OUT_DIR}/ . --repo-type model")