zerank-1-small-ONNX / export_zerank_v2.py

Upload export_zerank_v2.py with huggingface_hub

b19c63d verified 23 days ago

8.32 kB

	#!/usr/bin/env python3
	"""
	Re-export zerank-1-small with dynamic batch support.

	Key change from v1: ZeRankScorerV2 builds the 4D causal+padding attention mask
	explicitly using input_ids.shape[0] (dynamic). This makes the batch dimension
	symbolic in the ONNX graph — batch > 1 works correctly.

	Also bakes the Qwen3 chat template into the expected input format:
	"<\|im_start\|>user\\nQuery: {q}\\nDocument: {d}\\nRelevant:<\|im_end\|>\\n<\|im_start\|>assistant\\n"

	Tokenize the formatted string as a SINGLE sequence (not a pair) in fastembed.

	Output:
	/private/tmp/zerank_export/zerank_onnx_v2/model.onnx + model.onnx_data (FP16)
	(INT8/INT4 re-quantization: run stream_int8.py and export_int4.py after this)
	"""

	import gc
	from pathlib import Path
	import numpy as np
	import torch
	import torch.nn as nn

	MODEL_ID = "zeroentropy/zerank-1-small"
	YES_TOKEN_ID = 9454

	OUT_DIR = Path("/private/tmp/zerank_export/zerank_onnx_v2")
	OUT_MODEL = OUT_DIR / "model.onnx"
	OUT_DIR.mkdir(parents=True, exist_ok=True)


	class ZeRankScorerV2(nn.Module):
	"""
	Wraps Qwen3ForCausalLM + last-token Yes-logit extraction.

	Difference from V1: builds 4D causal+padding mask explicitly so the batch
	dimension is dynamic in the ONNX graph (V1 had it hardcoded to 1).

	Input:
	input_ids [batch, seq] — pre-formatted with chat template
	attention_mask [batch, seq] — 1 for real tokens, 0 for padding

	Output:
	logits [batch, 1] — raw Yes-token logit, higher = more relevant
	"""
	def __init__(self, base_model, yes_token_id: int):
	super().__init__()
	self.base = base_model
	self.yes_token_id = yes_token_id
	self._dtype = next(base_model.parameters()).dtype

	def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor):
	batch_size = input_ids.shape[0]
	seq_len = input_ids.shape[1]
	device = input_ids.device
	min_val = torch.finfo(self._dtype).min

	# Causal mask: upper-triangular = min_val, lower-triangular = 0
	# Shape [1, 1, seq, seq] → expand to [batch, 1, seq, seq]
	upper = torch.ones(seq_len, seq_len, dtype=torch.bool, device=device).triu(diagonal=1)
	causal = torch.zeros(1, 1, seq_len, seq_len, dtype=self._dtype, device=device)
	causal = causal.masked_fill(upper.view(1, 1, seq_len, seq_len), min_val)
	causal = causal.expand(batch_size, 1, seq_len, seq_len)

	# Padding mask: positions with attention_mask=0 get min_val
	pad = (1.0 - attention_mask.to(self._dtype)) * min_val # [batch, seq]
	pad = pad.unsqueeze(1).unsqueeze(2) # [batch, 1, 1, seq]
	pad = pad.expand(batch_size, 1, seq_len, seq_len)

	full_mask = causal + pad

	# Transformer body → [batch, seq, hidden]
	hidden = self.base.model(
	input_ids=input_ids,
	attention_mask=full_mask,
	)[0]

	# Gather at last real-token position: sum(mask) - 1
	last_pos = attention_mask.sum(dim=-1) - 1 # [batch]
	idx = last_pos.view(-1, 1, 1).expand(-1, 1, hidden.shape[-1])
	last_hidden = torch.gather(hidden, 1, idx).squeeze(1) # [batch, hidden]

	yes_logit = self.base.lm_head(last_hidden)[:, self.yes_token_id] # [batch]
	return yes_logit.unsqueeze(-1) # [batch, 1]


	def run_export():
	from transformers import Qwen3ForCausalLM, AutoTokenizer
	import torch.onnx as torch_onnx

	print(f"Loading {MODEL_ID}...")
	model = Qwen3ForCausalLM.from_pretrained(
	MODEL_ID,
	torch_dtype=torch.float16,
	low_cpu_mem_usage=True,
	attn_implementation="eager",
	).eval()

	scorer = ZeRankScorerV2(model, YES_TOKEN_ID).eval()

	tok = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

	# Dummy batch=2 — forces dynamic batch to trace correctly
	template = "<\|im_start\|>user\nQuery: {q}\nDocument: {d}\nRelevant:<\|im_end\|>\n<\|im_start\|>assistant\n"
	pairs = [
	("what is a panda?", "A panda is a large black-and-white bear."),
	("what is a cat?", "A cat is a small domesticated carnivorous mammal."),
	]
	formatted = [template.format(q=q, d=d) for q, d in pairs]
	enc = tok(formatted, padding=True, truncation=True, max_length=64, return_tensors="pt")
	dummy_ids = enc["input_ids"]
	dummy_mask = enc["attention_mask"]
	print(f" Dummy batch shape: {dummy_ids.shape}")

	# Verify correct batch behaviour before exporting
	with torch.no_grad():
	out_batch = scorer(dummy_ids, dummy_mask)
	out_single = scorer(dummy_ids[:1], dummy_mask[:1])
	assert abs(float(out_batch[0, 0]) - float(out_single[0, 0])) < 0.01, \
	f"Batch/single mismatch: {float(out_batch[0,0]):.3f} vs {float(out_single[0,0]):.3f}"
	print(f" Batch consistency check PASS: {float(out_batch[0,0]):.3f} vs {float(out_single[0,0]):.3f}")

	print(f"Exporting to {OUT_MODEL} ...")
	with torch.no_grad():
	torch_onnx.export(
	scorer,
	(dummy_ids, dummy_mask),
	str(OUT_MODEL),
	input_names=["input_ids", "attention_mask"],
	output_names=["logits"],
	dynamic_axes={
	"input_ids": {0: "batch_size", 1: "sequence_length"},
	"attention_mask": {0: "batch_size", 1: "sequence_length"},
	"logits": {0: "batch_size"},
	},
	opset_version=18,
	do_constant_folding=False,
	)

	import onnx
	from onnx.external_data_helper import convert_model_to_external_data
	print(" Converting to external data format...")
	m = onnx.load(str(OUT_MODEL))
	convert_model_to_external_data(
	m, all_tensors_to_one_file=True,
	location="model.onnx_data", size_threshold=1024,
	)
	onnx.save(m, str(OUT_MODEL))
	print("Export complete:")
	for f in sorted(OUT_DIR.iterdir()):
	print(f" {f.name:40s} {f.stat().st_size / 1e6:.0f} MB")

	del m, scorer, model, tok, enc, dummy_ids, dummy_mask
	gc.collect()


	def verify_batch():
	import onnxruntime as ort

	print(f"\nVerifying batch > 1...")
	sess = ort.InferenceSession(str(OUT_MODEL), providers=["CPUExecutionProvider"])

	from transformers import AutoTokenizer
	tok = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
	template = "<\|im_start\|>user\nQuery: {q}\nDocument: {d}\nRelevant:<\|im_end\|>\n<\|im_start\|>assistant\n"

	q = "what is a panda?"
	docs = [
	"The giant panda is a bear species endemic to China.",
	"The sky is blue.",
	"panda is an animal",
	]

	# Single inference
	single_scores = []
	for d in docs:
	fmt = template.format(q=q, d=d)
	enc = tok(fmt, return_tensors="np", truncation=True, max_length=256)
	logit = sess.run(["logits"], {
	"input_ids": enc["input_ids"].astype(np.int64),
	"attention_mask": enc["attention_mask"].astype(np.int64),
	})[0]
	single_scores.append(float(logit[0, 0]))

	# Batch inference
	formatted = [template.format(q=q, d=d) for d in docs]
	enc = tok(formatted, return_tensors="np", truncation=True, max_length=256, padding=True)
	logits = sess.run(["logits"], {
	"input_ids": enc["input_ids"].astype(np.int64),
	"attention_mask": enc["attention_mask"].astype(np.int64),
	})[0]
	batch_scores = [float(logits[i, 0]) for i in range(len(docs))]

	print(" Single vs batch scores:")
	for d, s, b in zip(docs, single_scores, batch_scores):
	diff = abs(s - b)
	print(f" [{s:.3f} vs {b:.3f}] diff={diff:.4f} \| {d[:50]}")
	assert diff < 0.1, f"Mismatch too large: {diff}"
	assert batch_scores[0] > batch_scores[1], "Panda should rank higher than sky"
	print(" OK — batch scores match single, correct ranking")


	if __name__ == "__main__":
	if OUT_MODEL.exists():
	print(f"Model already exists at {OUT_MODEL}, skipping export.")
	print("Delete it to re-export.")
	else:
	run_export()
	gc.collect()

	verify_batch()

	print("\nNext steps:")
	print(f" 1. Run stream_int8_v2.py to quantize INT8 from {OUT_MODEL}")
	print(f" 2. Upload to HF: huggingface-cli upload cstr/zerank-1-small-ONNX {OUT_DIR}/ . --repo-type model")