Clarify --ckpt help: retriever weights only, not full DSv4 model

bcc6c02 about 11 hours ago

6.6 kB

	"""
	demo.py — minimal standalone demo for the FlashMemory DS-V4 Retriever
	=====================================================================

	Builds random mock inputs, loads the FlashMemory DS-V4 joint checkpoint, runs
	a forward pass, and prints per-chunk scores plus a top-K selection summary.

	Run::

	python demo.py --ckpt weights/flashmemory_ds_v4.safetensors

	Runs on CPU by default; pass ``--device cuda`` to use a GPU.
	"""

	from __future__ import annotations

	import argparse

	import torch

	from retriever import FlashMemoryRetriever, dequant_compressed_k


	def make_mock_compressed_k(
	batch: int,
	n_chunks: int,
	head_dim: int = 128,
	device: str = "cpu",
	seed: int = 0,
	) -> torch.Tensor:
	"""Construct a valid mock ``compressed_k`` tensor [B, N, head_dim + 4] uint8.

	Layout per chunk: ``head_dim`` float8_e4m3 bytes followed by one float32 scale
	(4 bytes). We build it the same way the real CSA cache stores it:
	1. sample random key vectors, cast to float8_e4m3, view as uint8;
	2. sample a small positive per-chunk scale, view its float32 as 4 uint8 bytes;
	3. concatenate along the last dim.
	"""
	g = torch.Generator(device=device).manual_seed(seed)

	# 1) fp8 key bytes
	k_vals = torch.randn(batch, n_chunks, head_dim, generator=g, device=device) * 0.5
	k_fp8 = k_vals.to(torch.float8_e4m3fn)
	fp8_bytes = k_fp8.view(torch.uint8) # [B, N, head_dim]

	# 2) float32 per-chunk scale → 4 bytes
	scale = (0.05 + 0.15 * torch.rand(batch, n_chunks, 1, generator=g, device=device)).float()
	scale_bytes = scale.view(torch.uint8) # [B, N, 4]

	compressed = torch.cat([fp8_bytes, scale_bytes], dim=-1) # [B, N, head_dim + 4]
	assert compressed.shape[-1] == head_dim + 4
	return compressed.contiguous()


	def main():
	ap = argparse.ArgumentParser(description="FlashMemory DS-V4 Retriever demo")
	ap.add_argument("--ckpt", required=True,
	help="path to retriever checkpoint (flashmemory_ds_v4.safetensors from HuggingFace, NOT a full DSv4 model)")
	ap.add_argument("--device", default="cpu", help="cpu or cuda (default: cpu)")
	ap.add_argument("--batch", type=int, default=2, help="number of decode tokens")
	ap.add_argument("--n-chunks", type=int, default=64, help="number of compressed-K chunks")
	ap.add_argument("--max-position", type=int, default=524288,
	help="RoPE table length (raise to 1048576 for 1M context)")
	ap.add_argument("--top-k", type=int, default=16, help="top-K chunks to select")
	ap.add_argument("--threshold", type=float, default=0.5, help="sigmoid keep threshold")
	ap.add_argument("--ensemble", default="max", choices=["max", "mean"],
	help="cross-layer ensemble mode")
	ap.add_argument("--seed", type=int, default=0)
	args = ap.parse_args()

	torch.manual_seed(args.seed)
	device = args.device

	print(f"[demo] loading checkpoint: {args.ckpt}")
	model = FlashMemoryRetriever.from_checkpoint(
	args.ckpt, device=device, max_position=args.max_position
	)
	model.eval()
	print(f"[demo] loaded layers={model.layer_names} n_heads={model.n_heads} "
	f"head_dim={model.head_dim} max_position={model.max_position}")

	# ── Mock inputs ─────────────────────────────────────────────────────────
	B, N = args.batch, args.n_chunks
	hidden = torch.randn(B, 4096, device=device, dtype=torch.float32)
	compressed_k = make_mock_compressed_k(B, N, head_dim=model.head_dim,
	device=device, seed=args.seed)
	# token positions for each decode token (arbitrary; here spaced out)
	positions = torch.arange(B, device=device, dtype=torch.int64) * 1000 + 4096

	print(f"\n[demo] mock inputs: hidden={tuple(hidden.shape)} "
	f"compressed_k={tuple(compressed_k.shape)} ({compressed_k.dtype}) "
	f"positions={positions.tolist()}")

	# sanity: show dequant works
	k_float = dequant_compressed_k(compressed_k, head_dim=model.head_dim)
	print(f"[demo] dequantized K: shape={tuple(k_float.shape)} "
	f"mean={k_float.mean().item():+.4f} std={k_float.std().item():.4f}")

	# ── Per-layer scores ──────────────────────────────────────────────────────
	per_layer = model(hidden, compressed_k, positions, apply_sigmoid=True)
	print("\n[demo] per-layer sigmoid score stats (over all chunks):")
	for name, s in per_layer.items():
	print(f" {name}: min={s.min().item():.4f} mean={s.mean().item():.4f} "
	f"max={s.max().item():.4f}")

	# ── Cross-layer ensemble ──────────────────────────────────────────────────
	scores = model.ensemble(hidden, compressed_k, positions, mode=args.ensemble) # [B, N]
	print(f"\n[demo] ensembled ({args.ensemble}) per-chunk scores [B={B}, N={N}]:")
	for b in range(B):
	row = scores[b]
	preview = ", ".join(f"{v:.3f}" for v in row[:12].tolist())
	print(f" row {b}: [{preview}{', ...' if N > 12 else ''}]")

	# ── Selection: threshold ──────────────────────────────────────────────────
	keep_thr = model.select_topk(hidden, compressed_k, positions,
	threshold=args.threshold, mode=args.ensemble)
	print(f"\n[demo] threshold selection (sigmoid > {args.threshold}):")
	for b in range(B):
	n_keep = int(keep_thr[b].sum().item())
	print(f" row {b}: keep {n_keep}/{N} chunks (keep ratio {n_keep / N:.1%})")

	# ── Selection: top-K ──────────────────────────────────────────────────────
	keep_topk = model.select_topk(hidden, compressed_k, positions,
	top_k=args.top_k, mode=args.ensemble)
	print(f"\n[demo] top-K selection (k={args.top_k}):")
	for b in range(B):
	idx = keep_topk[b].nonzero(as_tuple=True)[0].tolist()
	print(f" row {b}: kept chunk indices = {idx}")

	print("\n[demo] done -- forward + scoring + selection all ran.")


	if __name__ == "__main__":
	main()