mxguru1
/

hsaq-tools

Model card Files Files and versions

xet

Community

mxguru1 committed 8 days ago

Commit

4e75b89

verified ·

1 Parent(s): 48ec8ad

Add compare_strategies.py

Browse files

Files changed (1) hide show

compare_strategies.py +395 -0

compare_strategies.py ADDED Viewed

	@@ -0,0 +1,395 @@

+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#   "torch>=2.5,<2.10",
+#   "transformers>=4.46",
+#   "huggingface_hub>=0.26",
+#   "datasets>=3.0",
+#   "accelerate>=1.0",
+#   "sentencepiece",
+#   "protobuf",
+# ]
+# ///
+"""
+Strategy A vs Strategy B comparison for HSAQ KV-cache profiling.
+The kv_profiler.py shipped by web-Claude implements STRATEGY B (per-config
+joint): for each config, hook every layer simultaneously via
+kv_quant_active_multi, run ONE forward pass, capture per-layer attention-output
+drift in one shot. Total cost: 11 forwards.
+STRATEGY A (per-layer isolated): for each (layer, config) pair, hook ONLY
+that layer via kv_quant_active, run a forward pass, measure drift at the
+target layer. Total cost: 11 × N_layers = 440 forwards for OLMo's 40 layers.
+This script runs both on OLMo-2-13B-Instruct, diffs the resulting drift
+tables, and pipes each through assign_kv_bits to compare the allocation
+decisions. Outputs a comparison report to
+mxguru1/hsaq-strategy-comparison.
+If A and B agree: keep B for speed, allocator independence assumption holds.
+If A and B disagree: the disagreement IS the finding, and the allocator
+should consume A's data despite the cost.
+"""
+from __future__ import annotations
+import json
+import os
+import sys
+import time
+from datetime import datetime, timezone
+from pathlib import Path
+import torch
+from huggingface_hub import HfApi, hf_hub_download, login, snapshot_download, create_repo
+# ---------------------------------------------------------------------------
+# Auth + setup
+# ---------------------------------------------------------------------------
+# Accept either of the two environment variable names that may carry a Hub token.
+token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
+if not token:
+    # Fatal diagnostics belong on stderr; the exit status stays 2.
+    print("FATAL: no HF_TOKEN in env", file=sys.stderr)
+    sys.exit(2)
+login(token=token)
+# Report the account the token actually resolves to instead of assuming one.
+# The persist step at the end of the script uploads under a fixed namespace, so
+# a mismatched token is worth seeing before the long profiling runs start.
+hub_user = HfApi().whoami(token=token).get("name", "<unknown>")
+print(f"[auth] logged in as {hub_user}")
+# Pull the HSAQ tools from the Hub (so we don't need a local checkout)
+# NOTE(review): no revision is pinned, and the three modules below are executed
+# on import, so whatever is at the head of that repo runs with this token in
+# the environment. Consider pinning `revision=` to a known commit.
+print("[fetch] pulling HSAQ tools from mxguru1/hsaq-tools")
+local_tools = snapshot_download(
+    repo_id="mxguru1/hsaq-tools",
+    local_dir="/tmp/hsaq_tools",
+    allow_patterns=["kv_intercept.py", "kv_profiler.py", "assignment_v2.py"],
+)
+# Put the download directory first on sys.path so the imports below resolve to
+# the freshly fetched files; they must therefore stay after this line.
+sys.path.insert(0, local_tools)
+print(f"[fetch] tools at {local_tools}")
+import kv_intercept as kvi
+import kv_profiler as kvp
+import assignment_v2 as asgn
+# ---------------------------------------------------------------------------
+# Model + calibration
+# ---------------------------------------------------------------------------
+# Run parameters. TARGET_MODEL, N_CALIB, MAX_SEQ_LEN and KV_BUDGET_GB are read
+# again further down when the strategies are run and the report is assembled.
+TARGET_MODEL = "allenai/OLMo-2-1124-13B-Instruct"
+N_CALIB = 32         # calibration set size
+MAX_SEQ_LEN = 512    # truncation length
+KV_BUDGET_GB = 1.0   # for allocator comparison
+print(f"\n[stage] target model: {TARGET_MODEL}")
+print(f"[stage] calibration: {N_CALIB} prompts, max_seq_len={MAX_SEQ_LEN}")
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from datasets import load_dataset
+# t0 brackets tokenizer + model loading; it is reported in the print below.
+t0 = time.time()
+tok = AutoTokenizer.from_pretrained(TARGET_MODEL, trust_remote_code=True)
+# The calibration prompts are tokenized with padding=True later on, which needs
+# a pad token; reuse EOS when the tokenizer does not define one.
+if tok.pad_token is None:
+    tok.pad_token = tok.eos_token
+# Weights are loaded in bfloat16 and placed via device_map="auto".
+model = AutoModelForCausalLM.from_pretrained(
+    TARGET_MODEL,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    trust_remote_code=True,
+)
+model.eval()
+# NOTE(review): torch.cuda.memory_allocated() is called without a device
+# argument; with device_map="auto" the weights may be spread over several
+# devices, so this figure may not be the whole footprint — confirm.
+print(f"[stage] model loaded in {time.time()-t0:.1f}s, VRAM: {torch.cuda.memory_allocated()/1e9:.2f} GB")
+# Pull calibration prompts from the wargame corpus
+print(f"[stage] loading calibration from mxguru1/master-chief-wargame-corpus-v1")
+ds = load_dataset("mxguru1/master-chief-wargame-corpus-v1", split="train")
+# Take the first N_CALIB rows of the train split and keep only their "attack"
+# field. NOTE(review): presumably this raises if the split has fewer than
+# N_CALIB rows — confirm against the datasets version in use.
+calibration_texts = [row["attack"] for row in ds.select(range(N_CALIB))]
+print(f"[stage] {len(calibration_texts)} calibration prompts ready")
+# ---------------------------------------------------------------------------
+# Strategy B — already implemented in kv_profiler.profile_kv_sensitivity
+# ---------------------------------------------------------------------------
+print("\n" + "=" * 72)
+print("STRATEGY B (per-config JOINT — current default, 11 forwards)")
+print("=" * 72)
+# Time the whole profiling call so its wall-clock cost can be set against
+# Strategy A's in the comparison section.
+t0 = time.time()
+# No sweep argument is passed here; presumably the profiler falls back to its
+# own default sweep — confirm in kv_profiler.
+rows_B = kvp.profile_kv_sensitivity(
+    model=model,
+    tokenizer=tok,
+    calibration_texts=calibration_texts,
+    model_hash="olmo-2-13b-strategy-B",
+    profiled_by_agent_id="strategy-compare",
+    profiled_by_agent_tier=1,
+    max_seq_len=MAX_SEQ_LEN,
+    drift_metric="mse_normalised",
+    # Echo every progress message from the profiler, indented by two spaces.
+    progress_cb=lambda m: print(f"  {m}"),
+)
+elapsed_B = time.time() - t0
+print(f"[B] {len(rows_B)} rows in {elapsed_B:.1f}s")
+# ---------------------------------------------------------------------------
+# Strategy A — per (layer, config) isolated, hook ONE layer at a time
+# ---------------------------------------------------------------------------
+def profile_isolated(
+    model, tokenizer, calibration_texts, model_hash, max_seq_len, sweep,
+    *,
+    drift_metric: str = "mse_normalised",
+    profiled_by_agent_id: str = "strategy-compare",
+    profiled_by_agent_tier: int = 1,
+    progress_every: int | None = None,
+):
+    """Strategy A: quantise the KV of one layer at a time and measure drift there.
+
+    One unquantised forward pass records the baseline attention outputs. Then,
+    for every sweep entry and every attention module, the hook is installed on
+    that single module, a forward pass is run, and the drift between the hooked
+    and baseline output of that same layer is stored as one ProfileRow.
+
+    Args:
+        model: model handed to find_attention_modules and
+            kvp._capture_attn_outputs; its ``device`` and ``config`` are read.
+        tokenizer: callable used once to tokenize, pad and truncate the prompts.
+        calibration_texts: prompts forwarded to the tokenizer and to
+            kvp.compute_calibration_hash.
+        model_hash: identifier stored on every returned row.
+        max_seq_len: truncation length for tokenization.
+        sweep: sized iterable of entries exposing ``k_bits``, ``v_bits``,
+            ``quantizer`` and ``group_size``.
+        drift_metric: metric name passed to kvp.compute_drift and recorded on
+            each row, so the two can never disagree.
+        profiled_by_agent_id: provenance id recorded on each row.
+        profiled_by_agent_tier: provenance tier recorded on each row.
+        progress_every: print a progress line every this many forwards.
+            Defaults to the number of attention modules, i.e. one line per
+            completed sweep entry.
+
+    Returns:
+        A list of kvp.ProfileRow, one per (sweep entry, layer) pair for which
+        both a hooked and a baseline output were captured.
+    """
+    from kv_intercept import KVQuantSpec, find_attention_modules, kv_quant_active
+    rows = []
+    # (layer, k_bits, v_bits, quantizer) for pairs that produced no row.
+    skipped = []
+    attn_modules = find_attention_modules(model)
+    n_layers = len(attn_modules)
+    if progress_every is None:
+        progress_every = n_layers
+    # Guard the modulo below against a zero or negative cadence.
+    progress_every = max(1, progress_every)
+    # Tokenize once
+    batch = tokenizer(
+        calibration_texts,
+        return_tensors="pt",
+        padding=True,
+        truncation=True,
+        max_length=max_seq_len,
+    ).to(model.device)
+    actual_seq_len = batch.input_ids.shape[1]
+    # Capture full-precision baseline (one forward)
+    print(f"  [A] capturing fp baseline ({n_layers} layers, 1 forward)")
+    t_b = time.time()
+    baseline_outputs = kvp._capture_attn_outputs(model, attn_modules, batch)
+    print(f"  [A] baseline in {time.time()-t_b:.1f}s")
+    # Per-layer dimensions (Llama-family assumption)
+    cfg = model.config
+    num_kv_heads = getattr(cfg, "num_key_value_heads",
+                          getattr(cfg, "num_attention_heads", 1))
+    head_dim = getattr(cfg, "head_dim", None) or (
+        cfg.hidden_size // cfg.num_attention_heads
+    )
+    calibration_hash = kvp.compute_calibration_hash(calibration_texts, max_seq_len)
+    profiled_at = datetime.now(timezone.utc).isoformat()
+    total_forwards = len(sweep) * n_layers
+    forward_idx = 0
+    t_loop = time.time()
+    for cfg_idx, swp in enumerate(sweep):
+        # Spec and byte cost depend only on the sweep entry, so build them once
+        # per entry rather than once per layer.
+        spec = KVQuantSpec(
+            k_bits=swp.k_bits,
+            v_bits=swp.v_bits,
+            quantizer=swp.quantizer,
+            group_size=swp.group_size,
+        )
+        bpt = kvp.kv_bytes_per_token(
+            num_kv_heads=num_kv_heads,
+            head_dim=head_dim,
+            k_bits=swp.k_bits,
+            v_bits=swp.v_bits,
+            quantizer=swp.quantizer,
+            group_size=swp.group_size,
+        )
+        for layer_idx, attn in attn_modules.items():
+            forward_idx += 1
+            if forward_idx % progress_every == 0:
+                elapsed = time.time() - t_loop
+                eta = elapsed / forward_idx * (total_forwards - forward_idx)
+                print(f"  [A] forward {forward_idx}/{total_forwards} "
+                      f"(cfg {cfg_idx+1}/{len(sweep)} k={swp.k_bits} v={swp.v_bits} "
+                      f"{swp.quantizer}, eta {eta:.0f}s)")
+            # Hook ONLY this layer at this config
+            with kv_quant_active(attn, spec):
+                captured = kvp._capture_attn_outputs(model, attn_modules, batch)
+            if layer_idx not in captured or layer_idx not in baseline_outputs:
+                # Remember the gap instead of dropping it silently: a missing
+                # row would otherwise skew the per-config averages unnoticed.
+                skipped.append((layer_idx, swp.k_bits, swp.v_bits, swp.quantizer))
+                continue
+            drift = kvp.compute_drift(
+                captured[layer_idx],
+                baseline_outputs[layer_idx],
+                drift_metric,
+            )
+            rows.append(kvp.ProfileRow(
+                model_hash=model_hash,
+                calibration_hash=calibration_hash,
+                pipeline_version="strategy-A-1.0.0",
+                layer_idx=layer_idx,
+                k_bits=swp.k_bits,
+                v_bits=swp.v_bits,
+                quantizer=swp.quantizer,
+                drift_attn_output=float(drift),
+                drift_metric=drift_metric,
+                bytes_per_kv_token=float(bpt),
+                max_seq_len_observed=actual_seq_len,
+                num_kv_heads=num_kv_heads,
+                head_dim=head_dim,
+                profiled_at=profiled_at,
+                profiled_by_agent_id=profiled_by_agent_id,
+                profiled_by_agent_tier=profiled_by_agent_tier,
+            ))
+    if skipped:
+        print(f"  [A] WARNING: {len(skipped)} (layer, config) pairs had no captured "
+              f"output and were skipped; first: {skipped[0]}")
+    return rows
+print("\n" + "=" * 72)
+print("STRATEGY A (per-layer ISOLATED — 11 × N_layers forwards)")
+print("=" * 72)
+# Timed the same way as Strategy B so the two wall-clock figures are comparable.
+t0 = time.time()
+# The sweep is passed explicitly as kvp.DEFAULT_SWEEP, and the model_hash
+# differs from the one used for Strategy B so the two row sets stay
+# distinguishable.
+rows_A = profile_isolated(
+    model, tok, calibration_texts,
+    model_hash="olmo-2-13b-strategy-A",
+    max_seq_len=MAX_SEQ_LEN,
+    sweep=kvp.DEFAULT_SWEEP,
+)
+elapsed_A = time.time() - t0
+print(f"[A] {len(rows_A)} rows in {elapsed_A:.1f}s")
+# ---------------------------------------------------------------------------
+# Diff the drift tables
+# ---------------------------------------------------------------------------
+print("\n" + "=" * 72)
+print("COMPARISON")
+print("=" * 72)
+# Wall-clock time and row count of each strategy, side by side.
+print(f"\n  Strategy B: {elapsed_B:.1f}s wall, {len(rows_B)} rows")
+print(f"  Strategy A: {elapsed_A:.1f}s wall, {len(rows_A)} rows")
+# The denominator is floored at 0.1 s so a near-zero Strategy B time cannot
+# cause a division by zero.
+print(f"  Ratio A/B:  {elapsed_A/max(elapsed_B,0.1):.1f}× slower")
+# Build (config, layer) -> drift maps
+def key_drift(rows):
+    """Index drift values by (k_bits, v_bits, quantizer, layer_idx).
+
+    Each row contributes its ``drift_attn_output`` under a key built from its
+    own ``k_bits``, ``v_bits``, ``quantizer`` and ``layer_idx``. When two rows
+    share a key, the later one wins.
+    """
+    drift_by_key = {}
+    for row in rows:
+        lookup = (row.k_bits, row.v_bits, row.quantizer, row.layer_idx)
+        drift_by_key[lookup] = row.drift_attn_output
+    return drift_by_key
+# Drift lookups keyed by (k_bits, v_bits, quantizer, layer_idx), one per strategy.
+B_map = key_drift(rows_B)
+A_map = key_drift(rows_A)
+# Keys present in both tables; only their count is reported here.
+common = set(B_map.keys()) & set(A_map.keys())
+print(f"\n  Common (config, layer) pairs: {len(common)}")
+# Aggregate by config — drift averaged over layers
+def avg_by_config(d):
+    """Average drift per (k_bits, v_bits, quantizer), collapsing the layer axis.
+
+    ``d`` maps 4-tuples ``(k_bits, v_bits, quantizer, layer_idx)`` to a drift
+    value. The result maps each distinct 3-tuple prefix to the arithmetic mean
+    of its drift values, in order of first appearance.
+    """
+    grouped = {}
+    for key, value in d.items():
+        # Unpack strictly so a key of the wrong arity still fails loudly.
+        k_bits, v_bits, quantizer, _layer = key
+        grouped.setdefault((k_bits, v_bits, quantizer), []).append(value)
+    averages = {}
+    for config, values in grouped.items():
+        averages[config] = sum(values) / len(values)
+    return averages
+# Mean drift per (k_bits, v_bits, quantizer) for each strategy.
+avg_B = avg_by_config(B_map)
+avg_A = avg_by_config(A_map)
+# Union, not intersection: a config profiled by only one strategy still gets a
+# table line so the gap is visible.
+all_cfgs = sorted(set(avg_B) | set(avg_A))
+print(f"\n  Per-config average drift (lower = quant is less harmful):")
+print(f"  {'config':<36} {'A_isolated':>13} {'B_joint':>13} {'A/B ratio':>10}")
+for cfg in all_cfgs:
+    a = avg_A.get(cfg)
+    b = avg_B.get(cfg)
+    # A missing measurement is shown as "n/a" rather than as a drift of 0,
+    # which would read as "this config is harmless" and produce a bogus ratio.
+    a_txt = f"{'n/a':>13}" if a is None else f"{a:>13.4e}"
+    b_txt = f"{'n/a':>13}" if b is None else f"{b:>13.4e}"
+    if a is None or b is None:
+        ratio_txt = f"{'n/a':>10}"
+    else:
+        # Floor the denominator so an exactly-zero B drift cannot divide by zero.
+        ratio_txt = f"{a / max(b, 1e-12):>10.2f}"
+    print(f"  {str(cfg):<36} {a_txt} {b_txt} {ratio_txt}")
+# Allocator comparison
+def _chosen_config(assignment):
+    """Return the (k_bits, v_bits, quantizer) triple an assignment settled on."""
+    picked = assignment.chosen
+    return (picked.k_bits, picked.v_bits, picked.quantizer)
+print(f"\n  Running allocator on each (KV budget = {KV_BUDGET_GB} GB, max_seq_len={MAX_SEQ_LEN})")
+cands_B = kvp.rows_to_kv_candidates(rows_B)
+cands_A = kvp.rows_to_kv_candidates(rows_A)
+try:
+    res_B = asgn.assign_kv_bits(cands_B, kv_budget_gb=KV_BUDGET_GB, max_seq_len=MAX_SEQ_LEN)
+    res_A = asgn.assign_kv_bits(cands_A, kv_budget_gb=KV_BUDGET_GB, max_seq_len=MAX_SEQ_LEN)
+except Exception as e:
+    # Deliberately best effort: the drift tables above are still worth
+    # persisting when the allocator fails. Include the exception type so an
+    # empty message is still diagnosable.
+    print(f"  allocator err: {type(e).__name__}: {e}")
+    res_B = res_A = None
+# NOTE(review): this guard relies on the allocator results being truthy; the
+# report-building code further down uses the same test, so keep them in step.
+if res_A and res_B:
+    from collections import Counter
+    pick_B = Counter(_chosen_config(a) for a in res_B.assignments)
+    pick_A = Counter(_chosen_config(a) for a in res_A.assignments)
+    print(f"\n  Allocation distribution (B vs A):")
+    all_picks = sorted(set(pick_B) | set(pick_A))
+    for p in all_picks:
+        print(f"    {str(p):<40} B={pick_B[p]:>3} A={pick_A[p]:>3}")
+    print(f"\n  Total drift   B={res_B.total_drift:.4e}  A={res_A.total_drift:.4e}")
+    print(f"  Total KV GB   B={res_B.total_kv_gb:.3f}  A={res_A.total_kv_gb:.3f}")
+    # Layer-by-layer agreement
+    # NOTE(review): the two assignment lists are paired by position, which
+    # assumes both are in the same layer order — confirm against assign_kv_bits.
+    n_A = len(res_A.assignments)
+    n_B = len(res_B.assignments)
+    if n_A != n_B:
+        # zip() stops at the shorter list; say so rather than under-reporting.
+        print(f"  WARNING: assignment counts differ (A={n_A}, B={n_B}); "
+              f"agreement covers only the first {min(n_A, n_B)} entries")
+    agree_count = sum(
+        _chosen_config(la) == _chosen_config(lb)
+        for la, lb in zip(res_A.assignments, res_B.assignments)
+    )
+    print(f"\n  Per-layer agreement: {agree_count}/{len(res_A.assignments)}")
+# ---------------------------------------------------------------------------
+# Persist to HF Hub
+# ---------------------------------------------------------------------------
+out_repo = "mxguru1/hsaq-strategy-comparison"
+try:
+    create_repo(out_repo, repo_type="dataset", exist_ok=True, private=False)
+except Exception as e:
+    # Still best effort (the uploads below are attempted regardless), but no
+    # longer silent: if an upload fails, this is the first place to look.
+    print(f"[persist] WARNING: create_repo({out_repo}) failed: {type(e).__name__}: {e}")
+# Forward counts are derived from the sweep and the model rather than hard-coded
+# for one particular model. Strategy A runs one forward per (sweep entry,
+# attention module) pair over kvp.DEFAULT_SWEEP.
+# NOTE(review): the Strategy B figure assumes profile_kv_sensitivity also ran
+# kvp.DEFAULT_SWEEP (no sweep was passed to it) — confirm in kv_profiler.
+n_sweep = len(kvp.DEFAULT_SWEEP)
+n_attn_layers = len(kvi.find_attention_modules(model))
+report = {
+    "captured_at": datetime.now(timezone.utc).isoformat(),
+    "target_model": TARGET_MODEL,
+    "calibration": {
+        "source": "mxguru1/master-chief-wargame-corpus-v1",
+        "n_prompts": N_CALIB,
+        "max_seq_len": MAX_SEQ_LEN,
+    },
+    "strategy_B": {
+        "elapsed_seconds": round(elapsed_B, 1),
+        "n_rows": len(rows_B),
+        "n_forwards": n_sweep,
+        # JSON object keys must be strings, hence str() on the config tuples.
+        "avg_drift_by_config": {str(k): v for k, v in avg_B.items()},
+    },
+    "strategy_A": {
+        "elapsed_seconds": round(elapsed_A, 1),
+        "n_rows": len(rows_A),
+        "n_forwards": n_sweep * n_attn_layers,
+        "avg_drift_by_config": {str(k): v for k, v in avg_A.items()},
+    },
+    "ratio_a_over_b": round(elapsed_A / max(elapsed_B, 0.1), 2),
+    "allocator_kv_budget_gb": KV_BUDGET_GB,
+}
+# Same guard as the allocator section: agree_count only exists when it passed.
+if res_A and res_B:
+    report["allocator_comparison"] = {
+        "agreement_per_layer": f"{agree_count}/{len(res_A.assignments)}",
+        "total_drift_A": res_A.total_drift,
+        "total_drift_B": res_B.total_drift,
+        "total_kv_gb_A": res_A.total_kv_gb,
+        "total_kv_gb_B": res_B.total_kv_gb,
+    }
+api = HfApi()
+# Write each artefact to local disk first so a failed upload does not lose it.
+report_path = "/tmp/comparison_report.json"
+with open(report_path, "w", encoding="utf-8") as f:
+    json.dump(report, f, indent=2)
+api.upload_file(
+    path_or_fileobj=report_path,
+    path_in_repo="report.json",
+    repo_id=out_repo,
+    repo_type="dataset",
+    commit_message=f"Strategy A vs B on {TARGET_MODEL}",
+)
+# Also dump raw rows for follow-up analysis
+rows_dump = {
+    "strategy_A": [r.to_vault_payload() for r in rows_A],
+    "strategy_B": [r.to_vault_payload() for r in rows_B],
+}
+rows_path = "/tmp/comparison_rows.json"
+with open(rows_path, "w", encoding="utf-8") as f:
+    json.dump(rows_dump, f, indent=2)
+api.upload_file(
+    path_or_fileobj=rows_path,
+    path_in_repo="rows.json",
+    repo_id=out_repo,
+    repo_type="dataset",
+    commit_message="Raw profile rows for follow-up analysis",
+)
+print(f"\n[done] report + rows pushed to https://huggingface.co/datasets/{out_repo}")