save experiment results
Browse files
- looped_laguna/eval.py +10 -1
- scripts/run_eval.py +29 -11
- scripts/run_global_vs_sliding_loop.py +28 -9
- scripts/smoke_real.py +37 -5
looped_laguna/eval.py
CHANGED
|
@@ -136,13 +136,20 @@ def _assert_loop_active(model, tok_examples: list[_TokExample], configs, device)
|
|
| 136 |
|
| 137 |
|
| 138 |
def run_matrix(
|
| 139 |
-
model,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
) -> dict[str, dict]:
|
| 141 |
"""Run several named configs on the same examples; report metrics + delta vs baseline.
|
| 142 |
|
| 143 |
`configs` maps name -> LoopConfig (or None for baseline). If a "baseline" key is
|
| 144 |
present, each row also gets `d_acc` / `d_acc_norm` deltas in percentage points.
|
| 145 |
With `verbose`, shows a per-config progress bar and prints each config's result.
|
|
|
|
|
|
|
| 146 |
"""
|
| 147 |
device = next(model.parameters()).device
|
| 148 |
tok_examples = _tokenize_examples(tokenizer, examples) # tokenize once, reuse
|
|
@@ -153,6 +160,8 @@ def run_matrix(
|
|
| 153 |
results[name] = r
|
| 154 |
if verbose:
|
| 155 |
print(f" [{name}] acc={r['acc']:.4f} acc_norm={r['acc_norm']:.4f}", flush=True)
|
|
|
|
|
|
|
| 156 |
if "baseline" in results:
|
| 157 |
b = results["baseline"]
|
| 158 |
for name, r in results.items():
|
|
|
|
| 136 |
|
| 137 |
|
| 138 |
def run_matrix(
|
| 139 |
+
model,
|
| 140 |
+
tokenizer,
|
| 141 |
+
examples: list[MCExample],
|
| 142 |
+
configs: dict[str, LoopConfig | None],
|
| 143 |
+
verbose: bool = True,
|
| 144 |
+
on_result=None,
|
| 145 |
) -> dict[str, dict]:
|
| 146 |
"""Run several named configs on the same examples; report metrics + delta vs baseline.
|
| 147 |
|
| 148 |
`configs` maps name -> LoopConfig (or None for baseline). If a "baseline" key is
|
| 149 |
present, each row also gets `d_acc` / `d_acc_norm` deltas in percentage points.
|
| 150 |
With `verbose`, shows a per-config progress bar and prints each config's result.
|
| 151 |
+
`on_result(name, result)`, if given, is called as each config completes — use it
|
| 152 |
+
to persist partial results so a mid-run crash doesn't lose finished configs.
|
| 153 |
"""
|
| 154 |
device = next(model.parameters()).device
|
| 155 |
tok_examples = _tokenize_examples(tokenizer, examples) # tokenize once, reuse
|
|
|
|
| 160 |
results[name] = r
|
| 161 |
if verbose:
|
| 162 |
print(f" [{name}] acc={r['acc']:.4f} acc_norm={r['acc_norm']:.4f}", flush=True)
|
| 163 |
+
if on_result is not None:
|
| 164 |
+
on_result(name, r)
|
| 165 |
if "baseline" in results:
|
| 166 |
b = results["baseline"]
|
| 167 |
for name, r in results.items():
|
scripts/run_eval.py
CHANGED
|
@@ -21,6 +21,7 @@ import argparse
|
|
| 21 |
import json
|
| 22 |
import sys
|
| 23 |
import time
|
|
|
|
| 24 |
from pathlib import Path
|
| 25 |
|
| 26 |
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
@@ -75,7 +76,9 @@ def main() -> None:
|
|
| 75 |
p.add_argument("--tiny", action="store_true",
|
| 76 |
help="use a tiny random-weight model instead of real weights (plumbing check)")
|
| 77 |
p.add_argument("--tiny-layers", type=int, default=8, help="number of layers for the --tiny model")
|
| 78 |
-
p.add_argument("--output", default=None,
|
|
|
|
|
|
|
| 79 |
args = p.parse_args()
|
| 80 |
|
| 81 |
ks = [int(x) for x in args.ks.split(",")]
|
|
@@ -94,27 +97,42 @@ def main() -> None:
|
|
| 94 |
print(f"model: {'tiny' if args.tiny else args.model} | layers={num_layers} | loop window={window}")
|
| 95 |
print(f"configs: {list(configs)}")
|
| 96 |
|
| 97 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
for task in tasks:
|
| 99 |
examples = DATASET_LOADERS[task](limit=args.limit)
|
| 100 |
print(f"\n=== {task} ({len(examples)} items) ===")
|
|
|
|
| 101 |
t0 = time.time()
|
| 102 |
-
results = run_matrix(
|
| 103 |
-
|
| 104 |
-
|
|
|
|
|
|
|
|
|
|
| 105 |
print(f" {'config':<12} {'acc':>7} {'acc_norm':>9} {'d_acc(pp)':>10} {'d_norm(pp)':>11}")
|
| 106 |
for name, r in results.items():
|
| 107 |
print(
|
| 108 |
f" {name:<12} {r['acc']:>7.4f} {r['acc_norm']:>9.4f} "
|
| 109 |
f"{r.get('d_acc', 0.0):>+10.2f} {r.get('d_acc_norm', 0.0):>+11.2f}"
|
| 110 |
)
|
| 111 |
-
print(f" ({
|
| 112 |
|
| 113 |
-
if
|
| 114 |
-
|
| 115 |
-
"window": list(window), "ks": ks, "tasks": tasks, "limit": args.limit}
|
| 116 |
-
Path(args.output).write_text(json.dumps({"meta": meta, "results": all_results}, indent=2))
|
| 117 |
-
print(f"\nwrote {args.output}")
|
| 118 |
|
| 119 |
|
| 120 |
if __name__ == "__main__":
|
|
|
|
| 21 |
import json
|
| 22 |
import sys
|
| 23 |
import time
|
| 24 |
+
from datetime import datetime
|
| 25 |
from pathlib import Path
|
| 26 |
|
| 27 |
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
|
|
| 76 |
p.add_argument("--tiny", action="store_true",
|
| 77 |
help="use a tiny random-weight model instead of real weights (plumbing check)")
|
| 78 |
p.add_argument("--tiny-layers", type=int, default=8, help="number of layers for the --tiny model")
|
| 79 |
+
p.add_argument("--output", default=None,
|
| 80 |
+
help="results JSON path (default: auto-named results_eval_<timestamp>.json)")
|
| 81 |
+
p.add_argument("--no-save", action="store_true", help="do not write results to disk")
|
| 82 |
args = p.parse_args()
|
| 83 |
|
| 84 |
ks = [int(x) for x in args.ks.split(",")]
|
|
|
|
| 97 |
print(f"model: {'tiny' if args.tiny else args.model} | layers={num_layers} | loop window={window}")
|
| 98 |
print(f"configs: {list(configs)}")
|
| 99 |
|
| 100 |
+
# Always persist (incrementally) unless --no-save, so a crash never loses finished work.
|
| 101 |
+
save_path = None if args.no_save else (
|
| 102 |
+
args.output or f"results_eval_{datetime.now().strftime('%Y%m%d-%H%M%S')}.json"
|
| 103 |
+
)
|
| 104 |
+
meta = {"model": "tiny" if args.tiny else args.model, "num_layers": num_layers,
|
| 105 |
+
"window": list(window), "ks": ks, "tasks": tasks, "limit": args.limit}
|
| 106 |
+
all_results: dict = {}
|
| 107 |
+
|
| 108 |
+
def save() -> None:
|
| 109 |
+
if save_path:
|
| 110 |
+
Path(save_path).write_text(json.dumps({"meta": meta, "results": all_results}, indent=2))
|
| 111 |
+
|
| 112 |
+
if save_path:
|
| 113 |
+
print(f"saving results to {save_path} (updated after every config)")
|
| 114 |
+
|
| 115 |
for task in tasks:
|
| 116 |
examples = DATASET_LOADERS[task](limit=args.limit)
|
| 117 |
print(f"\n=== {task} ({len(examples)} items) ===")
|
| 118 |
+
all_results[task] = {} # partial configs land here as they finish
|
| 119 |
t0 = time.time()
|
| 120 |
+
results = run_matrix(
|
| 121 |
+
model, tokenizer, examples, configs,
|
| 122 |
+
on_result=lambda name, r, t=task: (all_results[t].__setitem__(name, r), save()),
|
| 123 |
+
)
|
| 124 |
+
all_results[task] = results # full results, now with deltas
|
| 125 |
+
save()
|
| 126 |
print(f" {'config':<12} {'acc':>7} {'acc_norm':>9} {'d_acc(pp)':>10} {'d_norm(pp)':>11}")
|
| 127 |
for name, r in results.items():
|
| 128 |
print(
|
| 129 |
f" {name:<12} {r['acc']:>7.4f} {r['acc_norm']:>9.4f} "
|
| 130 |
f"{r.get('d_acc', 0.0):>+10.2f} {r.get('d_acc_norm', 0.0):>+11.2f}"
|
| 131 |
)
|
| 132 |
+
print(f" ({time.time() - t0:.1f}s)")
|
| 133 |
|
| 134 |
+
if save_path:
|
| 135 |
+
print(f"\nresults saved to {save_path}")
|
|
|
|
|
|
|
|
|
|
| 136 |
|
| 137 |
|
| 138 |
if __name__ == "__main__":
|
scripts/run_global_vs_sliding_loop.py
CHANGED
|
@@ -27,6 +27,7 @@ import argparse
|
|
| 27 |
import json
|
| 28 |
import sys
|
| 29 |
import time
|
|
|
|
| 30 |
from pathlib import Path
|
| 31 |
|
| 32 |
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
@@ -74,7 +75,9 @@ def main() -> None:
|
|
| 74 |
p.add_argument("--device", default="cuda", help="device or device_map: cuda/cpu/auto")
|
| 75 |
p.add_argument("--tiny", action="store_true", help="use a tiny random-weight model (plumbing check)")
|
| 76 |
p.add_argument("--tiny-layers", type=int, default=8, help="layers for the --tiny model")
|
| 77 |
-
p.add_argument("--output", default=None,
|
|
|
|
|
|
|
| 78 |
args = p.parse_args()
|
| 79 |
|
| 80 |
tasks = [t.strip() for t in args.tasks.split(",")]
|
|
@@ -98,13 +101,33 @@ def main() -> None:
|
|
| 98 |
types = [lt[i][0] for i in cfg.loop_layers] # 'f' (full) / 's' (sliding)
|
| 99 |
print(f" {name:<24} layers={list(cfg.loop_layers)} types={types}")
|
| 100 |
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
for task in tasks:
|
| 103 |
examples = DATASET_LOADERS[task](limit=args.limit)
|
| 104 |
print(f"\n=== {task} ({len(examples)} items) ===")
|
|
|
|
| 105 |
t0 = time.time()
|
| 106 |
-
results = run_matrix(
|
|
|
|
|
|
|
|
|
|
| 107 |
all_results[task] = results
|
|
|
|
| 108 |
print(f" {'config':<24} {'acc':>7} {'acc_norm':>9} {'d_acc(pp)':>10} {'d_norm(pp)':>11}")
|
| 109 |
for name, r in results.items():
|
| 110 |
print(
|
|
@@ -113,12 +136,8 @@ def main() -> None:
|
|
| 113 |
)
|
| 114 |
print(f" ({time.time() - t0:.1f}s)")
|
| 115 |
|
| 116 |
-
if
|
| 117 |
-
|
| 118 |
-
"configs": {n: (None if c is None else list(c.loop_layers)) for n, c in configs.items()},
|
| 119 |
-
"tasks": tasks, "limit": args.limit}
|
| 120 |
-
Path(args.output).write_text(json.dumps({"meta": meta, "results": all_results}, indent=2))
|
| 121 |
-
print(f"\nwrote {args.output}")
|
| 122 |
|
| 123 |
|
| 124 |
if __name__ == "__main__":
|
|
|
|
| 27 |
import json
|
| 28 |
import sys
|
| 29 |
import time
|
| 30 |
+
from datetime import datetime
|
| 31 |
from pathlib import Path
|
| 32 |
|
| 33 |
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
|
|
| 75 |
p.add_argument("--device", default="cuda", help="device or device_map: cuda/cpu/auto")
|
| 76 |
p.add_argument("--tiny", action="store_true", help="use a tiny random-weight model (plumbing check)")
|
| 77 |
p.add_argument("--tiny-layers", type=int, default=8, help="layers for the --tiny model")
|
| 78 |
+
p.add_argument("--output", default=None,
|
| 79 |
+
help="results JSON path (default: auto-named results_gvs_<timestamp>.json)")
|
| 80 |
+
p.add_argument("--no-save", action="store_true", help="do not write results to disk")
|
| 81 |
args = p.parse_args()
|
| 82 |
|
| 83 |
tasks = [t.strip() for t in args.tasks.split(",")]
|
|
|
|
| 101 |
types = [lt[i][0] for i in cfg.loop_layers] # 'f' (full) / 's' (sliding)
|
| 102 |
print(f" {name:<24} layers={list(cfg.loop_layers)} types={types}")
|
| 103 |
|
| 104 |
+
# Always persist (incrementally) unless --no-save, so a crash never loses finished work.
|
| 105 |
+
save_path = None if args.no_save else (
|
| 106 |
+
args.output or f"results_gvs_{datetime.now().strftime('%Y%m%d-%H%M%S')}.json"
|
| 107 |
+
)
|
| 108 |
+
meta = {"model": "tiny" if args.tiny else args.model, "k": args.k,
|
| 109 |
+
"configs": {n: (None if c is None else list(c.loop_layers)) for n, c in configs.items()},
|
| 110 |
+
"tasks": tasks, "limit": args.limit}
|
| 111 |
+
all_results: dict = {}
|
| 112 |
+
|
| 113 |
+
def save() -> None:
|
| 114 |
+
if save_path:
|
| 115 |
+
Path(save_path).write_text(json.dumps({"meta": meta, "results": all_results}, indent=2))
|
| 116 |
+
|
| 117 |
+
if save_path:
|
| 118 |
+
print(f"saving results to {save_path} (updated after every config)")
|
| 119 |
+
|
| 120 |
for task in tasks:
|
| 121 |
examples = DATASET_LOADERS[task](limit=args.limit)
|
| 122 |
print(f"\n=== {task} ({len(examples)} items) ===")
|
| 123 |
+
all_results[task] = {}
|
| 124 |
t0 = time.time()
|
| 125 |
+
results = run_matrix(
|
| 126 |
+
model, tokenizer, examples, configs,
|
| 127 |
+
on_result=lambda name, r, t=task: (all_results[t].__setitem__(name, r), save()),
|
| 128 |
+
)
|
| 129 |
all_results[task] = results
|
| 130 |
+
save()
|
| 131 |
print(f" {'config':<24} {'acc':>7} {'acc_norm':>9} {'d_acc(pp)':>10} {'d_norm(pp)':>11}")
|
| 132 |
for name, r in results.items():
|
| 133 |
print(
|
|
|
|
| 136 |
)
|
| 137 |
print(f" ({time.time() - t0:.1f}s)")
|
| 138 |
|
| 139 |
+
if save_path:
|
| 140 |
+
print(f"\nresults saved to {save_path}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
|
| 142 |
|
| 143 |
if __name__ == "__main__":
|
scripts/smoke_real.py
CHANGED
|
@@ -15,7 +15,9 @@ One command: uv run python scripts/smoke_real.py
|
|
| 15 |
from __future__ import annotations
|
| 16 |
|
| 17 |
import argparse
|
|
|
|
| 18 |
import sys
|
|
|
|
| 19 |
from pathlib import Path
|
| 20 |
|
| 21 |
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
@@ -37,8 +39,21 @@ def main() -> None:
|
|
| 37 |
p.add_argument("--width", type=int, default=4, help="loop window width in layers")
|
| 38 |
p.add_argument("--center", type=float, default=0.5, help="loop window center as a depth fraction (0-1)")
|
| 39 |
p.add_argument("--no-generate", action="store_true", help="skip the greedy-generation coherence check")
|
|
|
|
|
|
|
|
|
|
| 40 |
args = p.parse_args()
|
| 41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 43 |
|
| 44 |
print(f"loading {args.model} ({args.dtype}) on {args.device} ...")
|
|
@@ -49,11 +64,14 @@ def main() -> None:
|
|
| 49 |
model.eval()
|
| 50 |
device = next(model.parameters()).device
|
| 51 |
n = model.config.num_hidden_layers
|
|
|
|
| 52 |
if device.type == "cuda":
|
| 53 |
-
|
| 54 |
-
print(f"loaded: {n} layers, peak GPU mem after load ~{
|
|
|
|
| 55 |
|
| 56 |
window = LoopConfig.from_depth_fraction(n, center_frac=args.center, width=args.width).window
|
|
|
|
| 57 |
print(f"loop window = {window} (depth fraction ~{args.center})")
|
| 58 |
|
| 59 |
prompt = "Question: What is the capital of France?\nAnswer:"
|
|
@@ -67,17 +85,21 @@ def main() -> None:
|
|
| 67 |
with torch.no_grad():
|
| 68 |
base1 = fwd_logits()
|
| 69 |
top = base1[0, -1].topk(5).indices.tolist()
|
| 70 |
-
|
|
|
|
| 71 |
if not args.no_generate:
|
| 72 |
unpatch(model)
|
| 73 |
with torch.no_grad():
|
| 74 |
gen = model.generate(ids, max_new_tokens=20, do_sample=False)
|
| 75 |
-
|
|
|
|
|
|
|
| 76 |
|
| 77 |
# noise floor: baseline vs itself (kernel run-to-run nondeterminism).
|
| 78 |
with torch.no_grad():
|
| 79 |
base2 = fwd_logits()
|
| 80 |
noise = (base1 - base2).abs().max().item()
|
|
|
|
| 81 |
print(f"\n[1] kernel noise floor (baseline vs baseline): max|diff| = {noise:.2e}")
|
| 82 |
|
| 83 |
# [1] K=1 patched must match baseline to within the noise floor.
|
|
@@ -87,11 +109,14 @@ def main() -> None:
|
|
| 87 |
unpatch(model)
|
| 88 |
d_k1 = (k1 - base1).abs().max().item()
|
| 89 |
tol = max(noise * 4, 1e-3)
|
|
|
|
| 90 |
print(f" K=1 patched vs baseline: max|diff| = {d_k1:.2e} (tol {tol:.1e})")
|
|
|
|
| 91 |
assert d_k1 <= tol, f"K=1 patch is NOT a faithful no-op on real kernels (diff {d_k1:.2e} > {tol:.1e})"
|
| 92 |
|
| 93 |
# [2] K>1 looping must be active (finite, and clearly above the noise floor).
|
| 94 |
print("\n[2] K>1 looping is active (diff should exceed the noise floor):")
|
|
|
|
| 95 |
for mode in ("layer", "block"):
|
| 96 |
for K in (2, 3):
|
| 97 |
patch(model, LoopConfig(window=window, K=K, mode=mode))
|
|
@@ -99,7 +124,9 @@ def main() -> None:
|
|
| 99 |
out = fwd_logits()
|
| 100 |
unpatch(model)
|
| 101 |
d = (out - base1).abs().max().item()
|
| 102 |
-
finite = torch.isfinite(out).all().item()
|
|
|
|
|
|
|
| 103 |
assert finite, f"{mode} K={K}: non-finite logits"
|
| 104 |
assert d > tol, f"{mode} K={K}: indistinguishable from baseline (loop not active?)"
|
| 105 |
print(f" {mode}-rk-K{K}: max|diff| = {d:.3f}, finite={finite} OK")
|
|
@@ -111,10 +138,15 @@ def main() -> None:
|
|
| 111 |
model(input_ids=ids, use_cache=True)
|
| 112 |
raise AssertionError("expected NotImplementedError for use_cache=True")
|
| 113 |
except NotImplementedError:
|
|
|
|
| 114 |
print("\n[3] decode-time (use_cache=True) correctly guarded: OK")
|
| 115 |
unpatch(model)
|
| 116 |
|
|
|
|
|
|
|
| 117 |
print("\nREAL-WEIGHTS SMOKE TEST PASSED — safe to run scripts/run_eval.py")
|
|
|
|
|
|
|
| 118 |
|
| 119 |
|
| 120 |
if __name__ == "__main__":
|
|
|
|
| 15 |
from __future__ import annotations
|
| 16 |
|
| 17 |
import argparse
|
| 18 |
+
import json
|
| 19 |
import sys
|
| 20 |
+
from datetime import datetime
|
| 21 |
from pathlib import Path
|
| 22 |
|
| 23 |
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
|
|
| 39 |
p.add_argument("--width", type=int, default=4, help="loop window width in layers")
|
| 40 |
p.add_argument("--center", type=float, default=0.5, help="loop window center as a depth fraction (0-1)")
|
| 41 |
p.add_argument("--no-generate", action="store_true", help="skip the greedy-generation coherence check")
|
| 42 |
+
p.add_argument("--output", default=None,
|
| 43 |
+
help="diagnostics JSON path (default: auto-named results_smoke_real_<timestamp>.json)")
|
| 44 |
+
p.add_argument("--no-save", action="store_true", help="do not write diagnostics to disk")
|
| 45 |
args = p.parse_args()
|
| 46 |
|
| 47 |
+
# Persist diagnostics incrementally (even before an assert that might fail).
|
| 48 |
+
save_path = None if args.no_save else (
|
| 49 |
+
args.output or f"results_smoke_real_{datetime.now().strftime('%Y%m%d-%H%M%S')}.json"
|
| 50 |
+
)
|
| 51 |
+
diag: dict = {"model": args.model, "dtype": args.dtype, "device": args.device, "passed": False}
|
| 52 |
+
|
| 53 |
+
def save() -> None:
|
| 54 |
+
if save_path:
|
| 55 |
+
Path(save_path).write_text(json.dumps(diag, indent=2))
|
| 56 |
+
|
| 57 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 58 |
|
| 59 |
print(f"loading {args.model} ({args.dtype}) on {args.device} ...")
|
|
|
|
| 64 |
model.eval()
|
| 65 |
device = next(model.parameters()).device
|
| 66 |
n = model.config.num_hidden_layers
|
| 67 |
+
diag["num_layers"] = n
|
| 68 |
if device.type == "cuda":
|
| 69 |
+
diag["peak_gpu_gb"] = round(torch.cuda.max_memory_allocated() / 1e9, 1)
|
| 70 |
+
print(f"loaded: {n} layers, peak GPU mem after load ~{diag['peak_gpu_gb']} GB")
|
| 71 |
+
save()
|
| 72 |
|
| 73 |
window = LoopConfig.from_depth_fraction(n, center_frac=args.center, width=args.width).window
|
| 74 |
+
diag["window"] = list(window)
|
| 75 |
print(f"loop window = {window} (depth fraction ~{args.center})")
|
| 76 |
|
| 77 |
prompt = "Question: What is the capital of France?\nAnswer:"
|
|
|
|
| 85 |
with torch.no_grad():
|
| 86 |
base1 = fwd_logits()
|
| 87 |
top = base1[0, -1].topk(5).indices.tolist()
|
| 88 |
+
diag["top5_tokens"] = [tok.decode([t]) for t in top]
|
| 89 |
+
print(f"\n[0] baseline top-5 next tokens: {[repr(t) for t in diag['top5_tokens']]}")
|
| 90 |
if not args.no_generate:
|
| 91 |
unpatch(model)
|
| 92 |
with torch.no_grad():
|
| 93 |
gen = model.generate(ids, max_new_tokens=20, do_sample=False)
|
| 94 |
+
diag["greedy_continuation"] = tok.decode(gen[0, ids.shape[1]:], skip_special_tokens=True)
|
| 95 |
+
print(f" greedy continuation: {diag['greedy_continuation']!r}")
|
| 96 |
+
save()
|
| 97 |
|
| 98 |
# noise floor: baseline vs itself (kernel run-to-run nondeterminism).
|
| 99 |
with torch.no_grad():
|
| 100 |
base2 = fwd_logits()
|
| 101 |
noise = (base1 - base2).abs().max().item()
|
| 102 |
+
diag["noise_floor"] = noise
|
| 103 |
print(f"\n[1] kernel noise floor (baseline vs baseline): max|diff| = {noise:.2e}")
|
| 104 |
|
| 105 |
# [1] K=1 patched must match baseline to within the noise floor.
|
|
|
|
| 109 |
unpatch(model)
|
| 110 |
d_k1 = (k1 - base1).abs().max().item()
|
| 111 |
tol = max(noise * 4, 1e-3)
|
| 112 |
+
diag["k1_diff"], diag["tol"] = d_k1, tol
|
| 113 |
print(f" K=1 patched vs baseline: max|diff| = {d_k1:.2e} (tol {tol:.1e})")
|
| 114 |
+
save()
|
| 115 |
assert d_k1 <= tol, f"K=1 patch is NOT a faithful no-op on real kernels (diff {d_k1:.2e} > {tol:.1e})"
|
| 116 |
|
| 117 |
# [2] K>1 looping must be active (finite, and clearly above the noise floor).
|
| 118 |
print("\n[2] K>1 looping is active (diff should exceed the noise floor):")
|
| 119 |
+
diag["k_active"] = {}
|
| 120 |
for mode in ("layer", "block"):
|
| 121 |
for K in (2, 3):
|
| 122 |
patch(model, LoopConfig(window=window, K=K, mode=mode))
|
|
|
|
| 124 |
out = fwd_logits()
|
| 125 |
unpatch(model)
|
| 126 |
d = (out - base1).abs().max().item()
|
| 127 |
+
finite = bool(torch.isfinite(out).all().item())
|
| 128 |
+
diag["k_active"][f"{mode}-rk-K{K}"] = {"diff": d, "finite": finite}
|
| 129 |
+
save()
|
| 130 |
assert finite, f"{mode} K={K}: non-finite logits"
|
| 131 |
assert d > tol, f"{mode} K={K}: indistinguishable from baseline (loop not active?)"
|
| 132 |
print(f" {mode}-rk-K{K}: max|diff| = {d:.3f}, finite={finite} OK")
|
|
|
|
| 138 |
model(input_ids=ids, use_cache=True)
|
| 139 |
raise AssertionError("expected NotImplementedError for use_cache=True")
|
| 140 |
except NotImplementedError:
|
| 141 |
+
diag["decode_guard"] = "ok"
|
| 142 |
print("\n[3] decode-time (use_cache=True) correctly guarded: OK")
|
| 143 |
unpatch(model)
|
| 144 |
|
| 145 |
+
diag["passed"] = True
|
| 146 |
+
save()
|
| 147 |
print("\nREAL-WEIGHTS SMOKE TEST PASSED — safe to run scripts/run_eval.py")
|
| 148 |
+
if save_path:
|
| 149 |
+
print(f"diagnostics saved to {save_path}")
|
| 150 |
|
| 151 |
|
| 152 |
if __name__ == "__main__":
|