Spaces:

poolside-laguna-hackathon
/

looped-laguna

Running

App Files Community

e-p committed 11 days ago

Commit

de93199

1 Parent(s): 20c019c

save experiment results

Browse files

Files changed (4) hide show

looped_laguna/eval.py +10 -1
scripts/run_eval.py +29 -11
scripts/run_global_vs_sliding_loop.py +28 -9
scripts/smoke_real.py +37 -5

looped_laguna/eval.py CHANGED Viewed

@@ -136,13 +136,20 @@ def _assert_loop_active(model, tok_examples: list[_TokExample], configs, device)
 def run_matrix(
-    model, tokenizer, examples: list[MCExample], configs: dict[str, LoopConfig | None], verbose: bool = True
 ) -> dict[str, dict]:
     """Run several named configs on the same examples; report metrics + delta vs baseline.
     `configs` maps name -> LoopConfig (or None for baseline). If a "baseline" key is
     present, each row also gets `d_acc` / `d_acc_norm` deltas in percentage points.
     With `verbose`, shows a per-config progress bar and prints each config's result.
     """
     device = next(model.parameters()).device
     tok_examples = _tokenize_examples(tokenizer, examples)  # tokenize once, reuse
@@ -153,6 +160,8 @@ def run_matrix(
         results[name] = r
         if verbose:
             print(f"  [{name}] acc={r['acc']:.4f} acc_norm={r['acc_norm']:.4f}", flush=True)
     if "baseline" in results:
         b = results["baseline"]
         for name, r in results.items():

 def run_matrix(
+    model,
+    tokenizer,
+    examples: list[MCExample],
+    configs: dict[str, LoopConfig | None],
+    verbose: bool = True,
+    on_result=None,
 ) -> dict[str, dict]:
     """Run several named configs on the same examples; report metrics + delta vs baseline.
     `configs` maps name -> LoopConfig (or None for baseline). If a "baseline" key is
     present, each row also gets `d_acc` / `d_acc_norm` deltas in percentage points.
     With `verbose`, shows a per-config progress bar and prints each config's result.
+    `on_result(name, result)`, if given, is called as each config completes — use it
+    to persist partial results so a mid-run crash doesn't lose finished configs.
     """
     device = next(model.parameters()).device
     tok_examples = _tokenize_examples(tokenizer, examples)  # tokenize once, reuse
         results[name] = r
         if verbose:
             print(f"  [{name}] acc={r['acc']:.4f} acc_norm={r['acc_norm']:.4f}", flush=True)
+        if on_result is not None:
+            on_result(name, r)
     if "baseline" in results:
         b = results["baseline"]
         for name, r in results.items():

scripts/run_eval.py CHANGED Viewed

@@ -21,6 +21,7 @@ import argparse
 import json
 import sys
 import time
 from pathlib import Path
 sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
@@ -75,7 +76,9 @@ def main() -> None:
     p.add_argument("--tiny", action="store_true",
                    help="use a tiny random-weight model instead of real weights (plumbing check)")
     p.add_argument("--tiny-layers", type=int, default=8, help="number of layers for the --tiny model")
-    p.add_argument("--output", default=None, help="path to write results JSON (default: don't save)")
     args = p.parse_args()
     ks = [int(x) for x in args.ks.split(",")]
@@ -94,27 +97,42 @@ def main() -> None:
     print(f"model: {'tiny' if args.tiny else args.model} | layers={num_layers} | loop window={window}")
     print(f"configs: {list(configs)}")
-    all_results = {}
     for task in tasks:
         examples = DATASET_LOADERS[task](limit=args.limit)
         print(f"\n=== {task} ({len(examples)} items) ===")
         t0 = time.time()
-        results = run_matrix(model, tokenizer, examples, configs)
-        dt = time.time() - t0
-        all_results[task] = results
         print(f"  {'config':<12} {'acc':>7} {'acc_norm':>9} {'d_acc(pp)':>10} {'d_norm(pp)':>11}")
         for name, r in results.items():
             print(
                 f"  {name:<12} {r['acc']:>7.4f} {r['acc_norm']:>9.4f} "
                 f"{r.get('d_acc', 0.0):>+10.2f} {r.get('d_acc_norm', 0.0):>+11.2f}"
             )
-        print(f"  ({dt:.1f}s)")
-    if args.output:
-        meta = {"model": "tiny" if args.tiny else args.model, "num_layers": num_layers,
-                "window": list(window), "ks": ks, "tasks": tasks, "limit": args.limit}
-        Path(args.output).write_text(json.dumps({"meta": meta, "results": all_results}, indent=2))
-        print(f"\nwrote {args.output}")
 if __name__ == "__main__":

 import json
 import sys
 import time
+from datetime import datetime
 from pathlib import Path
 sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
     p.add_argument("--tiny", action="store_true",
                    help="use a tiny random-weight model instead of real weights (plumbing check)")
     p.add_argument("--tiny-layers", type=int, default=8, help="number of layers for the --tiny model")
+    p.add_argument("--output", default=None,
+                   help="results JSON path (default: auto-named results_eval_<timestamp>.json)")
+    p.add_argument("--no-save", action="store_true", help="do not write results to disk")
     args = p.parse_args()
     ks = [int(x) for x in args.ks.split(",")]
     print(f"model: {'tiny' if args.tiny else args.model} | layers={num_layers} | loop window={window}")
     print(f"configs: {list(configs)}")
+    # Always persist (incrementally) unless --no-save, so a crash never loses finished work.
+    save_path = None if args.no_save else (
+        args.output or f"results_eval_{datetime.now().strftime('%Y%m%d-%H%M%S')}.json"
+    )
+    meta = {"model": "tiny" if args.tiny else args.model, "num_layers": num_layers,
+            "window": list(window), "ks": ks, "tasks": tasks, "limit": args.limit}
+    all_results: dict = {}
+    def save() -> None:
+        if save_path:
+            Path(save_path).write_text(json.dumps({"meta": meta, "results": all_results}, indent=2))
+    if save_path:
+        print(f"saving results to {save_path} (updated after every config)")
     for task in tasks:
         examples = DATASET_LOADERS[task](limit=args.limit)
         print(f"\n=== {task} ({len(examples)} items) ===")
+        all_results[task] = {}  # partial configs land here as they finish
         t0 = time.time()
+        results = run_matrix(
+            model, tokenizer, examples, configs,
+            on_result=lambda name, r, t=task: (all_results[t].__setitem__(name, r), save()),
+        )
+        all_results[task] = results  # full results, now with deltas
+        save()
         print(f"  {'config':<12} {'acc':>7} {'acc_norm':>9} {'d_acc(pp)':>10} {'d_norm(pp)':>11}")
         for name, r in results.items():
             print(
                 f"  {name:<12} {r['acc']:>7.4f} {r['acc_norm']:>9.4f} "
                 f"{r.get('d_acc', 0.0):>+10.2f} {r.get('d_acc_norm', 0.0):>+11.2f}"
             )
+        print(f"  ({time.time() - t0:.1f}s)")
+    if save_path:
+        print(f"\nresults saved to {save_path}")
 if __name__ == "__main__":

scripts/run_global_vs_sliding_loop.py CHANGED Viewed

@@ -27,6 +27,7 @@ import argparse
 import json
 import sys
 import time
 from pathlib import Path
 sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
@@ -74,7 +75,9 @@ def main() -> None:
     p.add_argument("--device", default="cuda", help="device or device_map: cuda/cpu/auto")
     p.add_argument("--tiny", action="store_true", help="use a tiny random-weight model (plumbing check)")
     p.add_argument("--tiny-layers", type=int, default=8, help="layers for the --tiny model")
-    p.add_argument("--output", default=None, help="path to write results JSON")
     args = p.parse_args()
     tasks = [t.strip() for t in args.tasks.split(",")]
@@ -98,13 +101,33 @@ def main() -> None:
             types = [lt[i][0] for i in cfg.loop_layers]  # 'f' (full) / 's' (sliding)
             print(f"  {name:<24} layers={list(cfg.loop_layers)} types={types}")
-    all_results = {}
     for task in tasks:
         examples = DATASET_LOADERS[task](limit=args.limit)
         print(f"\n=== {task} ({len(examples)} items) ===")
         t0 = time.time()
-        results = run_matrix(model, tokenizer, examples, configs)
         all_results[task] = results
         print(f"  {'config':<24} {'acc':>7} {'acc_norm':>9} {'d_acc(pp)':>10} {'d_norm(pp)':>11}")
         for name, r in results.items():
             print(
@@ -113,12 +136,8 @@ def main() -> None:
             )
         print(f"  ({time.time() - t0:.1f}s)")
-    if args.output:
-        meta = {"model": "tiny" if args.tiny else args.model, "k": args.k,
-                "configs": {n: (None if c is None else list(c.loop_layers)) for n, c in configs.items()},
-                "tasks": tasks, "limit": args.limit}
-        Path(args.output).write_text(json.dumps({"meta": meta, "results": all_results}, indent=2))
-        print(f"\nwrote {args.output}")
 if __name__ == "__main__":

 import json
 import sys
 import time
+from datetime import datetime
 from pathlib import Path
 sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
     p.add_argument("--device", default="cuda", help="device or device_map: cuda/cpu/auto")
     p.add_argument("--tiny", action="store_true", help="use a tiny random-weight model (plumbing check)")
     p.add_argument("--tiny-layers", type=int, default=8, help="layers for the --tiny model")
+    p.add_argument("--output", default=None,
+                   help="results JSON path (default: auto-named results_gvs_<timestamp>.json)")
+    p.add_argument("--no-save", action="store_true", help="do not write results to disk")
     args = p.parse_args()
     tasks = [t.strip() for t in args.tasks.split(",")]
             types = [lt[i][0] for i in cfg.loop_layers]  # 'f' (full) / 's' (sliding)
             print(f"  {name:<24} layers={list(cfg.loop_layers)} types={types}")
+    # Always persist (incrementally) unless --no-save, so a crash never loses finished work.
+    save_path = None if args.no_save else (
+        args.output or f"results_gvs_{datetime.now().strftime('%Y%m%d-%H%M%S')}.json"
+    )
+    meta = {"model": "tiny" if args.tiny else args.model, "k": args.k,
+            "configs": {n: (None if c is None else list(c.loop_layers)) for n, c in configs.items()},
+            "tasks": tasks, "limit": args.limit}
+    all_results: dict = {}
+    def save() -> None:
+        if save_path:
+            Path(save_path).write_text(json.dumps({"meta": meta, "results": all_results}, indent=2))
+    if save_path:
+        print(f"saving results to {save_path} (updated after every config)")
     for task in tasks:
         examples = DATASET_LOADERS[task](limit=args.limit)
         print(f"\n=== {task} ({len(examples)} items) ===")
+        all_results[task] = {}
         t0 = time.time()
+        results = run_matrix(
+            model, tokenizer, examples, configs,
+            on_result=lambda name, r, t=task: (all_results[t].__setitem__(name, r), save()),
+        )
         all_results[task] = results
+        save()
         print(f"  {'config':<24} {'acc':>7} {'acc_norm':>9} {'d_acc(pp)':>10} {'d_norm(pp)':>11}")
         for name, r in results.items():
             print(
             )
         print(f"  ({time.time() - t0:.1f}s)")
+    if save_path:
+        print(f"\nresults saved to {save_path}")
 if __name__ == "__main__":

scripts/smoke_real.py CHANGED Viewed

@@ -15,7 +15,9 @@ One command:  uv run python scripts/smoke_real.py
 from __future__ import annotations
 import argparse
 import sys
 from pathlib import Path
 sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
@@ -37,8 +39,21 @@ def main() -> None:
     p.add_argument("--width", type=int, default=4, help="loop window width in layers")
     p.add_argument("--center", type=float, default=0.5, help="loop window center as a depth fraction (0-1)")
     p.add_argument("--no-generate", action="store_true", help="skip the greedy-generation coherence check")
     args = p.parse_args()
     from transformers import AutoModelForCausalLM, AutoTokenizer
     print(f"loading {args.model} ({args.dtype}) on {args.device} ...")
@@ -49,11 +64,14 @@ def main() -> None:
     model.eval()
     device = next(model.parameters()).device
     n = model.config.num_hidden_layers
     if device.type == "cuda":
-        mem = torch.cuda.max_memory_allocated() / 1e9
-        print(f"loaded: {n} layers, peak GPU mem after load ~{mem:.1f} GB")
     window = LoopConfig.from_depth_fraction(n, center_frac=args.center, width=args.width).window
     print(f"loop window = {window} (depth fraction ~{args.center})")
     prompt = "Question: What is the capital of France?\nAnswer:"
@@ -67,17 +85,21 @@ def main() -> None:
     with torch.no_grad():
         base1 = fwd_logits()
     top = base1[0, -1].topk(5).indices.tolist()
-    print(f"\n[0] baseline top-5 next tokens: {[repr(tok.decode([t])) for t in top]}")
     if not args.no_generate:
         unpatch(model)
         with torch.no_grad():
             gen = model.generate(ids, max_new_tokens=20, do_sample=False)
-        print(f"    greedy continuation: {tok.decode(gen[0, ids.shape[1]:], skip_special_tokens=True)!r}")
     # noise floor: baseline vs itself (kernel run-to-run nondeterminism).
     with torch.no_grad():
         base2 = fwd_logits()
     noise = (base1 - base2).abs().max().item()
     print(f"\n[1] kernel noise floor (baseline vs baseline): max|diff| = {noise:.2e}")
     # [1] K=1 patched must match baseline to within the noise floor.
@@ -87,11 +109,14 @@ def main() -> None:
     unpatch(model)
     d_k1 = (k1 - base1).abs().max().item()
     tol = max(noise * 4, 1e-3)
     print(f"    K=1 patched vs baseline:                 max|diff| = {d_k1:.2e} (tol {tol:.1e})")
     assert d_k1 <= tol, f"K=1 patch is NOT a faithful no-op on real kernels (diff {d_k1:.2e} > {tol:.1e})"
     # [2] K>1 looping must be active (finite, and clearly above the noise floor).
     print("\n[2] K>1 looping is active (diff should exceed the noise floor):")
     for mode in ("layer", "block"):
         for K in (2, 3):
             patch(model, LoopConfig(window=window, K=K, mode=mode))
@@ -99,7 +124,9 @@ def main() -> None:
                 out = fwd_logits()
             unpatch(model)
             d = (out - base1).abs().max().item()
-            finite = torch.isfinite(out).all().item()
             assert finite, f"{mode} K={K}: non-finite logits"
             assert d > tol, f"{mode} K={K}: indistinguishable from baseline (loop not active?)"
             print(f"    {mode}-rk-K{K}: max|diff| = {d:.3f}, finite={finite}  OK")
@@ -111,10 +138,15 @@ def main() -> None:
             model(input_ids=ids, use_cache=True)
         raise AssertionError("expected NotImplementedError for use_cache=True")
     except NotImplementedError:
         print("\n[3] decode-time (use_cache=True) correctly guarded: OK")
     unpatch(model)
     print("\nREAL-WEIGHTS SMOKE TEST PASSED — safe to run scripts/run_eval.py")
 if __name__ == "__main__":

 from __future__ import annotations
 import argparse
+import json
 import sys
+from datetime import datetime
 from pathlib import Path
 sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
     p.add_argument("--width", type=int, default=4, help="loop window width in layers")
     p.add_argument("--center", type=float, default=0.5, help="loop window center as a depth fraction (0-1)")
     p.add_argument("--no-generate", action="store_true", help="skip the greedy-generation coherence check")
+    p.add_argument("--output", default=None,
+                   help="diagnostics JSON path (default: auto-named results_smoke_real_<timestamp>.json)")
+    p.add_argument("--no-save", action="store_true", help="do not write diagnostics to disk")
     args = p.parse_args()
+    # Persist diagnostics incrementally (even before an assert that might fail).
+    save_path = None if args.no_save else (
+        args.output or f"results_smoke_real_{datetime.now().strftime('%Y%m%d-%H%M%S')}.json"
+    )
+    diag: dict = {"model": args.model, "dtype": args.dtype, "device": args.device, "passed": False}
+    def save() -> None:
+        if save_path:
+            Path(save_path).write_text(json.dumps(diag, indent=2))
     from transformers import AutoModelForCausalLM, AutoTokenizer
     print(f"loading {args.model} ({args.dtype}) on {args.device} ...")
     model.eval()
     device = next(model.parameters()).device
     n = model.config.num_hidden_layers
+    diag["num_layers"] = n
     if device.type == "cuda":
+        diag["peak_gpu_gb"] = round(torch.cuda.max_memory_allocated() / 1e9, 1)
+        print(f"loaded: {n} layers, peak GPU mem after load ~{diag['peak_gpu_gb']} GB")
+    save()
     window = LoopConfig.from_depth_fraction(n, center_frac=args.center, width=args.width).window
+    diag["window"] = list(window)
     print(f"loop window = {window} (depth fraction ~{args.center})")
     prompt = "Question: What is the capital of France?\nAnswer:"
     with torch.no_grad():
         base1 = fwd_logits()
     top = base1[0, -1].topk(5).indices.tolist()
+    diag["top5_tokens"] = [tok.decode([t]) for t in top]
+    print(f"\n[0] baseline top-5 next tokens: {[repr(t) for t in diag['top5_tokens']]}")
     if not args.no_generate:
         unpatch(model)
         with torch.no_grad():
             gen = model.generate(ids, max_new_tokens=20, do_sample=False)
+        diag["greedy_continuation"] = tok.decode(gen[0, ids.shape[1]:], skip_special_tokens=True)
+        print(f"    greedy continuation: {diag['greedy_continuation']!r}")
+    save()
     # noise floor: baseline vs itself (kernel run-to-run nondeterminism).
     with torch.no_grad():
         base2 = fwd_logits()
     noise = (base1 - base2).abs().max().item()
+    diag["noise_floor"] = noise
     print(f"\n[1] kernel noise floor (baseline vs baseline): max|diff| = {noise:.2e}")
     # [1] K=1 patched must match baseline to within the noise floor.
     unpatch(model)
     d_k1 = (k1 - base1).abs().max().item()
     tol = max(noise * 4, 1e-3)
+    diag["k1_diff"], diag["tol"] = d_k1, tol
     print(f"    K=1 patched vs baseline:                 max|diff| = {d_k1:.2e} (tol {tol:.1e})")
+    save()
     assert d_k1 <= tol, f"K=1 patch is NOT a faithful no-op on real kernels (diff {d_k1:.2e} > {tol:.1e})"
     # [2] K>1 looping must be active (finite, and clearly above the noise floor).
     print("\n[2] K>1 looping is active (diff should exceed the noise floor):")
+    diag["k_active"] = {}
     for mode in ("layer", "block"):
         for K in (2, 3):
             patch(model, LoopConfig(window=window, K=K, mode=mode))
                 out = fwd_logits()
             unpatch(model)
             d = (out - base1).abs().max().item()
+            finite = bool(torch.isfinite(out).all().item())
+            diag["k_active"][f"{mode}-rk-K{K}"] = {"diff": d, "finite": finite}
+            save()
             assert finite, f"{mode} K={K}: non-finite logits"
             assert d > tol, f"{mode} K={K}: indistinguishable from baseline (loop not active?)"
             print(f"    {mode}-rk-K{K}: max|diff| = {d:.3f}, finite={finite}  OK")
             model(input_ids=ids, use_cache=True)
         raise AssertionError("expected NotImplementedError for use_cache=True")
     except NotImplementedError:
+        diag["decode_guard"] = "ok"
         print("\n[3] decode-time (use_cache=True) correctly guarded: OK")
     unpatch(model)
+    diag["passed"] = True
+    save()
     print("\nREAL-WEIGHTS SMOKE TEST PASSED — safe to run scripts/run_eval.py")
+    if save_path:
+        print(f"diagnostics saved to {save_path}")
 if __name__ == "__main__":