e-p committed on
Commit
de93199
·
1 Parent(s): 20c019c

save experiment results

Browse files
looped_laguna/eval.py CHANGED
@@ -136,13 +136,20 @@ def _assert_loop_active(model, tok_examples: list[_TokExample], configs, device)
136
 
137
 
138
  def run_matrix(
139
- model, tokenizer, examples: list[MCExample], configs: dict[str, LoopConfig | None], verbose: bool = True
 
 
 
 
 
140
  ) -> dict[str, dict]:
141
  """Run several named configs on the same examples; report metrics + delta vs baseline.
142
 
143
  `configs` maps name -> LoopConfig (or None for baseline). If a "baseline" key is
144
  present, each row also gets `d_acc` / `d_acc_norm` deltas in percentage points.
145
  With `verbose`, shows a per-config progress bar and prints each config's result.
 
 
146
  """
147
  device = next(model.parameters()).device
148
  tok_examples = _tokenize_examples(tokenizer, examples) # tokenize once, reuse
@@ -153,6 +160,8 @@ def run_matrix(
153
  results[name] = r
154
  if verbose:
155
  print(f" [{name}] acc={r['acc']:.4f} acc_norm={r['acc_norm']:.4f}", flush=True)
 
 
156
  if "baseline" in results:
157
  b = results["baseline"]
158
  for name, r in results.items():
 
136
 
137
 
138
  def run_matrix(
139
+ model,
140
+ tokenizer,
141
+ examples: list[MCExample],
142
+ configs: dict[str, LoopConfig | None],
143
+ verbose: bool = True,
144
+ on_result=None,
145
  ) -> dict[str, dict]:
146
  """Run several named configs on the same examples; report metrics + delta vs baseline.
147
 
148
  `configs` maps name -> LoopConfig (or None for baseline). If a "baseline" key is
149
  present, each row also gets `d_acc` / `d_acc_norm` deltas in percentage points.
150
  With `verbose`, shows a per-config progress bar and prints each config's result.
151
+ `on_result(name, result)`, if given, is called as each config completes — use it
152
+ to persist partial results so a mid-run crash doesn't lose finished configs.
153
  """
154
  device = next(model.parameters()).device
155
  tok_examples = _tokenize_examples(tokenizer, examples) # tokenize once, reuse
 
160
  results[name] = r
161
  if verbose:
162
  print(f" [{name}] acc={r['acc']:.4f} acc_norm={r['acc_norm']:.4f}", flush=True)
163
+ if on_result is not None:
164
+ on_result(name, r)
165
  if "baseline" in results:
166
  b = results["baseline"]
167
  for name, r in results.items():
scripts/run_eval.py CHANGED
@@ -21,6 +21,7 @@ import argparse
21
  import json
22
  import sys
23
  import time
 
24
  from pathlib import Path
25
 
26
  sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
@@ -75,7 +76,9 @@ def main() -> None:
75
  p.add_argument("--tiny", action="store_true",
76
  help="use a tiny random-weight model instead of real weights (plumbing check)")
77
  p.add_argument("--tiny-layers", type=int, default=8, help="number of layers for the --tiny model")
78
- p.add_argument("--output", default=None, help="path to write results JSON (default: don't save)")
 
 
79
  args = p.parse_args()
80
 
81
  ks = [int(x) for x in args.ks.split(",")]
@@ -94,27 +97,42 @@ def main() -> None:
94
  print(f"model: {'tiny' if args.tiny else args.model} | layers={num_layers} | loop window={window}")
95
  print(f"configs: {list(configs)}")
96
 
97
- all_results = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  for task in tasks:
99
  examples = DATASET_LOADERS[task](limit=args.limit)
100
  print(f"\n=== {task} ({len(examples)} items) ===")
 
101
  t0 = time.time()
102
- results = run_matrix(model, tokenizer, examples, configs)
103
- dt = time.time() - t0
104
- all_results[task] = results
 
 
 
105
  print(f" {'config':<12} {'acc':>7} {'acc_norm':>9} {'d_acc(pp)':>10} {'d_norm(pp)':>11}")
106
  for name, r in results.items():
107
  print(
108
  f" {name:<12} {r['acc']:>7.4f} {r['acc_norm']:>9.4f} "
109
  f"{r.get('d_acc', 0.0):>+10.2f} {r.get('d_acc_norm', 0.0):>+11.2f}"
110
  )
111
- print(f" ({dt:.1f}s)")
112
 
113
- if args.output:
114
- meta = {"model": "tiny" if args.tiny else args.model, "num_layers": num_layers,
115
- "window": list(window), "ks": ks, "tasks": tasks, "limit": args.limit}
116
- Path(args.output).write_text(json.dumps({"meta": meta, "results": all_results}, indent=2))
117
- print(f"\nwrote {args.output}")
118
 
119
 
120
  if __name__ == "__main__":
 
21
  import json
22
  import sys
23
  import time
24
+ from datetime import datetime
25
  from pathlib import Path
26
 
27
  sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
 
76
  p.add_argument("--tiny", action="store_true",
77
  help="use a tiny random-weight model instead of real weights (plumbing check)")
78
  p.add_argument("--tiny-layers", type=int, default=8, help="number of layers for the --tiny model")
79
+ p.add_argument("--output", default=None,
80
+ help="results JSON path (default: auto-named results_eval_<timestamp>.json)")
81
+ p.add_argument("--no-save", action="store_true", help="do not write results to disk")
82
  args = p.parse_args()
83
 
84
  ks = [int(x) for x in args.ks.split(",")]
 
97
  print(f"model: {'tiny' if args.tiny else args.model} | layers={num_layers} | loop window={window}")
98
  print(f"configs: {list(configs)}")
99
 
100
+ # Always persist (incrementally) unless --no-save, so a crash never loses finished work.
101
+ save_path = None if args.no_save else (
102
+ args.output or f"results_eval_{datetime.now().strftime('%Y%m%d-%H%M%S')}.json"
103
+ )
104
+ meta = {"model": "tiny" if args.tiny else args.model, "num_layers": num_layers,
105
+ "window": list(window), "ks": ks, "tasks": tasks, "limit": args.limit}
106
+ all_results: dict = {}
107
+
108
+ def save() -> None:
109
+ if save_path:
110
+ Path(save_path).write_text(json.dumps({"meta": meta, "results": all_results}, indent=2))
111
+
112
+ if save_path:
113
+ print(f"saving results to {save_path} (updated after every config)")
114
+
115
  for task in tasks:
116
  examples = DATASET_LOADERS[task](limit=args.limit)
117
  print(f"\n=== {task} ({len(examples)} items) ===")
118
+ all_results[task] = {} # partial configs land here as they finish
119
  t0 = time.time()
120
+ results = run_matrix(
121
+ model, tokenizer, examples, configs,
122
+ on_result=lambda name, r, t=task: (all_results[t].__setitem__(name, r), save()),
123
+ )
124
+ all_results[task] = results # full results, now with deltas
125
+ save()
126
  print(f" {'config':<12} {'acc':>7} {'acc_norm':>9} {'d_acc(pp)':>10} {'d_norm(pp)':>11}")
127
  for name, r in results.items():
128
  print(
129
  f" {name:<12} {r['acc']:>7.4f} {r['acc_norm']:>9.4f} "
130
  f"{r.get('d_acc', 0.0):>+10.2f} {r.get('d_acc_norm', 0.0):>+11.2f}"
131
  )
132
+ print(f" ({time.time() - t0:.1f}s)")
133
 
134
+ if save_path:
135
+ print(f"\nresults saved to {save_path}")
 
 
 
136
 
137
 
138
  if __name__ == "__main__":
scripts/run_global_vs_sliding_loop.py CHANGED
@@ -27,6 +27,7 @@ import argparse
27
  import json
28
  import sys
29
  import time
 
30
  from pathlib import Path
31
 
32
  sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
@@ -74,7 +75,9 @@ def main() -> None:
74
  p.add_argument("--device", default="cuda", help="device or device_map: cuda/cpu/auto")
75
  p.add_argument("--tiny", action="store_true", help="use a tiny random-weight model (plumbing check)")
76
  p.add_argument("--tiny-layers", type=int, default=8, help="layers for the --tiny model")
77
- p.add_argument("--output", default=None, help="path to write results JSON")
 
 
78
  args = p.parse_args()
79
 
80
  tasks = [t.strip() for t in args.tasks.split(",")]
@@ -98,13 +101,33 @@ def main() -> None:
98
  types = [lt[i][0] for i in cfg.loop_layers] # 'f' (full) / 's' (sliding)
99
  print(f" {name:<24} layers={list(cfg.loop_layers)} types={types}")
100
 
101
- all_results = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  for task in tasks:
103
  examples = DATASET_LOADERS[task](limit=args.limit)
104
  print(f"\n=== {task} ({len(examples)} items) ===")
 
105
  t0 = time.time()
106
- results = run_matrix(model, tokenizer, examples, configs)
 
 
 
107
  all_results[task] = results
 
108
  print(f" {'config':<24} {'acc':>7} {'acc_norm':>9} {'d_acc(pp)':>10} {'d_norm(pp)':>11}")
109
  for name, r in results.items():
110
  print(
@@ -113,12 +136,8 @@ def main() -> None:
113
  )
114
  print(f" ({time.time() - t0:.1f}s)")
115
 
116
- if args.output:
117
- meta = {"model": "tiny" if args.tiny else args.model, "k": args.k,
118
- "configs": {n: (None if c is None else list(c.loop_layers)) for n, c in configs.items()},
119
- "tasks": tasks, "limit": args.limit}
120
- Path(args.output).write_text(json.dumps({"meta": meta, "results": all_results}, indent=2))
121
- print(f"\nwrote {args.output}")
122
 
123
 
124
  if __name__ == "__main__":
 
27
  import json
28
  import sys
29
  import time
30
+ from datetime import datetime
31
  from pathlib import Path
32
 
33
  sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
 
75
  p.add_argument("--device", default="cuda", help="device or device_map: cuda/cpu/auto")
76
  p.add_argument("--tiny", action="store_true", help="use a tiny random-weight model (plumbing check)")
77
  p.add_argument("--tiny-layers", type=int, default=8, help="layers for the --tiny model")
78
+ p.add_argument("--output", default=None,
79
+ help="results JSON path (default: auto-named results_gvs_<timestamp>.json)")
80
+ p.add_argument("--no-save", action="store_true", help="do not write results to disk")
81
  args = p.parse_args()
82
 
83
  tasks = [t.strip() for t in args.tasks.split(",")]
 
101
  types = [lt[i][0] for i in cfg.loop_layers] # 'f' (full) / 's' (sliding)
102
  print(f" {name:<24} layers={list(cfg.loop_layers)} types={types}")
103
 
104
+ # Always persist (incrementally) unless --no-save, so a crash never loses finished work.
105
+ save_path = None if args.no_save else (
106
+ args.output or f"results_gvs_{datetime.now().strftime('%Y%m%d-%H%M%S')}.json"
107
+ )
108
+ meta = {"model": "tiny" if args.tiny else args.model, "k": args.k,
109
+ "configs": {n: (None if c is None else list(c.loop_layers)) for n, c in configs.items()},
110
+ "tasks": tasks, "limit": args.limit}
111
+ all_results: dict = {}
112
+
113
+ def save() -> None:
114
+ if save_path:
115
+ Path(save_path).write_text(json.dumps({"meta": meta, "results": all_results}, indent=2))
116
+
117
+ if save_path:
118
+ print(f"saving results to {save_path} (updated after every config)")
119
+
120
  for task in tasks:
121
  examples = DATASET_LOADERS[task](limit=args.limit)
122
  print(f"\n=== {task} ({len(examples)} items) ===")
123
+ all_results[task] = {}
124
  t0 = time.time()
125
+ results = run_matrix(
126
+ model, tokenizer, examples, configs,
127
+ on_result=lambda name, r, t=task: (all_results[t].__setitem__(name, r), save()),
128
+ )
129
  all_results[task] = results
130
+ save()
131
  print(f" {'config':<24} {'acc':>7} {'acc_norm':>9} {'d_acc(pp)':>10} {'d_norm(pp)':>11}")
132
  for name, r in results.items():
133
  print(
 
136
  )
137
  print(f" ({time.time() - t0:.1f}s)")
138
 
139
+ if save_path:
140
+ print(f"\nresults saved to {save_path}")
 
 
 
 
141
 
142
 
143
  if __name__ == "__main__":
scripts/smoke_real.py CHANGED
@@ -15,7 +15,9 @@ One command: uv run python scripts/smoke_real.py
15
  from __future__ import annotations
16
 
17
  import argparse
 
18
  import sys
 
19
  from pathlib import Path
20
 
21
  sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
@@ -37,8 +39,21 @@ def main() -> None:
37
  p.add_argument("--width", type=int, default=4, help="loop window width in layers")
38
  p.add_argument("--center", type=float, default=0.5, help="loop window center as a depth fraction (0-1)")
39
  p.add_argument("--no-generate", action="store_true", help="skip the greedy-generation coherence check")
 
 
 
40
  args = p.parse_args()
41
 
 
 
 
 
 
 
 
 
 
 
42
  from transformers import AutoModelForCausalLM, AutoTokenizer
43
 
44
  print(f"loading {args.model} ({args.dtype}) on {args.device} ...")
@@ -49,11 +64,14 @@ def main() -> None:
49
  model.eval()
50
  device = next(model.parameters()).device
51
  n = model.config.num_hidden_layers
 
52
  if device.type == "cuda":
53
- mem = torch.cuda.max_memory_allocated() / 1e9
54
- print(f"loaded: {n} layers, peak GPU mem after load ~{mem:.1f} GB")
 
55
 
56
  window = LoopConfig.from_depth_fraction(n, center_frac=args.center, width=args.width).window
 
57
  print(f"loop window = {window} (depth fraction ~{args.center})")
58
 
59
  prompt = "Question: What is the capital of France?\nAnswer:"
@@ -67,17 +85,21 @@ def main() -> None:
67
  with torch.no_grad():
68
  base1 = fwd_logits()
69
  top = base1[0, -1].topk(5).indices.tolist()
70
- print(f"\n[0] baseline top-5 next tokens: {[repr(tok.decode([t])) for t in top]}")
 
71
  if not args.no_generate:
72
  unpatch(model)
73
  with torch.no_grad():
74
  gen = model.generate(ids, max_new_tokens=20, do_sample=False)
75
- print(f" greedy continuation: {tok.decode(gen[0, ids.shape[1]:], skip_special_tokens=True)!r}")
 
 
76
 
77
  # noise floor: baseline vs itself (kernel run-to-run nondeterminism).
78
  with torch.no_grad():
79
  base2 = fwd_logits()
80
  noise = (base1 - base2).abs().max().item()
 
81
  print(f"\n[1] kernel noise floor (baseline vs baseline): max|diff| = {noise:.2e}")
82
 
83
  # [1] K=1 patched must match baseline to within the noise floor.
@@ -87,11 +109,14 @@ def main() -> None:
87
  unpatch(model)
88
  d_k1 = (k1 - base1).abs().max().item()
89
  tol = max(noise * 4, 1e-3)
 
90
  print(f" K=1 patched vs baseline: max|diff| = {d_k1:.2e} (tol {tol:.1e})")
 
91
  assert d_k1 <= tol, f"K=1 patch is NOT a faithful no-op on real kernels (diff {d_k1:.2e} > {tol:.1e})"
92
 
93
  # [2] K>1 looping must be active (finite, and clearly above the noise floor).
94
  print("\n[2] K>1 looping is active (diff should exceed the noise floor):")
 
95
  for mode in ("layer", "block"):
96
  for K in (2, 3):
97
  patch(model, LoopConfig(window=window, K=K, mode=mode))
@@ -99,7 +124,9 @@ def main() -> None:
99
  out = fwd_logits()
100
  unpatch(model)
101
  d = (out - base1).abs().max().item()
102
- finite = torch.isfinite(out).all().item()
 
 
103
  assert finite, f"{mode} K={K}: non-finite logits"
104
  assert d > tol, f"{mode} K={K}: indistinguishable from baseline (loop not active?)"
105
  print(f" {mode}-rk-K{K}: max|diff| = {d:.3f}, finite={finite} OK")
@@ -111,10 +138,15 @@ def main() -> None:
111
  model(input_ids=ids, use_cache=True)
112
  raise AssertionError("expected NotImplementedError for use_cache=True")
113
  except NotImplementedError:
 
114
  print("\n[3] decode-time (use_cache=True) correctly guarded: OK")
115
  unpatch(model)
116
 
 
 
117
  print("\nREAL-WEIGHTS SMOKE TEST PASSED — safe to run scripts/run_eval.py")
 
 
118
 
119
 
120
  if __name__ == "__main__":
 
15
  from __future__ import annotations
16
 
17
  import argparse
18
+ import json
19
  import sys
20
+ from datetime import datetime
21
  from pathlib import Path
22
 
23
  sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
 
39
  p.add_argument("--width", type=int, default=4, help="loop window width in layers")
40
  p.add_argument("--center", type=float, default=0.5, help="loop window center as a depth fraction (0-1)")
41
  p.add_argument("--no-generate", action="store_true", help="skip the greedy-generation coherence check")
42
+ p.add_argument("--output", default=None,
43
+ help="diagnostics JSON path (default: auto-named results_smoke_real_<timestamp>.json)")
44
+ p.add_argument("--no-save", action="store_true", help="do not write diagnostics to disk")
45
  args = p.parse_args()
46
 
47
+ # Persist diagnostics incrementally (even before an assert that might fail).
48
+ save_path = None if args.no_save else (
49
+ args.output or f"results_smoke_real_{datetime.now().strftime('%Y%m%d-%H%M%S')}.json"
50
+ )
51
+ diag: dict = {"model": args.model, "dtype": args.dtype, "device": args.device, "passed": False}
52
+
53
+ def save() -> None:
54
+ if save_path:
55
+ Path(save_path).write_text(json.dumps(diag, indent=2))
56
+
57
  from transformers import AutoModelForCausalLM, AutoTokenizer
58
 
59
  print(f"loading {args.model} ({args.dtype}) on {args.device} ...")
 
64
  model.eval()
65
  device = next(model.parameters()).device
66
  n = model.config.num_hidden_layers
67
+ diag["num_layers"] = n
68
  if device.type == "cuda":
69
+ diag["peak_gpu_gb"] = round(torch.cuda.max_memory_allocated() / 1e9, 1)
70
+ print(f"loaded: {n} layers, peak GPU mem after load ~{diag['peak_gpu_gb']} GB")
71
+ save()
72
 
73
  window = LoopConfig.from_depth_fraction(n, center_frac=args.center, width=args.width).window
74
+ diag["window"] = list(window)
75
  print(f"loop window = {window} (depth fraction ~{args.center})")
76
 
77
  prompt = "Question: What is the capital of France?\nAnswer:"
 
85
  with torch.no_grad():
86
  base1 = fwd_logits()
87
  top = base1[0, -1].topk(5).indices.tolist()
88
+ diag["top5_tokens"] = [tok.decode([t]) for t in top]
89
+ print(f"\n[0] baseline top-5 next tokens: {[repr(t) for t in diag['top5_tokens']]}")
90
  if not args.no_generate:
91
  unpatch(model)
92
  with torch.no_grad():
93
  gen = model.generate(ids, max_new_tokens=20, do_sample=False)
94
+ diag["greedy_continuation"] = tok.decode(gen[0, ids.shape[1]:], skip_special_tokens=True)
95
+ print(f" greedy continuation: {diag['greedy_continuation']!r}")
96
+ save()
97
 
98
  # noise floor: baseline vs itself (kernel run-to-run nondeterminism).
99
  with torch.no_grad():
100
  base2 = fwd_logits()
101
  noise = (base1 - base2).abs().max().item()
102
+ diag["noise_floor"] = noise
103
  print(f"\n[1] kernel noise floor (baseline vs baseline): max|diff| = {noise:.2e}")
104
 
105
  # [1] K=1 patched must match baseline to within the noise floor.
 
109
  unpatch(model)
110
  d_k1 = (k1 - base1).abs().max().item()
111
  tol = max(noise * 4, 1e-3)
112
+ diag["k1_diff"], diag["tol"] = d_k1, tol
113
  print(f" K=1 patched vs baseline: max|diff| = {d_k1:.2e} (tol {tol:.1e})")
114
+ save()
115
  assert d_k1 <= tol, f"K=1 patch is NOT a faithful no-op on real kernels (diff {d_k1:.2e} > {tol:.1e})"
116
 
117
  # [2] K>1 looping must be active (finite, and clearly above the noise floor).
118
  print("\n[2] K>1 looping is active (diff should exceed the noise floor):")
119
+ diag["k_active"] = {}
120
  for mode in ("layer", "block"):
121
  for K in (2, 3):
122
  patch(model, LoopConfig(window=window, K=K, mode=mode))
 
124
  out = fwd_logits()
125
  unpatch(model)
126
  d = (out - base1).abs().max().item()
127
+ finite = bool(torch.isfinite(out).all().item())
128
+ diag["k_active"][f"{mode}-rk-K{K}"] = {"diff": d, "finite": finite}
129
+ save()
130
  assert finite, f"{mode} K={K}: non-finite logits"
131
  assert d > tol, f"{mode} K={K}: indistinguishable from baseline (loop not active?)"
132
  print(f" {mode}-rk-K{K}: max|diff| = {d:.3f}, finite={finite} OK")
 
138
  model(input_ids=ids, use_cache=True)
139
  raise AssertionError("expected NotImplementedError for use_cache=True")
140
  except NotImplementedError:
141
+ diag["decode_guard"] = "ok"
142
  print("\n[3] decode-time (use_cache=True) correctly guarded: OK")
143
  unpatch(model)
144
 
145
+ diag["passed"] = True
146
+ save()
147
  print("\nREAL-WEIGHTS SMOKE TEST PASSED — safe to run scripts/run_eval.py")
148
+ if save_path:
149
+ print(f"diagnostics saved to {save_path}")
150
 
151
 
152
  if __name__ == "__main__":