theapemachine committed on
Commit
fed2333
·
verified ·
1 Parent(s): e014dff

Add benchmark harness: runner.py - Main comparison orchestrator

Browse files
Files changed (1) hide show
  1. benchmark/runner.py +362 -0
benchmark/runner.py ADDED
@@ -0,0 +1,362 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Benchmark Runner: Orchestrates base vs Cortex-enhanced model comparison.
3
+
4
+ Usage:
5
+ runner = BenchmarkRunner(model_name="HuggingFaceTB/SmolLM2-135M")
6
+ results = runner.run_comparison(tasks=["hellaswag", "piqa"], n=50)
7
+ runner.print_summary(results)
8
+ """
9
+
10
+ import sys
11
+ import os
12
+ import time
13
+ import json
14
+ import torch
15
+ from typing import Dict, List, Optional, Any
16
+ from transformers import AutoModelForCausalLM, AutoTokenizer
17
+
18
+ # Add parent dir so cortex can be imported
19
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
20
+
21
+ from benchmark.scoring import log_likelihood_score, accuracy_from_loglikelihoods
22
+ from benchmark.tasks import TASK_REGISTRY, BenchmarkTask
23
+ from benchmark.memory_tasks import PasskeyRetrieval, MultiHopMemory
24
+
25
+
26
class BenchmarkRunner:
    """
    Runs a full comparison between base model and Cortex-enhanced model.

    Workflow:
    1. Load base model, run all tasks → base results
    2. Inject Cortex modules via CortexSurgeon → enhanced model
    3. Run all tasks again → cortex results
    4. Restore the base model (even if step 3 fails), compare and report

    The same ``self.model`` object is evaluated in both phases; the Cortex
    modules are attached to it in place and detached again afterwards.
    """

    def __init__(
        self,
        model_name: str = "HuggingFaceTB/SmolLM2-135M",
        device: str = "auto",
        dtype: str = "float32",
    ):
        """
        Load the tokenizer and model and put the model in eval mode.

        Args:
            model_name: Identifier passed to ``from_pretrained``.
            device: Target device; ``"auto"`` picks CUDA when available,
                otherwise CPU.
            dtype: Name of a ``torch`` dtype attribute (e.g. ``"float32"``).

        Raises:
            AttributeError: If ``torch`` has no attribute named ``dtype``.
            ValueError: If the attribute exists but is not a ``torch.dtype``.
        """
        self.model_name = model_name

        if device == "auto":
            self.device = "cuda" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device

        resolved_dtype = getattr(torch, dtype)
        # getattr happily returns modules/functions too (e.g. "nn"); reject
        # those here instead of failing obscurely inside from_pretrained.
        if not isinstance(resolved_dtype, torch.dtype):
            raise ValueError(
                f"{dtype!r} is not the name of a torch dtype (e.g. 'float32')"
            )
        self.dtype = resolved_dtype

        print(f"Loading model: {model_name} on {self.device} ({dtype})")
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            dtype=self.dtype,
            device_map=self.device,
        )

        # Fall back to EOS as the pad token when the tokenizer defines none.
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

        self.model.eval()
        print(f"Model loaded: {self.model.config.hidden_size}d, {self.model.config.num_hidden_layers}L")

    def _run_loglikelihood_task(
        self,
        task: "BenchmarkTask",
        model,
        n: Optional[int] = None,
    ) -> Dict:
        """
        Run a log-likelihood scoring task.

        Loads up to ``n`` examples from ``task``, scores each example's
        continuations with ``log_likelihood_score`` and returns whatever
        ``accuracy_from_loglikelihoods`` produces for the collected
        ``(scores, gold_idx)`` pairs.
        """
        print(f" Loading examples for {task.name}...")
        examples = task.load_examples(n=n)

        total = len(examples)
        print(f" Scoring {total} examples...")
        scores_and_golds = []

        for position, ex in enumerate(examples, start=1):
            # Progress marker every 10 examples.
            if position % 10 == 0:
                print(f" [{position}/{total}]")

            scores = log_likelihood_score(
                model, self.tokenizer,
                ex["context"], ex["continuations"],
                device=self.device,
            )
            scores_and_golds.append((scores, ex["gold_idx"]))

        return accuracy_from_loglikelihoods(scores_and_golds)

    def _run_memory_tasks(
        self,
        model,
        n_passkey: int = 5,
        passkey_lengths: Optional[List[int]] = None,
        n_multihop: Optional[int] = None,
    ) -> Dict:
        """
        Run memory-specific benchmarks.

        Returns a dict with keys ``"passkey_retrieval"`` and
        ``"multi_hop_memory"`` holding the respective ``run`` results.
        ``passkey_lengths`` defaults to ``[128, 256, 512]`` when falsy.
        """
        results = {}

        # Passkey retrieval
        print(" Running passkey retrieval...")
        passkey = PasskeyRetrieval(context_lengths=passkey_lengths or [128, 256, 512])
        results["passkey_retrieval"] = passkey.run(
            model, self.tokenizer,
            n_per_length=n_passkey, device=self.device,
        )

        # Multi-hop memory
        print(" Running multi-hop memory...")
        multihop = MultiHopMemory()
        results["multi_hop_memory"] = multihop.run(
            model, self.tokenizer,
            n=n_multihop, device=self.device,
        )

        return results

    def inject_cortex(self) -> Dict:
        """
        Inject all Cortex modules into the model.

        Any modules left over from a previous injection are removed first, so
        calling this twice does not stack a second set on the same model.

        Returns dict with module info: ``total_cortex_params``,
        ``total_model_params``, ``overhead_pct`` and ``per_module``.
        """
        from cortex import (
            CortexSurgeon, MemoryBank, HallucinationGate,
            PauseAndThink, BacktrackHead, SteeringVector, AdaptiveDepth,
        )

        # Avoid double injection / an orphaned surgeon handle.
        self.remove_cortex()

        surgeon = CortexSurgeon(self.model)
        hidden_dim = surgeon.hidden_dim
        num_layers = surgeon.num_layers

        # Largest head count <= 8 that divides hidden_dim (1 always does).
        num_heads = next(h for h in range(8, 0, -1) if hidden_dim % h == 0)

        # Middle third and final third of the layer stack.
        middle_layers = list(range(num_layers // 3, 2 * num_layers // 3))
        deep_layers = list(range(2 * num_layers // 3, num_layers))

        surgeon.add_module("memory", MemoryBank(
            hidden_dim=hidden_dim, num_slots=32, num_heads=num_heads,
            target_layers=middle_layers,
        ))
        surgeon.add_module("halluc_gate", HallucinationGate(
            hidden_dim=hidden_dim, bottleneck_dim=32,
            target_layers=deep_layers,
        ))
        surgeon.add_module("pause_think", PauseAndThink(
            hidden_dim=hidden_dim, num_think_tokens=4,
            target_layers=middle_layers,
        ))
        surgeon.add_module("backtrack", BacktrackHead(
            hidden_dim=hidden_dim, confidence_bottleneck=32,
            num_layers=num_layers, target_layers="all",
        ))
        surgeon.add_module("steering", SteeringVector(
            hidden_dim=hidden_dim, num_directions=2,
            direction_names=["truthfulness", "helpfulness"],
            target_layers=middle_layers,
        ))
        surgeon.add_module("adaptive_depth", AdaptiveDepth(
            hidden_dim=hidden_dim, target_layers="all",
        ))

        surgeon.operate(freeze_base=True)
        # Keep the handle as soon as the model is modified so remove_cortex()
        # can still undo the injection if the reporting below fails.
        self._surgeon = surgeon

        report = surgeon.get_parameter_report()
        total_cortex = sum(info["trainable"] for info in report.values())
        # NOTE(review): counted after injection; if the surgeon registers its
        # modules as submodules of the model this total includes them, so the
        # percentage is relative to base+cortex - confirm the intended base.
        total_model = sum(p.numel() for p in self.model.parameters())

        return {
            "total_cortex_params": total_cortex,
            "total_model_params": total_model,
            "overhead_pct": total_cortex / total_model * 100 if total_model else 0.0,
            "per_module": report,
        }

    def remove_cortex(self):
        """Remove Cortex modules and restore base model (no-op if none injected)."""
        if hasattr(self, "_surgeon"):
            self._surgeon.remove_all()
            del self._surgeon

    def _evaluate_phase(
        self,
        label: str,
        tasks: List[str],
        n: int,
        include_memory: bool,
        n_passkey: int,
        passkey_lengths: Optional[List[int]],
    ) -> Dict:
        """
        Evaluate ``self.model`` in its current state on all requested tasks.

        ``label`` ("BASE" / "CORTEX") only affects the progress output.
        Returns a dict mapping each task name to its result (with an added
        ``time_seconds``), plus a ``"memory"`` entry when ``include_memory``.
        """
        phase_results = {}

        for task_name in tasks:
            print(f"\n[{label}] Running {task_name}...")
            started = time.perf_counter()

            # Registry entries may be classes/factories or ready instances.
            task_cls = TASK_REGISTRY[task_name]
            task = task_cls() if callable(task_cls) else task_cls

            result = self._run_loglikelihood_task(task, self.model, n=n)
            result["time_seconds"] = time.perf_counter() - started
            phase_results[task_name] = result

            print(f" {task_name}: {result['accuracy']:.4f} ({result['correct']}/{result['total']}) "
                  f"[{result['time_seconds']:.1f}s]")

        if include_memory:
            print(f"\n[{label}] Running memory benchmarks...")
            started = time.perf_counter()
            mem_results = self._run_memory_tasks(
                self.model, n_passkey=n_passkey,
                passkey_lengths=passkey_lengths,
            )
            mem_results["time_seconds"] = time.perf_counter() - started
            phase_results["memory"] = mem_results

            pk = mem_results["passkey_retrieval"]["overall"]
            mh = mem_results["multi_hop_memory"]
            print(f" passkey: {pk['accuracy']:.4f} ({pk['correct']}/{pk['total']})")
            print(f" multi_hop: {mh['accuracy']:.4f} ({mh['correct']}/{mh['total']})")

        return phase_results

    def run_comparison(
        self,
        tasks: Optional[List[str]] = None,
        n: int = 50,
        include_memory: bool = True,
        n_passkey: int = 5,
        passkey_lengths: Optional[List[int]] = None,
    ) -> Dict:
        """
        Run full comparison: base model vs Cortex-enhanced.

        Args:
            tasks: List of task names from TASK_REGISTRY. None = the default
                set (hellaswag, piqa, arc-easy, winogrande).
            n: Number of examples per task.
            include_memory: Whether to run memory benchmarks.
            n_passkey: Number of passkey examples per context length.
            passkey_lengths: Context lengths for passkey test.

        Returns:
            Dict with run metadata, ``base`` and ``cortex`` results,
            ``cortex_info`` and a per-task ``comparison``.

        Raises:
            KeyError: If a requested task name is not in TASK_REGISTRY
                (checked before any evaluation starts).
        """
        if tasks is None:
            tasks = ["hellaswag", "piqa", "arc-easy", "winogrande"]

        # Fail fast instead of discovering a typo after minutes of evaluation.
        unknown = [name for name in tasks if name not in TASK_REGISTRY]
        if unknown:
            raise KeyError(
                f"Unknown task(s) {unknown}; available: {list(TASK_REGISTRY)}"
            )

        results = {
            "model": self.model_name,
            "device": self.device,
            "dtype": str(self.dtype),
            "n_per_task": n,
            "tasks": tasks,
            "base": {},
            "cortex": {},
            "comparison": {},
        }

        # ===== BASE MODEL =====
        print("\n" + "=" * 60)
        print("PHASE 1: BASE MODEL EVALUATION")
        print("=" * 60)

        results["base"] = self._evaluate_phase(
            "BASE", tasks, n, include_memory, n_passkey, passkey_lengths,
        )

        # ===== CORTEX-ENHANCED MODEL =====
        print("\n" + "=" * 60)
        print("PHASE 2: CORTEX-ENHANCED MODEL EVALUATION")
        print("=" * 60)

        try:
            print("\nInjecting Cortex modules...")
            module_info = self.inject_cortex()
            print(f" Cortex params: {module_info['total_cortex_params']:,} "
                  f"({module_info['overhead_pct']:.2f}% overhead)")
            results["cortex_info"] = module_info

            results["cortex"] = self._evaluate_phase(
                "CORTEX", tasks, n, include_memory, n_passkey, passkey_lengths,
            )
        finally:
            # Always restore the base model: a failure mid-evaluation must not
            # leave the shared model patched for subsequent runs.
            self.remove_cortex()

        # ===== COMPARISON =====
        print("\n" + "=" * 60)
        print("COMPARISON: BASE vs CORTEX")
        print("=" * 60)

        for task_name in tasks:
            base_acc = results["base"][task_name]["accuracy"]
            cortex_acc = results["cortex"][task_name]["accuracy"]
            delta = cortex_acc - base_acc
            symbol = "↑" if delta > 0 else "↓" if delta < 0 else "="

            results["comparison"][task_name] = {
                "base": base_acc,
                "cortex": cortex_acc,
                "delta": delta,
            }

            print(f" {task_name:20s} base={base_acc:.4f} cortex={cortex_acc:.4f} "
                  f"Δ={delta:+.4f} {symbol}")

        if include_memory:
            base_mem = results["base"]["memory"]
            cortex_mem = results["cortex"]["memory"]
            base_pk = base_mem["passkey_retrieval"]["overall"]["accuracy"]
            cortex_pk = cortex_mem["passkey_retrieval"]["overall"]["accuracy"]
            base_mh = base_mem["multi_hop_memory"]["accuracy"]
            cortex_mh = cortex_mem["multi_hop_memory"]["accuracy"]

            results["comparison"]["passkey"] = {
                "base": base_pk, "cortex": cortex_pk, "delta": cortex_pk - base_pk,
            }
            results["comparison"]["multi_hop"] = {
                "base": base_mh, "cortex": cortex_mh, "delta": cortex_mh - base_mh,
            }

            print(f" {'passkey':20s} base={base_pk:.4f} cortex={cortex_pk:.4f} "
                  f"Δ={cortex_pk - base_pk:+.4f}")
            print(f" {'multi_hop':20s} base={base_mh:.4f} cortex={cortex_mh:.4f} "
                  f"Δ={cortex_mh - base_mh:+.4f}")

        return results

    @staticmethod
    def print_summary(results: Dict):
        """
        Print a formatted summary of benchmark results.

        Expects the dict returned by ``run_comparison``: reads ``model``,
        ``n_per_task``, ``device``, ``comparison`` and, if present,
        ``cortex_info``. Deltas within ±0.001 are shown without an arrow.
        """
        print("\n" + "=" * 70)
        print(f"BENCHMARK SUMMARY: {results['model']}")
        print(f"n={results['n_per_task']} per task, device={results['device']}")
        print("=" * 70)

        print(f"\n{'Task':22s} {'Base':>8s} {'Cortex':>8s} {'Delta':>8s}")
        print("-" * 50)

        for task_name, comp in results["comparison"].items():
            delta_str = f"{comp['delta']:+.4f}"
            symbol = " ↑" if comp["delta"] > 0.001 else " ↓" if comp["delta"] < -0.001 else " "
            print(f"{task_name:22s} {comp['base']:8.4f} {comp['cortex']:8.4f} {delta_str:>8s}{symbol}")

        if "cortex_info" in results:
            info = results["cortex_info"]
            print(f"\nCortex overhead: {info['total_cortex_params']:,} params "
                  f"({info['overhead_pct']:.2f}%)")

        print("=" * 70)