Add benchmark harness: runner.py - Main comparison orchestrator
Browse files- benchmark/runner.py +362 -0
benchmark/runner.py
ADDED
|
@@ -0,0 +1,362 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Benchmark Runner: Orchestrates base vs Cortex-enhanced model comparison.
|
| 3 |
+
|
| 4 |
+
Usage:
|
| 5 |
+
runner = BenchmarkRunner(model_name="HuggingFaceTB/SmolLM2-135M")
|
| 6 |
+
results = runner.run_comparison(tasks=["hellaswag", "piqa"], n=50)
|
| 7 |
+
runner.print_summary(results)
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import sys
|
| 11 |
+
import os
|
| 12 |
+
import time
|
| 13 |
+
import json
|
| 14 |
+
import torch
|
| 15 |
+
from typing import Dict, List, Optional, Any
|
| 16 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 17 |
+
|
| 18 |
+
# Add parent dir so cortex can be imported
|
| 19 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 20 |
+
|
| 21 |
+
from benchmark.scoring import log_likelihood_score, accuracy_from_loglikelihoods
|
| 22 |
+
from benchmark.tasks import TASK_REGISTRY, BenchmarkTask
|
| 23 |
+
from benchmark.memory_tasks import PasskeyRetrieval, MultiHopMemory
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
class BenchmarkRunner:
|
| 27 |
+
"""
|
| 28 |
+
Runs a full comparison between base model and Cortex-enhanced model.
|
| 29 |
+
|
| 30 |
+
Workflow:
|
| 31 |
+
1. Load base model, run all tasks → base results
|
| 32 |
+
2. Inject Cortex modules via CortexSurgeon → enhanced model
|
| 33 |
+
3. Run all tasks again → cortex results
|
| 34 |
+
4. Compare and report
|
| 35 |
+
"""
|
| 36 |
+
|
| 37 |
+
def __init__(
    self,
    model_name: str = "HuggingFaceTB/SmolLM2-135M",
    device: str = "auto",
    dtype: str = "float32",
):
    """Load the tokenizer and causal LM that the benchmark phases operate on.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.
        device: Device string stored on ``self.device`` and passed as
            ``device_map``. ``"auto"`` resolves to ``"cuda"`` when
            ``torch.cuda.is_available()`` is true and to ``"cpu"`` otherwise.
        dtype: Name of an attribute of the ``torch`` module (for example
            ``"float32"``); it is resolved with ``getattr(torch, dtype)``.

    Raises:
        AttributeError: If ``torch`` has no attribute named ``dtype``.
    """
    self.model_name = model_name

    # Resolve the "auto" placeholder once; any other value is used verbatim.
    chosen_device = device
    if chosen_device == "auto":
        chosen_device = "cuda" if torch.cuda.is_available() else "cpu"
    self.device = chosen_device

    self.dtype = getattr(torch, dtype)

    print(f"Loading model: {model_name} on {self.device} ({dtype})")
    self.tokenizer = AutoTokenizer.from_pretrained(model_name)
    self.model = AutoModelForCausalLM.from_pretrained(
        model_name, dtype=self.dtype, device_map=self.device
    )

    # Fall back to the EOS token when the tokenizer defines no pad token.
    tokenizer = self.tokenizer
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    self.model.eval()
    config = self.model.config
    print(f"Model loaded: {config.hidden_size}d, {config.num_hidden_layers}L")
|
| 65 |
+
|
| 66 |
+
def _run_loglikelihood_task(
    self,
    task: BenchmarkTask,
    model,
    n: Optional[int] = None,
) -> Dict:
    """Score one multiple-choice task with log-likelihoods and summarise it.

    Args:
        task: Task object providing ``name`` and ``load_examples(n=...)``.
        model: Model handed to ``log_likelihood_score`` for every example.
        n: Forwarded to ``task.load_examples``; ``None`` leaves the choice
            of example count to the task.

    Returns:
        Whatever ``accuracy_from_loglikelihoods`` returns for the collected
        ``(scores, gold_idx)`` pairs.
    """
    print(f" Loading examples for {task.name}...")
    examples = task.load_examples(n=n)

    total = len(examples)
    print(f" Scoring {total} examples...")

    scored_pairs = []
    for position, example in enumerate(examples, start=1):
        # Progress line on every tenth example, emitted before it is scored.
        if position % 10 == 0:
            print(f" [{position}/{total}]")

        option_scores = log_likelihood_score(
            model,
            self.tokenizer,
            example["context"],
            example["continuations"],
            device=self.device,
        )
        scored_pairs.append((option_scores, example["gold_idx"]))

    return accuracy_from_loglikelihoods(scored_pairs)
|
| 91 |
+
|
| 92 |
+
def _run_memory_tasks(
    self,
    model,
    n_passkey: int = 5,
    passkey_lengths: Optional[List[int]] = None,
    n_multihop: Optional[int] = None,
) -> Dict:
    """Run the passkey-retrieval and multi-hop memory benchmarks on ``model``.

    Args:
        model: Model passed to both benchmark ``run`` methods.
        n_passkey: Forwarded to ``PasskeyRetrieval.run`` as ``n_per_length``.
        passkey_lengths: Context lengths for the passkey test; a falsy value
            (``None`` or an empty list) selects ``[128, 256, 512]``.
        n_multihop: Forwarded to ``MultiHopMemory.run`` as ``n``.

    Returns:
        Dict with the keys ``"passkey_retrieval"`` and ``"multi_hop_memory"``
        holding the respective ``run`` results.
    """
    print(" Running passkey retrieval...")
    lengths = passkey_lengths if passkey_lengths else [128, 256, 512]
    passkey_benchmark = PasskeyRetrieval(context_lengths=lengths)
    passkey_report = passkey_benchmark.run(
        model,
        self.tokenizer,
        n_per_length=n_passkey,
        device=self.device,
    )

    print(" Running multi-hop memory...")
    multihop_benchmark = MultiHopMemory()
    multihop_report = multihop_benchmark.run(
        model,
        self.tokenizer,
        n=n_multihop,
        device=self.device,
    )

    return {
        "passkey_retrieval": passkey_report,
        "multi_hop_memory": multihop_report,
    }
|
| 119 |
+
|
| 120 |
+
def inject_cortex(self) -> Dict:
    """
    Attach the six Cortex modules to ``self.model`` through a CortexSurgeon.

    The surgeon is kept on ``self._surgeon`` so ``remove_cortex`` can undo
    the injection later.

    Returns:
        Dict with ``total_cortex_params`` (sum of the ``"trainable"`` entries
        of the surgeon's parameter report), ``total_model_params`` (parameter
        count of ``self.model`` measured after ``operate``), ``overhead_pct``
        (their ratio times 100) and ``per_module`` (the raw report).
    """
    from cortex import (
        CortexSurgeon, MemoryBank, HallucinationGate,
        PauseAndThink, BacktrackHead, SteeringVector, AdaptiveDepth,
    )

    surgeon = CortexSurgeon(self.model)
    width = surgeon.hidden_dim
    depth = surgeon.num_layers

    # Largest head count in 8..2 that divides the hidden size; 1 otherwise.
    heads = next((h for h in range(8, 1, -1) if width % h == 0), 1)

    # Split the layer stack in thirds: the middle third and the final third.
    lower_cut = depth // 3
    upper_cut = 2 * depth // 3
    middle_layers = list(range(lower_cut, upper_cut))
    deep_layers = list(range(upper_cut, depth))

    module_specs = (
        ("memory", MemoryBank, {
            "hidden_dim": width, "num_slots": 32, "num_heads": heads,
            "target_layers": middle_layers,
        }),
        ("halluc_gate", HallucinationGate, {
            "hidden_dim": width, "bottleneck_dim": 32,
            "target_layers": deep_layers,
        }),
        ("pause_think", PauseAndThink, {
            "hidden_dim": width, "num_think_tokens": 4,
            "target_layers": middle_layers,
        }),
        ("backtrack", BacktrackHead, {
            "hidden_dim": width, "confidence_bottleneck": 32,
            "num_layers": depth, "target_layers": "all",
        }),
        ("steering", SteeringVector, {
            "hidden_dim": width, "num_directions": 2,
            "direction_names": ["truthfulness", "helpfulness"],
            "target_layers": middle_layers,
        }),
        ("adaptive_depth", AdaptiveDepth, {
            "hidden_dim": width, "target_layers": "all",
        }),
    )
    # Each module is constructed immediately before it is registered.
    for module_name, module_cls, module_kwargs in module_specs:
        surgeon.add_module(module_name, module_cls(**module_kwargs))

    surgeon.operate(freeze_base=True)

    report = surgeon.get_parameter_report()
    total_cortex = sum(entry["trainable"] for entry in report.values())
    total_model = sum(param.numel() for param in self.model.parameters())

    self._surgeon = surgeon

    return {
        "total_cortex_params": total_cortex,
        "total_model_params": total_model,
        "overhead_pct": total_cortex / total_model * 100,
        "per_module": report,
    }
|
| 182 |
+
|
| 183 |
+
def remove_cortex(self):
    """Undo a previous ``inject_cortex`` call; do nothing if none is active.

    Calls ``remove_all()`` on the stored surgeon and then drops the
    ``_surgeon`` attribute, so a second call is a no-op.
    """
    if not hasattr(self, "_surgeon"):
        return

    self._surgeon.remove_all()
    delattr(self, "_surgeon")
|
| 188 |
+
|
| 189 |
+
def run_comparison(
    self,
    tasks: Optional[List[str]] = None,
    n: int = 50,
    include_memory: bool = True,
    n_passkey: int = 5,
    passkey_lengths: Optional[List[int]] = None,
) -> Dict:
    """
    Run full comparison: base model vs Cortex-enhanced.

    The same evaluation is run twice on ``self.model``: once as loaded and
    once after ``inject_cortex``. The Cortex modules are always removed
    again before this method returns or raises.

    Args:
        tasks: Task names to look up in TASK_REGISTRY. ``None`` selects the
            default set: hellaswag, piqa, arc-easy and winogrande.
        n: Number of examples per task.
        include_memory: Whether to run memory benchmarks.
        n_passkey: Number of passkey examples per context length.
        passkey_lengths: Context lengths for passkey test.

    Returns:
        Dict with run metadata (``model``, ``device``, ``dtype``,
        ``n_per_task``, ``tasks``), the per-phase results under ``base`` and
        ``cortex``, the ``cortex_info`` returned by ``inject_cortex``, and
        ``comparison`` mapping each task (plus ``passkey`` and ``multi_hop``
        when memory benchmarks ran) to its base/cortex accuracy and delta.

    Raises:
        KeyError: If a name in ``tasks`` is not in TASK_REGISTRY. This is
            raised before any evaluation starts.
    """
    if tasks is None:
        tasks = ["hellaswag", "piqa", "arc-easy", "winogrande"]

    # Resolve every task name up front so a typo fails immediately instead
    # of after earlier tasks have already been evaluated.
    registry_entries = {name: TASK_REGISTRY[name] for name in tasks}

    results = {
        "model": self.model_name,
        "device": self.device,
        "dtype": str(self.dtype),
        "n_per_task": n,
        "tasks": tasks,
        "base": {},
        "cortex": {},
        "comparison": {},
    }

    def evaluate(label: str, sink: Dict) -> None:
        """Run all selected benchmarks on self.model and store them in sink."""
        for task_name in tasks:
            print(f"\n[{label}] Running {task_name}...")
            # perf_counter is monotonic, so elapsed time is immune to
            # system clock adjustments during a long run.
            t0 = time.perf_counter()

            entry = registry_entries[task_name]
            # Registry values may be task classes/factories or ready instances.
            task = entry() if callable(entry) else entry

            result = self._run_loglikelihood_task(task, self.model, n=n)
            result["time_seconds"] = time.perf_counter() - t0
            sink[task_name] = result

            print(f" {task_name}: {result['accuracy']:.4f} ({result['correct']}/{result['total']}) "
                  f"[{result['time_seconds']:.1f}s]")

        if include_memory:
            print(f"\n[{label}] Running memory benchmarks...")
            t0 = time.perf_counter()
            mem_results = self._run_memory_tasks(
                self.model, n_passkey=n_passkey,
                passkey_lengths=passkey_lengths,
            )
            mem_results["time_seconds"] = time.perf_counter() - t0
            sink["memory"] = mem_results

            pk = mem_results["passkey_retrieval"]["overall"]
            mh = mem_results["multi_hop_memory"]
            print(f" passkey: {pk['accuracy']:.4f} ({pk['correct']}/{pk['total']})")
            print(f" multi_hop: {mh['accuracy']:.4f} ({mh['correct']}/{mh['total']})")

    # ===== BASE MODEL =====
    print("\n" + "=" * 60)
    print("PHASE 1: BASE MODEL EVALUATION")
    print("=" * 60)

    evaluate("BASE", results["base"])

    # ===== CORTEX-ENHANCED MODEL =====
    print("\n" + "=" * 60)
    print("PHASE 2: CORTEX-ENHANCED MODEL EVALUATION")
    print("=" * 60)

    print("\nInjecting Cortex modules...")
    module_info = self.inject_cortex()
    try:
        print(f" Cortex params: {module_info['total_cortex_params']:,} "
              f"({module_info['overhead_pct']:.2f}% overhead)")
        results["cortex_info"] = module_info

        evaluate("CORTEX", results["cortex"])

        # ===== COMPARISON =====
        print("\n" + "=" * 60)
        print("COMPARISON: BASE vs CORTEX")
        print("=" * 60)

        for task_name in tasks:
            base_acc = results["base"][task_name]["accuracy"]
            cortex_acc = results["cortex"][task_name]["accuracy"]
            delta = cortex_acc - base_acc
            symbol = "↑" if delta > 0 else "↓" if delta < 0 else "="

            results["comparison"][task_name] = {
                "base": base_acc,
                "cortex": cortex_acc,
                "delta": delta,
            }

            print(f" {task_name:20s} base={base_acc:.4f} cortex={cortex_acc:.4f} "
                  f"Δ={delta:+.4f} {symbol}")

        if include_memory:
            base_mem = results["base"]["memory"]
            cortex_mem = results["cortex"]["memory"]
            base_pk = base_mem["passkey_retrieval"]["overall"]["accuracy"]
            cortex_pk = cortex_mem["passkey_retrieval"]["overall"]["accuracy"]
            base_mh = base_mem["multi_hop_memory"]["accuracy"]
            cortex_mh = cortex_mem["multi_hop_memory"]["accuracy"]

            results["comparison"]["passkey"] = {
                "base": base_pk, "cortex": cortex_pk, "delta": cortex_pk - base_pk,
            }
            results["comparison"]["multi_hop"] = {
                "base": base_mh, "cortex": cortex_mh, "delta": cortex_mh - base_mh,
            }

            print(f" {'passkey':20s} base={base_pk:.4f} cortex={cortex_pk:.4f} "
                  f"Δ={cortex_pk - base_pk:+.4f}")
            print(f" {'multi_hop':20s} base={base_mh:.4f} cortex={cortex_mh:.4f} "
                  f"Δ={cortex_mh - base_mh:+.4f}")
    finally:
        # Always restore the base model, even when the Cortex phase fails;
        # otherwise a later run would measure "base" with Cortex attached.
        self.remove_cortex()

    return results
|
| 340 |
+
|
| 341 |
+
@staticmethod
def print_summary(results: Dict):
    """Print a table of base vs Cortex accuracies from a results dict.

    Args:
        results: Dict with ``model``, ``n_per_task``, ``device`` and a
            ``comparison`` mapping whose values carry ``base``, ``cortex``
            and ``delta``. An optional ``cortex_info`` entry with
            ``total_cortex_params`` and ``overhead_pct`` adds an overhead line.
    """
    heavy_rule = "=" * 70

    print("\n" + heavy_rule)
    print(f"BENCHMARK SUMMARY: {results['model']}")
    print(f"n={results['n_per_task']} per task, device={results['device']}")
    print(heavy_rule)

    print(f"\n{'Task':22s} {'Base':>8s} {'Cortex':>8s} {'Delta':>8s}")
    print("-" * 50)

    for label, row in results["comparison"].items():
        change = row["delta"]
        # Changes within +/-0.001 are treated as noise and get no arrow.
        if change > 0.001:
            marker = " ↑"
        elif change < -0.001:
            marker = " ↓"
        else:
            marker = " "
        signed_change = f"{change:+.4f}"
        print(f"{label:22s} {row['base']:8.4f} {row['cortex']:8.4f} {signed_change:>8s}{marker}")

    if "cortex_info" in results:
        cortex_meta = results["cortex_info"]
        print(f"\nCortex overhead: {cortex_meta['total_cortex_params']:,} params "
              f"({cortex_meta['overhead_pct']:.2f}%)")

    print(heavy_rule)
|