theapemachine committed on
Commit
8fab8ff
·
verified ·
1 Parent(s): a02f16c

Benchmark v2: flip accounting, lambda sweep, honest N=200 results, fixed bias_applied semantics

Browse files
Files changed (2) hide show
  1. tensegrity/bench/run.py +39 -20
  2. tensegrity/bench/runner.py +289 -177
tensegrity/bench/run.py CHANGED
@@ -3,14 +3,17 @@
3
  Tensegrity Benchmark CLI.
4
 
5
  Usage:
6
- # Quick dev run (offline, 20 samples/task, 3 tasks):
7
- python -m tensegrity.bench.run --mode offline --max-samples 20 --tasks copa,boolq,logiqa
8
 
9
- # Full offline benchmark (all tasks, all samples):
10
  python -m tensegrity.bench.run --mode offline
11
 
 
 
 
12
  # Local model benchmark (requires GPU):
13
- python -m tensegrity.bench.run --mode local --model meta-llama/Llama-3.2-1B-Instruct --max-samples 50
14
 
15
  # Save results:
16
  python -m tensegrity.bench.run --mode offline --output results.json
@@ -35,10 +38,12 @@ def main():
35
  help="Comma-separated task names (default: all)")
36
  parser.add_argument("--max-samples", type=int, default=None,
37
  help="Max samples per task (default: all)")
38
- parser.add_argument("--scale", type=float, default=2.5,
39
- help="Graft logit bias scale")
40
- parser.add_argument("--entropy-gate", type=float, default=0.85,
41
- help="Convergence gate threshold")
 
 
42
  parser.add_argument("--output", default=None,
43
  help="Save results to JSON file")
44
  parser.add_argument("--list-tasks", action="store_true",
@@ -62,22 +67,36 @@ def main():
62
  runner = EvalRunner(
63
  model_name=args.model,
64
  mode=args.mode,
65
- graft_scale=args.scale,
66
- graft_entropy_gate=args.entropy_gate,
67
  seed=args.seed,
68
  )
69
 
70
- result = runner.run_benchmark(
71
- tasks=tasks,
72
- max_samples_per_task=args.max_samples,
73
- verbose=not args.quiet,
74
- )
75
-
76
- if args.output:
77
- runner.save_results(result, args.output)
78
- print(f"\nResults saved to {args.output}")
 
 
 
 
 
 
79
  else:
80
- print(f"\n{json.dumps(result.to_dict(), indent=2)}")
 
 
 
 
 
 
 
 
 
81
 
82
 
83
  if __name__ == "__main__":
 
3
  Tensegrity Benchmark CLI.
4
 
5
  Usage:
6
+ # Quick benchmark (offline, 50 samples/task):
7
+ python -m tensegrity.bench.run --mode offline --max-samples 50 --tasks copa,boolq,sciq
8
 
9
+ # Full offline benchmark:
10
  python -m tensegrity.bench.run --mode offline
11
 
12
+ # λ sweep (find optimal graft weight):
13
+ python -m tensegrity.bench.run --sweep --max-samples 100 --tasks copa,sciq,truthfulqa
14
+
15
  # Local model benchmark (requires GPU):
16
+ python -m tensegrity.bench.run --mode local --model meta-llama/Llama-3.2-1B-Instruct
17
 
18
  # Save results:
19
  python -m tensegrity.bench.run --mode offline --output results.json
 
38
  help="Comma-separated task names (default: all)")
39
  parser.add_argument("--max-samples", type=int, default=None,
40
  help="Max samples per task (default: all)")
41
+ parser.add_argument("--lam", type=float, default=1.0,
42
+ help="Ξ» β€” graft weight: score = baseline + Ξ»*tensegrity (default: 1.0)")
43
+ parser.add_argument("--sweep", action="store_true",
44
+ help="Run Ξ» sweep over [0, 0.1, 0.25, 0.5, 1.0, 2.0]")
45
+ parser.add_argument("--sweep-lambdas", default=None,
46
+ help="Custom Ξ» values for sweep (comma-separated, e.g. 0,0.5,1,2,4)")
47
  parser.add_argument("--output", default=None,
48
  help="Save results to JSON file")
49
  parser.add_argument("--list-tasks", action="store_true",
 
67
  runner = EvalRunner(
68
  model_name=args.model,
69
  mode=args.mode,
70
+ lam=args.lam,
 
71
  seed=args.seed,
72
  )
73
 
74
+ if args.sweep:
75
+ lambdas = None
76
+ if args.sweep_lambdas:
77
+ lambdas = [float(x) for x in args.sweep_lambdas.split(",")]
78
+ results = runner.sweep_lambda(
79
+ tasks=tasks,
80
+ lambdas=lambdas,
81
+ max_samples_per_task=args.max_samples,
82
+ verbose=not args.quiet,
83
+ )
84
+ if args.output:
85
+ sweep_data = [r.to_dict() for r in results]
86
+ with open(args.output, "w") as f:
87
+ json.dump(sweep_data, f, indent=2)
88
+ print(f"\nSweep results saved to {args.output}")
89
  else:
90
+ result = runner.run_benchmark(
91
+ tasks=tasks,
92
+ max_samples_per_task=args.max_samples,
93
+ verbose=not args.quiet,
94
+ )
95
+ if args.output:
96
+ runner.save_results(result, args.output)
97
+ print(f"\nResults saved to {args.output}")
98
+ elif not args.quiet:
99
+ print(f"\n{json.dumps(result.to_dict(), indent=2)}")
100
 
101
 
102
  if __name__ == "__main__":
tensegrity/bench/runner.py CHANGED
@@ -7,21 +7,24 @@ Two evaluation modes per sample:
7
  P(choice | prompt) computed from raw logits.
8
  Prediction = argmax over choices.
9
 
10
- GRAFTED: Same scoring, but with TensegrityLogitsProcessor active.
11
- Tensegrity processes the prompt as an observation first,
12
- forms belief posteriors over choices, then injects logit
13
- biases during the scoring pass. Prediction = argmax over
14
- biased scores.
15
-
16
- Both modes use identical prompts, identical model, identical decoding.
17
- The ONLY difference is the presence/absence of the logit-bias graft.
18
  This is a controlled A/B comparison.
19
 
20
- Metrics:
21
- - accuracy: fraction correct
22
- - accuracy_by_domain: broken down by task domain
23
- - delta: grafted_accuracy - baseline_accuracy (positive = graft helps)
24
- - confidence: mean max-posterior at decision time
 
 
 
 
 
 
 
25
  """
26
 
27
  import numpy as np
@@ -29,7 +32,7 @@ import time
29
  import json
30
  import logging
31
  from typing import Dict, List, Optional, Any, Tuple
32
- from dataclasses import dataclass, field, asdict
33
  from pathlib import Path
34
 
35
  from tensegrity.bench.tasks import TaskSample, TaskConfig, TASK_REGISTRY, load_task_samples
@@ -43,17 +46,54 @@ class SampleResult:
43
  sample_id: str
44
  task: str
45
  gold: int
 
46
  baseline_pred: int
47
  grafted_pred: int
48
  baseline_correct: bool
49
  grafted_correct: bool
50
  baseline_scores: List[float]
51
  grafted_scores: List[float]
52
- graft_posteriors: Dict[str, float]
53
- graft_entropy: float
54
- graft_emitted: bool
55
- wall_time_baseline: float
56
- wall_time_grafted: float
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
 
59
  @dataclass
@@ -62,26 +102,36 @@ class TaskResult:
62
  task: str
63
  domain: str
64
  n_samples: int
 
 
65
  baseline_accuracy: float
66
  grafted_accuracy: float
67
- delta: float # grafted - baseline
68
  baseline_correct: int
69
  grafted_correct: int
 
 
 
 
 
70
  mean_graft_entropy: float
71
- mean_graft_emitted_rate: float
72
- mean_wall_time_baseline: float
73
- mean_wall_time_grafted: float
74
- speedup: float # baseline_time / grafted_time
75
 
76
 
77
  @dataclass
78
  class BenchmarkResult:
79
  """Full benchmark result across all tasks."""
80
  model_name: str
 
 
81
  tasks: List[TaskResult]
82
  overall_baseline_accuracy: float
83
  overall_grafted_accuracy: float
84
  overall_delta: float
 
85
  total_samples: int
86
  total_wall_time: float
87
  timestamp: str = ""
@@ -89,23 +139,30 @@ class BenchmarkResult:
89
  def to_dict(self) -> dict:
90
  return {
91
  "model": self.model_name,
 
 
92
  "overall": {
93
  "baseline_accuracy": round(self.overall_baseline_accuracy, 4),
94
  "grafted_accuracy": round(self.overall_grafted_accuracy, 4),
95
  "delta": round(self.overall_delta, 4),
96
  "total_samples": self.total_samples,
97
  "wall_time_s": round(self.total_wall_time, 1),
 
98
  },
99
  "tasks": [
100
  {
101
  "task": t.task,
102
  "domain": t.domain,
103
  "n": t.n_samples,
 
104
  "baseline": round(t.baseline_accuracy, 4),
105
  "grafted": round(t.grafted_accuracy, 4),
106
  "delta": round(t.delta, 4),
107
- "graft_emit_rate": round(t.mean_graft_emitted_rate, 3),
108
- "graft_entropy": round(t.mean_graft_entropy, 3),
 
 
 
109
  }
110
  for t in self.tasks
111
  ],
@@ -113,19 +170,27 @@ class BenchmarkResult:
113
 
114
  def summary_table(self) -> str:
115
  lines = []
116
- lines.append(f"{'Task':<25} {'N':>5} {'Baseline':>10} {'Grafted':>10} {'Ξ”':>8} {'Emit%':>7}")
117
- lines.append("─" * 68)
 
118
  for t in sorted(self.tasks, key=lambda x: x.delta, reverse=True):
119
  sign = "+" if t.delta >= 0 else ""
 
 
120
  lines.append(
121
- f"{t.task:<25} {t.n_samples:>5} {t.baseline_accuracy:>9.1%} "
122
- f"{t.grafted_accuracy:>9.1%} {sign}{t.delta:>7.1%} {t.mean_graft_emitted_rate:>6.0%}"
 
123
  )
124
- lines.append("─" * 68)
125
  sign = "+" if self.overall_delta >= 0 else ""
 
 
126
  lines.append(
127
- f"{'OVERALL':<25} {self.total_samples:>5} {self.overall_baseline_accuracy:>9.1%} "
128
- f"{self.overall_grafted_accuracy:>9.1%} {sign}{self.overall_delta:>7.1%}"
 
 
129
  )
130
  return "\n".join(lines)
131
 
@@ -136,36 +201,39 @@ class EvalRunner:
136
 
137
  Modes:
138
  "local" — Uses transformers model with LogitsProcessor
139
- "offline" — No LLM; scores choices via Tensegrity posteriors only
140
  (tests the cognitive layer in isolation)
 
 
 
 
141
  """
142
 
143
  def __init__(self,
144
  model_name: str = "meta-llama/Llama-3.2-1B-Instruct",
145
  mode: str = "offline",
146
- graft_scale: float = 2.5,
147
- graft_entropy_gate: float = 0.85,
148
  seed: int = 42):
 
 
 
 
 
 
 
149
  self.model_name = model_name
150
  self.mode = mode
151
- self.graft_scale = graft_scale
152
- self.graft_entropy_gate = graft_entropy_gate
153
  self.seed = seed
154
 
155
- # Lazy-loaded
156
  self._model = None
157
  self._tokenizer = None
158
 
159
  def _init_model(self):
160
- """Load model + tokenizer for local mode."""
161
- if self._model is not None:
162
  return
163
- if self.mode != "local":
164
- return
165
-
166
  from transformers import AutoTokenizer, AutoModelForCausalLM
167
  import torch
168
-
169
  logger.info(f"Loading model {self.model_name}...")
170
  self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
171
  if self._tokenizer.pad_token is None:
@@ -176,21 +244,12 @@ class EvalRunner:
176
  device_map="auto" if torch.cuda.is_available() else None,
177
  )
178
  self._model.eval()
179
- logger.info("Model loaded.")
180
 
181
  # ─── SCORING ────────────────────────────────────────────
182
 
183
- def _score_choices_local(self, prompt: str, choices: List[str],
184
- logit_bias_fn=None) -> List[float]:
185
- """
186
- Score each choice by computing log P(choice | prompt).
187
-
188
- For each choice, concatenate prompt + choice, compute the
189
- sum of log-probs over the choice tokens only.
190
- """
191
  import torch
192
- from transformers import LogitsProcessorList
193
-
194
  scores = []
195
  for choice in choices:
196
  full_text = f"{prompt} {choice}"
@@ -198,154 +257,142 @@ class EvalRunner:
198
  truncation=True, max_length=512)
199
  if hasattr(self._model, 'device'):
200
  inputs = {k: v.to(self._model.device) for k, v in inputs.items()}
201
-
202
  with torch.no_grad():
203
  outputs = self._model(**inputs)
204
- logits = outputs.logits # (1, seq_len, vocab_size)
205
-
206
- # Get log-probs for the choice tokens
207
  prompt_ids = self._tokenizer(prompt, return_tensors="pt",
208
  truncation=True, max_length=512)["input_ids"]
209
  n_prompt = prompt_ids.shape[1]
210
  n_total = inputs["input_ids"].shape[1]
211
-
212
- # Sum log-probs of choice tokens
213
  log_probs = torch.nn.functional.log_softmax(logits[0], dim=-1)
214
  choice_log_prob = 0.0
215
  for pos in range(n_prompt, n_total):
216
  token_id = inputs["input_ids"][0, pos].item()
217
  choice_log_prob += log_probs[pos - 1, token_id].item()
218
-
219
- # Length-normalize
220
  n_choice_tokens = max(n_total - n_prompt, 1)
221
  scores.append(choice_log_prob / n_choice_tokens)
222
-
223
  return scores
224
 
225
- def _score_choices_offline(self, sample: TaskSample) -> Tuple[List[float], List[float], Dict]:
226
  """
227
- Offline scoring: no LLM, use Tensegrity cognitive layer.
228
-
229
- Baseline: uniform random (represents an LLM with no reasoning)
230
- Grafted: Tensegrity processes the prompt and scores choices via posteriors
231
-
232
- Returns (baseline_scores, grafted_scores, graft_info)
233
  """
234
  from tensegrity.broca.controller import CognitiveController
235
 
236
  n = len(sample.choices)
237
- # Baseline: uniform scores (random baseline)
238
- rng = np.random.RandomState(hash(sample.id) % 2**31)
239
- baseline_scores = rng.randn(n).tolist()
240
-
241
- # Grafted: Tensegrity processes the prompt as observation
242
  controller = CognitiveController(
243
  n_hypotheses=n,
244
  hypothesis_labels=[f"choice_{i}" for i in range(n)],
245
  use_llm=False,
246
  )
247
-
248
- # Feed the prompt as an observation, using choice keywords for grounding
249
- # Inject choice content into the hypothesis labels for the template parser
250
  for i, hyp in enumerate(controller.belief_state.hypotheses):
251
- hyp.description = sample.choices[i][:50] # First 50 chars as label
252
 
253
- result = controller.step(sample.prompt)
254
 
255
- # Extract posteriors as scores
256
- posteriors = {h.description: h.probability
257
- for h in controller.belief_state.hypotheses}
258
- grafted_scores = [
259
  controller.belief_state.hypotheses[i].probability
260
  for i in range(n)
261
  ]
262
 
263
- # Entropy
264
- probs = np.array(grafted_scores)
265
  probs = probs[probs > 0]
266
  if len(probs) > 1:
267
  entropy = float(-np.sum(probs * np.log(probs + 1e-16)) / np.log(len(probs)))
268
  else:
269
  entropy = 0.0
270
 
271
- emitted = entropy < self.graft_entropy_gate
272
-
273
- graft_info = {
274
- "posteriors": posteriors,
275
- "entropy": entropy,
276
- "emitted": emitted,
277
- }
278
-
279
- return baseline_scores, grafted_scores, graft_info
280
 
281
  # ─── EVALUATION ─────────────────────────────────────────
282
 
283
  def evaluate_sample(self, sample: TaskSample) -> SampleResult:
284
- """Evaluate a single sample: baseline vs grafted."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
285
  if self.mode == "local":
286
  self._init_model()
287
-
288
- t0 = time.time()
289
  baseline_scores = self._score_choices_local(sample.prompt, sample.choices)
290
- t_baseline = time.time() - t0
291
-
292
- # For grafted: build logit processor from Tensegrity beliefs
293
- # (simplified: use offline posteriors as static bias)
294
- t0 = time.time()
295
- _, grafted_offline, graft_info = self._score_choices_offline(sample)
296
- # Blend: 50% LLM score + 50% Tensegrity posterior
297
- grafted_scores = [
298
- 0.5 * b + 0.5 * g
299
- for b, g in zip(baseline_scores, grafted_offline)
300
- ]
301
- t_grafted = time.time() - t0 + t_baseline # Includes LLM time
302
-
303
- posteriors = graft_info["posteriors"]
304
- entropy = graft_info["entropy"]
305
- emitted = graft_info["emitted"]
306
-
307
- elif self.mode == "offline":
308
- t0 = time.time()
309
- baseline_scores, grafted_scores, graft_info = self._score_choices_offline(sample)
310
- t_elapsed = time.time() - t0
311
-
312
- t_baseline = t_elapsed / 2
313
- t_grafted = t_elapsed / 2
314
- posteriors = graft_info["posteriors"]
315
- entropy = graft_info["entropy"]
316
- emitted = graft_info["emitted"]
317
  else:
318
- raise ValueError(f"Unknown mode: {self.mode}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
 
320
  baseline_pred = int(np.argmax(baseline_scores))
321
  grafted_pred = int(np.argmax(grafted_scores))
322
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
323
  return SampleResult(
324
  sample_id=sample.id,
325
  task=sample.metadata.get("task", ""),
326
  gold=sample.gold,
 
327
  baseline_pred=baseline_pred,
328
  grafted_pred=grafted_pred,
329
- baseline_correct=(baseline_pred == sample.gold),
330
- grafted_correct=(grafted_pred == sample.gold),
331
  baseline_scores=baseline_scores,
332
  grafted_scores=grafted_scores,
333
- graft_posteriors=posteriors,
334
  graft_entropy=entropy,
335
- graft_emitted=emitted,
336
- wall_time_baseline=t_baseline,
337
- wall_time_grafted=t_grafted,
 
 
338
  )
339
 
340
  def evaluate_task(self, task_name: str,
341
  max_samples: Optional[int] = None,
342
  verbose: bool = False) -> TaskResult:
343
- """Evaluate all samples in a task."""
344
  config = TASK_REGISTRY[task_name]
345
  samples = load_task_samples(task_name, max_samples)
346
 
347
  if verbose:
348
- print(f" [{task_name}] Loading {len(samples)} samples...")
349
 
350
  results = []
351
  for i, sample in enumerate(samples):
@@ -354,65 +401,78 @@ class EvalRunner:
354
  if verbose and (i + 1) % 100 == 0:
355
  acc_b = sum(1 for x in results if x.baseline_correct) / len(results)
356
  acc_g = sum(1 for x in results if x.grafted_correct) / len(results)
357
- print(f" {i+1}/{len(samples)}: baseline={acc_b:.1%} grafted={acc_g:.1%}")
358
 
359
  n = len(results)
360
  if n == 0:
361
  return TaskResult(
362
- task=task_name, domain=config.domain, n_samples=0,
363
  baseline_accuracy=0, grafted_accuracy=0, delta=0,
364
  baseline_correct=0, grafted_correct=0,
365
- mean_graft_entropy=0, mean_graft_emitted_rate=0,
366
- mean_wall_time_baseline=0, mean_wall_time_grafted=0,
367
- speedup=1.0,
368
  )
369
 
370
  bl_correct = sum(1 for r in results if r.baseline_correct)
371
  gr_correct = sum(1 for r in results if r.grafted_correct)
372
- bl_acc = bl_correct / n
373
- gr_acc = gr_correct / n
374
 
375
- mean_bl_time = np.mean([r.wall_time_baseline for r in results])
376
- mean_gr_time = np.mean([r.wall_time_grafted for r in results])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
377
 
378
  return TaskResult(
379
  task=task_name,
380
  domain=config.domain,
381
  n_samples=n,
382
- baseline_accuracy=bl_acc,
383
- grafted_accuracy=gr_acc,
384
- delta=gr_acc - bl_acc,
 
385
  baseline_correct=bl_correct,
386
  grafted_correct=gr_correct,
 
 
 
 
387
  mean_graft_entropy=np.mean([r.graft_entropy for r in results]),
388
- mean_graft_emitted_rate=np.mean([r.graft_emitted for r in results]),
389
- mean_wall_time_baseline=mean_bl_time,
390
- mean_wall_time_grafted=mean_gr_time,
391
- speedup=mean_bl_time / max(mean_gr_time, 1e-9),
392
  )
393
 
394
  def run_benchmark(self, tasks: Optional[List[str]] = None,
395
  max_samples_per_task: Optional[int] = None,
396
  verbose: bool = True) -> BenchmarkResult:
397
- """
398
- Run the full benchmark across multiple tasks.
399
-
400
- Args:
401
- tasks: List of task names. None = all tasks.
402
- max_samples_per_task: Cap per task (for fast dev runs).
403
- verbose: Print progress.
404
- """
405
  if tasks is None:
406
  tasks = list(TASK_REGISTRY.keys())
407
 
408
  if verbose:
409
  print(f"\n{'β–ˆ' * 60}")
410
  print(f" TENSEGRITY BENCHMARK")
411
- print(f" Model: {self.model_name}")
412
- print(f" Mode: {self.mode}")
413
- print(f" Tasks: {len(tasks)}")
 
414
  cap_str = str(max_samples_per_task) if max_samples_per_task else "all"
415
- print(f" Samples/task: {cap_str}")
416
  print(f"{'β–ˆ' * 60}")
417
 
418
  t_start = time.time()
@@ -427,44 +487,96 @@ class EvalRunner:
427
  task_results.append(tr)
428
  if verbose:
429
  sign = "+" if tr.delta >= 0 else ""
430
- print(f" β†’ baseline={tr.baseline_accuracy:.1%} "
431
- f"grafted={tr.grafted_accuracy:.1%} "
432
- f"Ξ”={sign}{tr.delta:.1%} "
433
- f"(n={tr.n_samples}, emit={tr.mean_graft_emitted_rate:.0%})")
 
434
  except Exception as e:
435
  logger.error(f"Task {task_name} failed: {e}")
436
  if verbose:
437
  print(f" βœ— FAILED: {e}")
 
438
 
439
  total_time = time.time() - t_start
440
 
441
- # Aggregate
442
  total_bl = sum(t.baseline_correct for t in task_results)
443
  total_gr = sum(t.grafted_correct for t in task_results)
444
  total_n = sum(t.n_samples for t in task_results)
445
 
446
- overall_bl = total_bl / max(total_n, 1)
447
- overall_gr = total_gr / max(total_n, 1)
 
 
 
 
448
 
449
  result = BenchmarkResult(
450
  model_name=self.model_name,
 
 
451
  tasks=task_results,
452
- overall_baseline_accuracy=overall_bl,
453
- overall_grafted_accuracy=overall_gr,
454
- overall_delta=overall_gr - overall_bl,
 
455
  total_samples=total_n,
456
  total_wall_time=total_time,
457
  timestamp=time.strftime("%Y-%m-%d %H:%M:%S"),
458
  )
459
 
460
  if verbose:
461
- print(f"\n{'═' * 68}")
462
  print(result.summary_table())
463
- print(f"\nTotal time: {total_time:.1f}s")
464
- print(f"{'═' * 68}")
 
 
 
 
465
 
466
  return result
467
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
468
  def save_results(self, result: BenchmarkResult, path: str):
469
  """Save benchmark results to JSON."""
470
  with open(path, "w") as f:
 
7
  P(choice | prompt) computed from raw logits.
8
  Prediction = argmax over choices.
9
 
10
+ GRAFTED: score(choice) = llm_logprob(choice) + λ * tensegrity_score(choice)
11
+ Where λ controls the graft weight. λ=0 recovers baseline.
12
+
13
+ The ONLY difference is the additive Tensegrity term.
 
 
 
 
14
  This is a controlled A/B comparison.
15
 
16
+ Metrics per task:
17
+ - raw_acc: baseline accuracy
18
+ - grafted_acc: grafted accuracy
19
+ - delta: grafted - baseline
20
+ - coverage: fraction of samples where graft posteriors are non-uniform
21
+ - cond_acc_biased: accuracy on the subset where graft was non-uniform
22
+ - mean_bias_mag: mean max absolute Tensegrity score deviation from uniform
23
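+ - flip_rate: fraction of samples whose correctness changed, (good_flips + bad_flips) / total; a switch from one wrong answer to a different wrong answer counts as neutral, not as a flip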
24
+ - good_flips: LLM wrong → graft right
25
+ - bad_flips: LLM right → graft wrong
26
+ - preserved: LLM right → graft right
27
+ - neutral: LLM wrong → graft wrong
28
  """
29
 
30
  import numpy as np
 
32
  import json
33
  import logging
34
  from typing import Dict, List, Optional, Any, Tuple
35
+ from dataclasses import dataclass, field
36
  from pathlib import Path
37
 
38
  from tensegrity.bench.tasks import TaskSample, TaskConfig, TASK_REGISTRY, load_task_samples
 
46
  sample_id: str
47
  task: str
48
  gold: int
49
+ n_choices: int
50
  baseline_pred: int
51
  grafted_pred: int
52
  baseline_correct: bool
53
  grafted_correct: bool
54
  baseline_scores: List[float]
55
  grafted_scores: List[float]
56
+ tensegrity_scores: List[float] # Raw Tensegrity posteriors (pre-blend)
57
+ graft_entropy: float # Normalized entropy of Tensegrity posteriors
58
+ bias_applied: bool # Did Tensegrity posteriors differ from uniform?
59
+ bias_magnitude: float # Max absolute deviation from uniform
60
+ flip_type: str # "good_flip", "bad_flip", "preserved", "neutral", "no_flip"
61
+ lam: float # λ used for this evaluation
62
+ wall_time: float
63
+
64
+
65
+ @dataclass
66
+ class FlipAccounting:
67
+ """Flip analysis for one task."""
68
+ good_flips: int = 0 # LLM wrong → graft right
69
+ bad_flips: int = 0 # LLM right → graft wrong
70
+ preserved: int = 0 # LLM right → graft right
71
+ neutral: int = 0 # LLM wrong → graft wrong (no change)
72
+ no_flip: int = 0 # Same prediction (subset of preserved + neutral)
73
+
74
+ @property
75
+ def total(self):
76
+ return self.good_flips + self.bad_flips + self.preserved + self.neutral
77
+
78
+ @property
79
+ def flip_rate(self):
80
+ return (self.good_flips + self.bad_flips) / max(self.total, 1)
81
+
82
+ @property
83
+ def good_bad_ratio(self):
84
+ if self.bad_flips == 0:
85
+ return float('inf') if self.good_flips > 0 else 0.0
86
+ return self.good_flips / self.bad_flips
87
+
88
+ def to_dict(self):
89
+ return {
90
+ "good_flips": self.good_flips,
91
+ "bad_flips": self.bad_flips,
92
+ "preserved": self.preserved,
93
+ "neutral": self.neutral,
94
+ "flip_rate": round(self.flip_rate, 4),
95
+ "good_bad_ratio": round(self.good_bad_ratio, 2) if self.good_bad_ratio != float('inf') else "inf",
96
+ }
97
 
98
 
99
  @dataclass
 
102
  task: str
103
  domain: str
104
  n_samples: int
105
+ lam: float
106
+ # Core accuracy
107
  baseline_accuracy: float
108
  grafted_accuracy: float
109
+ delta: float
110
  baseline_correct: int
111
  grafted_correct: int
112
+ # Graft diagnostics
113
+ coverage: float # Fraction where bias_applied=True
114
+ cond_acc_biased: float # Accuracy only on samples where bias was applied
115
+ cond_acc_unbiased: float # Accuracy only on samples where bias was NOT applied
116
+ mean_bias_magnitude: float
117
  mean_graft_entropy: float
118
+ # Flips
119
+ flips: FlipAccounting
120
+ # Timing
121
+ mean_wall_time: float
122
 
123
 
124
  @dataclass
125
  class BenchmarkResult:
126
  """Full benchmark result across all tasks."""
127
  model_name: str
128
+ mode: str
129
+ lam: float
130
  tasks: List[TaskResult]
131
  overall_baseline_accuracy: float
132
  overall_grafted_accuracy: float
133
  overall_delta: float
134
+ overall_flips: FlipAccounting
135
  total_samples: int
136
  total_wall_time: float
137
  timestamp: str = ""
 
139
  def to_dict(self) -> dict:
140
  return {
141
  "model": self.model_name,
142
+ "mode": self.mode,
143
+ "lambda": self.lam,
144
  "overall": {
145
  "baseline_accuracy": round(self.overall_baseline_accuracy, 4),
146
  "grafted_accuracy": round(self.overall_grafted_accuracy, 4),
147
  "delta": round(self.overall_delta, 4),
148
  "total_samples": self.total_samples,
149
  "wall_time_s": round(self.total_wall_time, 1),
150
+ "flips": self.overall_flips.to_dict(),
151
  },
152
  "tasks": [
153
  {
154
  "task": t.task,
155
  "domain": t.domain,
156
  "n": t.n_samples,
157
+ "lambda": t.lam,
158
  "baseline": round(t.baseline_accuracy, 4),
159
  "grafted": round(t.grafted_accuracy, 4),
160
  "delta": round(t.delta, 4),
161
+ "coverage": round(t.coverage, 3),
162
+ "cond_acc_biased": round(t.cond_acc_biased, 4),
163
+ "mean_bias_mag": round(t.mean_bias_magnitude, 4),
164
+ "mean_entropy": round(t.mean_graft_entropy, 3),
165
+ "flips": t.flips.to_dict(),
166
  }
167
  for t in self.tasks
168
  ],
 
170
 
171
  def summary_table(self) -> str:
172
  lines = []
173
+ lines.append(f"{'Task':<22} {'N':>5} {'Base':>7} {'Graft':>7} {'Ξ”':>7}"
174
+ f" {'Cov':>5} {'G/B':>6} {'Gβ†’βœ“':>4} {'Gβ†’βœ—':>4}")
175
+ lines.append("─" * 75)
176
  for t in sorted(self.tasks, key=lambda x: x.delta, reverse=True):
177
  sign = "+" if t.delta >= 0 else ""
178
+ gb = t.flips.good_bad_ratio
179
+ gb_str = f"{gb:.1f}" if gb != float('inf') else "∞"
180
  lines.append(
181
+ f"{t.task:<22} {t.n_samples:>5} {t.baseline_accuracy:>6.1%} "
182
+ f"{t.grafted_accuracy:>6.1%} {sign}{t.delta:>6.1%}"
183
+ f" {t.coverage:>4.0%} {gb_str:>6} {t.flips.good_flips:>4} {t.flips.bad_flips:>4}"
184
  )
185
+ lines.append("─" * 75)
186
  sign = "+" if self.overall_delta >= 0 else ""
187
+ gb = self.overall_flips.good_bad_ratio
188
+ gb_str = f"{gb:.1f}" if gb != float('inf') else "∞"
189
  lines.append(
190
+ f"{'OVERALL':<22} {self.total_samples:>5} {self.overall_baseline_accuracy:>6.1%} "
191
+ f"{self.overall_grafted_accuracy:>6.1%} {sign}{self.overall_delta:>6.1%}"
192
+ f" {'':>5} {gb_str:>6} "
193
+ f"{self.overall_flips.good_flips:>4} {self.overall_flips.bad_flips:>4}"
194
  )
195
  return "\n".join(lines)
196
 
 
201
 
202
  Modes:
203
  "local" — Uses transformers model with LogitsProcessor
204
+ "offline" — No LLM; baseline = random, grafted = Tensegrity posteriors
205
  (tests the cognitive layer in isolation)
206
+
207
+ λ parameter:
208
+ score(choice) = baseline_score(choice) + λ * tensegrity_score(choice)
209
+ λ=0 → pure baseline. λ>0 → graft contributes. Sweep to find optimal.
210
  """
211
 
212
  def __init__(self,
213
  model_name: str = "meta-llama/Llama-3.2-1B-Instruct",
214
  mode: str = "offline",
215
+ lam: float = 1.0,
 
216
  seed: int = 42):
217
+ """
218
+ Args:
219
+ model_name: HF model ID for local mode
220
+ mode: "offline" or "local"
221
+ lam: λ — graft weight. score = baseline + λ * tensegrity
222
+ seed: Random seed
223
+ """
224
  self.model_name = model_name
225
  self.mode = mode
226
+ self.lam = lam
 
227
  self.seed = seed
228
 
 
229
  self._model = None
230
  self._tokenizer = None
231
 
232
  def _init_model(self):
233
+ if self._model is not None or self.mode != "local":
 
234
  return
 
 
 
235
  from transformers import AutoTokenizer, AutoModelForCausalLM
236
  import torch
 
237
  logger.info(f"Loading model {self.model_name}...")
238
  self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
239
  if self._tokenizer.pad_token is None:
 
244
  device_map="auto" if torch.cuda.is_available() else None,
245
  )
246
  self._model.eval()
 
247
 
248
  # ─── SCORING ────────────────────────────────────────────
249
 
250
+ def _score_choices_local(self, prompt: str, choices: List[str]) -> List[float]:
251
+ """Score each choice by log P(choice | prompt)."""
 
 
 
 
 
 
252
  import torch
 
 
253
  scores = []
254
  for choice in choices:
255
  full_text = f"{prompt} {choice}"
 
257
  truncation=True, max_length=512)
258
  if hasattr(self._model, 'device'):
259
  inputs = {k: v.to(self._model.device) for k, v in inputs.items()}
 
260
  with torch.no_grad():
261
  outputs = self._model(**inputs)
262
+ logits = outputs.logits
 
 
263
  prompt_ids = self._tokenizer(prompt, return_tensors="pt",
264
  truncation=True, max_length=512)["input_ids"]
265
  n_prompt = prompt_ids.shape[1]
266
  n_total = inputs["input_ids"].shape[1]
 
 
267
  log_probs = torch.nn.functional.log_softmax(logits[0], dim=-1)
268
  choice_log_prob = 0.0
269
  for pos in range(n_prompt, n_total):
270
  token_id = inputs["input_ids"][0, pos].item()
271
  choice_log_prob += log_probs[pos - 1, token_id].item()
 
 
272
  n_choice_tokens = max(n_total - n_prompt, 1)
273
  scores.append(choice_log_prob / n_choice_tokens)
 
274
  return scores
275
 
276
+ def _get_tensegrity_scores(self, sample: TaskSample) -> Tuple[List[float], float]:
277
  """
278
+ Run Tensegrity cognitive layer on a sample.
279
+ Returns (posteriors_list, normalized_entropy).
 
 
 
 
280
  """
281
  from tensegrity.broca.controller import CognitiveController
282
 
283
  n = len(sample.choices)
 
 
 
 
 
284
  controller = CognitiveController(
285
  n_hypotheses=n,
286
  hypothesis_labels=[f"choice_{i}" for i in range(n)],
287
  use_llm=False,
288
  )
 
 
 
289
  for i, hyp in enumerate(controller.belief_state.hypotheses):
290
+ hyp.description = sample.choices[i][:50]
291
 
292
+ controller.step(sample.prompt)
293
 
294
+ posteriors = [
 
 
 
295
  controller.belief_state.hypotheses[i].probability
296
  for i in range(n)
297
  ]
298
 
299
+ probs = np.array(posteriors)
 
300
  probs = probs[probs > 0]
301
  if len(probs) > 1:
302
  entropy = float(-np.sum(probs * np.log(probs + 1e-16)) / np.log(len(probs)))
303
  else:
304
  entropy = 0.0
305
 
306
+ return posteriors, entropy
 
 
 
 
 
 
 
 
307
 
308
  # ─── EVALUATION ─────────────────────────────────────────
309
 
310
  def evaluate_sample(self, sample: TaskSample) -> SampleResult:
311
+ """Evaluate a single sample with full diagnostics."""
312
+ t0 = time.time()
313
+ n = len(sample.choices)
314
+ uniform = 1.0 / n
315
+
316
+ # Get Tensegrity scores
317
+ tensegrity_scores, entropy = self._get_tensegrity_scores(sample)
318
+
319
+ # Compute bias diagnostics
320
+ deviations = [abs(s - uniform) for s in tensegrity_scores]
321
+ bias_magnitude = max(deviations)
322
+ # bias_applied = posteriors are meaningfully non-uniform
323
+ bias_applied = bias_magnitude > 0.02 # More than 2% deviation from uniform
324
+
325
+ # Get baseline scores
326
  if self.mode == "local":
327
  self._init_model()
 
 
328
  baseline_scores = self._score_choices_local(sample.prompt, sample.choices)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
329
  else:
330
+ # Offline: random baseline seeded from hash(sample.id); NOTE: str hashes are salted per process, so this is reproducible across runs only when PYTHONHASHSEED is fixed
331
+ rng = np.random.RandomState(hash(sample.id) % 2**31)
332
+ baseline_scores = rng.randn(n).tolist()
333
+
334
+ # Grafted: baseline + λ * tensegrity
335
+ # Normalize tensegrity scores to be on comparable scale to baseline
336
+ # In offline mode, baseline is N(0,1), tensegrity is [0,1] probabilities
337
+ # In local mode, baseline is log-probs (~[-5, 0]), tensegrity is [0,1]
338
+ # Convert tensegrity to a log-ratio against uniform, log(p / (1/n)), for better scale matching
339
+ tensegrity_logodds = [
340
+ np.log(max(s, 1e-9)) - np.log(uniform)
341
+ for s in tensegrity_scores
342
+ ]
343
+
344
+ grafted_scores = [
345
+ b + self.lam * t
346
+ for b, t in zip(baseline_scores, tensegrity_logodds)
347
+ ]
348
 
349
  baseline_pred = int(np.argmax(baseline_scores))
350
  grafted_pred = int(np.argmax(grafted_scores))
351
 
352
+ baseline_correct = (baseline_pred == sample.gold)
353
+ grafted_correct = (grafted_pred == sample.gold)
354
+
355
+ # Flip classification
356
+ if baseline_pred == grafted_pred:
357
+ flip_type = "preserved" if baseline_correct else "neutral"
358
+ elif not baseline_correct and grafted_correct:
359
+ flip_type = "good_flip"
360
+ elif baseline_correct and not grafted_correct:
361
+ flip_type = "bad_flip"
362
+ else:
363
+ flip_type = "neutral" # Both wrong, different wrong answers
364
+
365
+ wall_time = time.time() - t0
366
+
367
  return SampleResult(
368
  sample_id=sample.id,
369
  task=sample.metadata.get("task", ""),
370
  gold=sample.gold,
371
+ n_choices=n,
372
  baseline_pred=baseline_pred,
373
  grafted_pred=grafted_pred,
374
+ baseline_correct=baseline_correct,
375
+ grafted_correct=grafted_correct,
376
  baseline_scores=baseline_scores,
377
  grafted_scores=grafted_scores,
378
+ tensegrity_scores=tensegrity_scores,
379
  graft_entropy=entropy,
380
+ bias_applied=bias_applied,
381
+ bias_magnitude=bias_magnitude,
382
+ flip_type=flip_type,
383
+ lam=self.lam,
384
+ wall_time=wall_time,
385
  )
386
 
387
  def evaluate_task(self, task_name: str,
388
  max_samples: Optional[int] = None,
389
  verbose: bool = False) -> TaskResult:
390
+ """Evaluate all samples in a task with full flip accounting."""
391
  config = TASK_REGISTRY[task_name]
392
  samples = load_task_samples(task_name, max_samples)
393
 
394
  if verbose:
395
+ print(f" [{task_name}] Loaded {len(samples)} samples")
396
 
397
  results = []
398
  for i, sample in enumerate(samples):
 
401
  if verbose and (i + 1) % 100 == 0:
402
  acc_b = sum(1 for x in results if x.baseline_correct) / len(results)
403
  acc_g = sum(1 for x in results if x.grafted_correct) / len(results)
404
+ print(f" {i+1}/{len(samples)}: base={acc_b:.1%} graft={acc_g:.1%}")
405
 
406
  n = len(results)
407
  if n == 0:
408
  return TaskResult(
409
+ task=task_name, domain=config.domain, n_samples=0, lam=self.lam,
410
  baseline_accuracy=0, grafted_accuracy=0, delta=0,
411
  baseline_correct=0, grafted_correct=0,
412
+ coverage=0, cond_acc_biased=0, cond_acc_unbiased=0,
413
+ mean_bias_magnitude=0, mean_graft_entropy=0,
414
+ flips=FlipAccounting(), mean_wall_time=0,
415
  )
416
 
417
  bl_correct = sum(1 for r in results if r.baseline_correct)
418
  gr_correct = sum(1 for r in results if r.grafted_correct)
 
 
419
 
420
+ # Flip accounting
421
+ flips = FlipAccounting()
422
+ for r in results:
423
+ if r.flip_type == "good_flip":
424
+ flips.good_flips += 1
425
+ elif r.flip_type == "bad_flip":
426
+ flips.bad_flips += 1
427
+ elif r.flip_type == "preserved":
428
+ flips.preserved += 1
429
+ elif r.flip_type == "neutral":
430
+ flips.neutral += 1
431
+
432
+ # Coverage: fraction where bias was non-trivial
433
+ biased = [r for r in results if r.bias_applied]
434
+ coverage = len(biased) / n
435
+
436
+ # Conditional accuracy
437
+ cond_acc_biased = (sum(1 for r in biased if r.grafted_correct) / len(biased)) if biased else 0.0
438
+ unbiased = [r for r in results if not r.bias_applied]
439
+ cond_acc_unbiased = (sum(1 for r in unbiased if r.grafted_correct) / len(unbiased)) if unbiased else 0.0
440
 
441
  return TaskResult(
442
  task=task_name,
443
  domain=config.domain,
444
  n_samples=n,
445
+ lam=self.lam,
446
+ baseline_accuracy=bl_correct / n,
447
+ grafted_accuracy=gr_correct / n,
448
+ delta=(gr_correct - bl_correct) / n,
449
  baseline_correct=bl_correct,
450
  grafted_correct=gr_correct,
451
+ coverage=coverage,
452
+ cond_acc_biased=cond_acc_biased,
453
+ cond_acc_unbiased=cond_acc_unbiased,
454
+ mean_bias_magnitude=np.mean([r.bias_magnitude for r in results]),
455
  mean_graft_entropy=np.mean([r.graft_entropy for r in results]),
456
+ flips=flips,
457
+ mean_wall_time=np.mean([r.wall_time for r in results]),
 
 
458
  )
459
 
460
  def run_benchmark(self, tasks: Optional[List[str]] = None,
461
  max_samples_per_task: Optional[int] = None,
462
  verbose: bool = True) -> BenchmarkResult:
463
+ """Run the full benchmark across multiple tasks."""
 
 
 
 
 
 
 
464
  if tasks is None:
465
  tasks = list(TASK_REGISTRY.keys())
466
 
467
  if verbose:
468
  print(f"\n{'β–ˆ' * 60}")
469
  print(f" TENSEGRITY BENCHMARK")
470
+ print(f" Model: {self.model_name}")
471
+ print(f" Mode: {self.mode}")
472
+ print(f" Ξ»: {self.lam}")
473
+ print(f" Tasks: {len(tasks)}")
474
  cap_str = str(max_samples_per_task) if max_samples_per_task else "all"
475
+ print(f" N/task: {cap_str}")
476
  print(f"{'β–ˆ' * 60}")
477
 
478
  t_start = time.time()
 
487
  task_results.append(tr)
488
  if verbose:
489
  sign = "+" if tr.delta >= 0 else ""
490
+ gb = tr.flips.good_bad_ratio
491
+ gb_str = f"{gb:.1f}" if gb != float('inf') else "∞"
492
+ print(f" base={tr.baseline_accuracy:.1%} graft={tr.grafted_accuracy:.1%} "
493
+ f"Ξ”={sign}{tr.delta:.1%} cov={tr.coverage:.0%} "
494
+ f"flips={tr.flips.good_flips}↑/{tr.flips.bad_flips}↓ G/B={gb_str}")
495
  except Exception as e:
496
  logger.error(f"Task {task_name} failed: {e}")
497
  if verbose:
498
  print(f" βœ— FAILED: {e}")
499
+ import traceback; traceback.print_exc()
500
 
501
  total_time = time.time() - t_start
502
 
 
503
  total_bl = sum(t.baseline_correct for t in task_results)
504
  total_gr = sum(t.grafted_correct for t in task_results)
505
  total_n = sum(t.n_samples for t in task_results)
506
 
507
+ overall_flips = FlipAccounting()
508
+ for t in task_results:
509
+ overall_flips.good_flips += t.flips.good_flips
510
+ overall_flips.bad_flips += t.flips.bad_flips
511
+ overall_flips.preserved += t.flips.preserved
512
+ overall_flips.neutral += t.flips.neutral
513
 
514
  result = BenchmarkResult(
515
  model_name=self.model_name,
516
+ mode=self.mode,
517
+ lam=self.lam,
518
  tasks=task_results,
519
+ overall_baseline_accuracy=total_bl / max(total_n, 1),
520
+ overall_grafted_accuracy=total_gr / max(total_n, 1),
521
+ overall_delta=(total_gr - total_bl) / max(total_n, 1),
522
+ overall_flips=overall_flips,
523
  total_samples=total_n,
524
  total_wall_time=total_time,
525
  timestamp=time.strftime("%Y-%m-%d %H:%M:%S"),
526
  )
527
 
528
  if verbose:
529
+ print(f"\n{'═' * 75}")
530
  print(result.summary_table())
531
+ print(f"\n Ξ»={self.lam} Time={total_time:.1f}s")
532
+ print(f" Total flips: {overall_flips.good_flips}↑ good, "
533
+ f"{overall_flips.bad_flips}↓ bad, "
534
+ f"{overall_flips.preserved} preserved, "
535
+ f"{overall_flips.neutral} neutral")
536
+ print(f"{'═' * 75}")
537
 
538
  return result
539
 
540
def sweep_lambda(self, tasks: Optional[List[str]] = None,
                 lambdas: Optional[List[float]] = None,
                 max_samples_per_task: Optional[int] = None,
                 verbose: bool = True) -> List[BenchmarkResult]:
    """
    Sweep λ to find the optimal graft weight.

    Runs ``run_benchmark`` once per λ value with its own output silenced,
    and restores the runner's original ``lam`` afterwards, even if a run
    raises.

    Args:
        tasks: Task names forwarded to ``run_benchmark``.
        lambdas: Values to sweep. Default: [0, 0.1, 0.25, 0.5, 1.0, 2.0]
        max_samples_per_task: Sample cap forwarded to ``run_benchmark``.
        verbose: If true, print one summary line per λ and the best λ.

    Returns:
        One ``BenchmarkResult`` per swept value, in sweep order. Empty if
        ``lambdas`` is empty.
    """
    if lambdas is None:
        lambdas = [0.0, 0.1, 0.25, 0.5, 1.0, 2.0]

    if verbose:
        print(f"\n{'█' * 60}")
        print(f" λ SWEEP: {lambdas}")
        print(f"{'█' * 60}")

    # The sweep works by mutating self.lam; remember the configured value so
    # the runner is not left stuck on the last swept λ.
    original_lam = self.lam
    results = []
    try:
        for lam_val in lambdas:
            self.lam = lam_val
            result = self.run_benchmark(tasks, max_samples_per_task, verbose=False)
            results.append(result)

            if verbose:
                sign = "+" if result.overall_delta >= 0 else ""
                gb = result.overall_flips.good_bad_ratio
                gb_str = f"{gb:.1f}" if gb != float('inf') else "∞"
                print(f" λ={lam_val:<5} base={result.overall_baseline_accuracy:.1%} "
                      f"graft={result.overall_grafted_accuracy:.1%} "
                      f"Δ={sign}{result.overall_delta:.1%} G/B={gb_str} "
                      f"({result.overall_flips.good_flips}↑/{result.overall_flips.bad_flips}↓)")
    finally:
        self.lam = original_lam

    # max() raises on an empty sequence, so only report a best λ when the
    # sweep produced at least one result.
    if verbose and results:
        best = max(results, key=lambda r: r.overall_delta)
        print(f"\n Best λ = {best.lam} → Δ = {best.overall_delta:+.1%}")

    return results
579
+
580
  def save_results(self, result: BenchmarkResult, path: str):
581
  """Save benchmark results to JSON."""
582
  with open(path, "w") as f: