theapemachine committed on
Commit
5c5ec1b
·
2 Parent(s): 86eaefa8fab8ff

Merge branch 'main' of https://huggingface.co/theapemachine/tensegrity

Browse files
Files changed (2) hide show
  1. tensegrity/bench/run.py +39 -20
  2. tensegrity/bench/runner.py +290 -177
tensegrity/bench/run.py CHANGED
@@ -3,14 +3,17 @@
3
  Tensegrity Benchmark CLI.
4
 
5
  Usage:
6
- # Quick dev run (offline, 20 samples/task, 3 tasks):
7
- python -m tensegrity.bench.run --mode offline --max-samples 20 --tasks copa,boolq,logiqa
8
 
9
- # Full offline benchmark (all tasks, all samples):
10
  python -m tensegrity.bench.run --mode offline
11
 
 
 
 
12
  # Local model benchmark (requires GPU):
13
- python -m tensegrity.bench.run --mode local --model meta-llama/Llama-3.2-1B-Instruct --max-samples 50
14
 
15
  # Save results:
16
  python -m tensegrity.bench.run --mode offline --output results.json
@@ -35,10 +38,12 @@ def main():
35
  help="Comma-separated task names (default: all)")
36
  parser.add_argument("--max-samples", type=int, default=None,
37
  help="Max samples per task (default: all)")
38
- parser.add_argument("--scale", type=float, default=2.5,
39
- help="Graft logit bias scale")
40
- parser.add_argument("--entropy-gate", type=float, default=0.85,
41
- help="Convergence gate threshold")
 
 
42
  parser.add_argument("--output", default=None,
43
  help="Save results to JSON file")
44
  parser.add_argument("--list-tasks", action="store_true",
@@ -62,22 +67,36 @@ def main():
62
  runner = EvalRunner(
63
  model_name=args.model,
64
  mode=args.mode,
65
- graft_scale=args.scale,
66
- graft_entropy_gate=args.entropy_gate,
67
  seed=args.seed,
68
  )
69
 
70
- result = runner.run_benchmark(
71
- tasks=tasks,
72
- max_samples_per_task=args.max_samples,
73
- verbose=not args.quiet,
74
- )
75
-
76
- if args.output:
77
- runner.save_results(result, args.output)
78
- print(f"\nResults saved to {args.output}")
 
 
 
 
 
 
79
  else:
80
- print(f"\n{json.dumps(result.to_dict(), indent=2)}")
 
 
 
 
 
 
 
 
 
81
 
82
 
83
  if __name__ == "__main__":
 
3
  Tensegrity Benchmark CLI.
4
 
5
  Usage:
6
+ # Quick benchmark (offline, 50 samples/task):
7
+ python -m tensegrity.bench.run --mode offline --max-samples 50 --tasks copa,boolq,sciq
8
 
9
+ # Full offline benchmark:
10
  python -m tensegrity.bench.run --mode offline
11
 
12
+ # λ sweep (find optimal graft weight):
13
+ python -m tensegrity.bench.run --sweep --max-samples 100 --tasks copa,sciq,truthfulqa
14
+
15
  # Local model benchmark (requires GPU):
16
+ python -m tensegrity.bench.run --mode local --model meta-llama/Llama-3.2-1B-Instruct
17
 
18
  # Save results:
19
  python -m tensegrity.bench.run --mode offline --output results.json
 
38
  help="Comma-separated task names (default: all)")
39
  parser.add_argument("--max-samples", type=int, default=None,
40
  help="Max samples per task (default: all)")
41
+ parser.add_argument("--lam", type=float, default=1.0,
42
+ help="Ξ» β€” graft weight: score = baseline + Ξ»*tensegrity (default: 1.0)")
43
+ parser.add_argument("--sweep", action="store_true",
44
+ help="Run Ξ» sweep over [0, 0.1, 0.25, 0.5, 1.0, 2.0]")
45
+ parser.add_argument("--sweep-lambdas", default=None,
46
+ help="Custom Ξ» values for sweep (comma-separated, e.g. 0,0.5,1,2,4)")
47
  parser.add_argument("--output", default=None,
48
  help="Save results to JSON file")
49
  parser.add_argument("--list-tasks", action="store_true",
 
67
  runner = EvalRunner(
68
  model_name=args.model,
69
  mode=args.mode,
70
+ lam=args.lam,
 
71
  seed=args.seed,
72
  )
73
 
74
+ if args.sweep:
75
+ lambdas = None
76
+ if args.sweep_lambdas:
77
+ lambdas = [float(x) for x in args.sweep_lambdas.split(",")]
78
+ results = runner.sweep_lambda(
79
+ tasks=tasks,
80
+ lambdas=lambdas,
81
+ max_samples_per_task=args.max_samples,
82
+ verbose=not args.quiet,
83
+ )
84
+ if args.output:
85
+ sweep_data = [r.to_dict() for r in results]
86
+ with open(args.output, "w") as f:
87
+ json.dump(sweep_data, f, indent=2)
88
+ print(f"\nSweep results saved to {args.output}")
89
  else:
90
+ result = runner.run_benchmark(
91
+ tasks=tasks,
92
+ max_samples_per_task=args.max_samples,
93
+ verbose=not args.quiet,
94
+ )
95
+ if args.output:
96
+ runner.save_results(result, args.output)
97
+ print(f"\nResults saved to {args.output}")
98
+ elif not args.quiet:
99
+ print(f"\n{json.dumps(result.to_dict(), indent=2)}")
100
 
101
 
102
  if __name__ == "__main__":
tensegrity/bench/runner.py CHANGED
@@ -7,21 +7,24 @@ Two evaluation modes per sample:
7
  P(choice | prompt) computed from raw logits.
8
  Prediction = argmax over choices.
9
 
10
- GRAFTED: Same scoring, but with TensegrityLogitsProcessor active.
11
- Tensegrity processes the prompt as an observation first,
12
- forms belief posteriors over choices, then injects logit
13
- biases during the scoring pass. Prediction = argmax over
14
- biased scores.
15
-
16
- Both modes use identical prompts, identical model, identical decoding.
17
- The ONLY difference is the presence/absence of the logit-bias graft.
18
  This is a controlled A/B comparison.
19
 
20
- Metrics:
21
- - accuracy: fraction correct
22
- - accuracy_by_domain: broken down by task domain
23
- - delta: grafted_accuracy - baseline_accuracy (positive = graft helps)
24
- - confidence: mean max-posterior at decision time
 
 
 
 
 
 
 
25
  """
26
 
27
  import numpy as np
@@ -29,7 +32,7 @@ import time
29
  import json
30
  import logging
31
  from typing import Dict, List, Optional, Any, Tuple
32
- from dataclasses import dataclass, field, asdict
33
  from pathlib import Path
34
 
35
  from tensegrity.bench.tasks import TaskSample, TaskConfig, TASK_REGISTRY, load_task_samples
@@ -44,17 +47,54 @@ class SampleResult:
44
  sample_id: str
45
  task: str
46
  gold: int
 
47
  baseline_pred: int
48
  grafted_pred: int
49
  baseline_correct: bool
50
  grafted_correct: bool
51
  baseline_scores: List[float]
52
  grafted_scores: List[float]
53
- graft_posteriors: Dict[str, float]
54
- graft_entropy: float
55
- graft_emitted: bool
56
- wall_time_baseline: float
57
- wall_time_grafted: float
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
 
60
  @dataclass
@@ -63,26 +103,36 @@ class TaskResult:
63
  task: str
64
  domain: str
65
  n_samples: int
 
 
66
  baseline_accuracy: float
67
  grafted_accuracy: float
68
- delta: float # grafted - baseline
69
  baseline_correct: int
70
  grafted_correct: int
 
 
 
 
 
71
  mean_graft_entropy: float
72
- mean_graft_emitted_rate: float
73
- mean_wall_time_baseline: float
74
- mean_wall_time_grafted: float
75
- speedup: float # baseline_time / grafted_time
76
 
77
 
78
  @dataclass
79
  class BenchmarkResult:
80
  """Full benchmark result across all tasks."""
81
  model_name: str
 
 
82
  tasks: List[TaskResult]
83
  overall_baseline_accuracy: float
84
  overall_grafted_accuracy: float
85
  overall_delta: float
 
86
  total_samples: int
87
  total_wall_time: float
88
  timestamp: str = ""
@@ -90,23 +140,30 @@ class BenchmarkResult:
90
  def to_dict(self) -> dict:
91
  return {
92
  "model": self.model_name,
 
 
93
  "overall": {
94
  "baseline_accuracy": round(self.overall_baseline_accuracy, 4),
95
  "grafted_accuracy": round(self.overall_grafted_accuracy, 4),
96
  "delta": round(self.overall_delta, 4),
97
  "total_samples": self.total_samples,
98
  "wall_time_s": round(self.total_wall_time, 1),
 
99
  },
100
  "tasks": [
101
  {
102
  "task": t.task,
103
  "domain": t.domain,
104
  "n": t.n_samples,
 
105
  "baseline": round(t.baseline_accuracy, 4),
106
  "grafted": round(t.grafted_accuracy, 4),
107
  "delta": round(t.delta, 4),
108
- "graft_emit_rate": round(t.mean_graft_emitted_rate, 3),
109
- "graft_entropy": round(t.mean_graft_entropy, 3),
 
 
 
110
  }
111
  for t in self.tasks
112
  ],
@@ -114,19 +171,27 @@ class BenchmarkResult:
114
 
115
  def summary_table(self) -> str:
116
  lines = []
117
- lines.append(f"{'Task':<25} {'N':>5} {'Baseline':>10} {'Grafted':>10} {'Ξ”':>8} {'Emit%':>7}")
118
- lines.append("─" * 68)
 
119
  for t in sorted(self.tasks, key=lambda x: x.delta, reverse=True):
120
  sign = "+" if t.delta >= 0 else ""
 
 
121
  lines.append(
122
- f"{t.task:<25} {t.n_samples:>5} {t.baseline_accuracy:>9.1%} "
123
- f"{t.grafted_accuracy:>9.1%} {sign}{t.delta:>7.1%} {t.mean_graft_emitted_rate:>6.0%}"
 
124
  )
125
- lines.append("─" * 68)
126
  sign = "+" if self.overall_delta >= 0 else ""
 
 
127
  lines.append(
128
- f"{'OVERALL':<25} {self.total_samples:>5} {self.overall_baseline_accuracy:>9.1%} "
129
- f"{self.overall_grafted_accuracy:>9.1%} {sign}{self.overall_delta:>7.1%}"
 
 
130
  )
131
  return "\n".join(lines)
132
 
@@ -137,35 +202,39 @@ class EvalRunner:
137
 
138
  Modes:
139
  "local" β€” Uses transformers model with LogitsProcessor
140
- "offline" β€” No LLM; scores choices via Tensegrity posteriors only
141
  (tests the cognitive layer in isolation)
 
 
 
 
142
  """
143
 
144
  def __init__(self,
145
  model_name: str = "meta-llama/Llama-3.2-1B-Instruct",
146
  mode: str = "offline",
147
- graft_scale: float = 2.5,
148
- graft_entropy_gate: float = 0.85,
149
  seed: int = 42):
 
 
 
 
 
 
 
150
  self.model_name = model_name
151
  self.mode = mode
152
- self.graft_scale = graft_scale
153
- self.graft_entropy_gate = graft_entropy_gate
154
  self.seed = seed
155
 
156
- # Lazy-loaded
157
  self._model = None
158
  self._tokenizer = None
159
 
160
  def _init_model(self):
161
- """Load model + tokenizer for local mode."""
162
- if self._model is not None:
163
  return
164
- if self.mode != "local":
165
- return
166
-
167
  from transformers import AutoTokenizer, AutoModelForCausalLM
168
-
169
  dtype, device_map, move_to = inference_load_settings()
170
  logger.info(f"Loading model {self.model_name}...")
171
  self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
@@ -179,21 +248,12 @@ class EvalRunner:
179
  if move_to is not None:
180
  self._model = self._model.to(move_to)
181
  self._model.eval()
182
- logger.info("Model loaded.")
183
 
184
  # ─── SCORING ────────────────────────────────────────────
185
 
186
- def _score_choices_local(self, prompt: str, choices: List[str],
187
- logit_bias_fn=None) -> List[float]:
188
- """
189
- Score each choice by computing log P(choice | prompt).
190
-
191
- For each choice, concatenate prompt + choice, compute the
192
- sum of log-probs over the choice tokens only.
193
- """
194
  import torch
195
- from transformers import LogitsProcessorList
196
-
197
  scores = []
198
  for choice in choices:
199
  full_text = f"{prompt} {choice}"
@@ -201,154 +261,142 @@ class EvalRunner:
201
  truncation=True, max_length=512)
202
  if hasattr(self._model, 'device'):
203
  inputs = {k: v.to(self._model.device) for k, v in inputs.items()}
204
-
205
  with torch.no_grad():
206
  outputs = self._model(**inputs)
207
- logits = outputs.logits # (1, seq_len, vocab_size)
208
-
209
- # Get log-probs for the choice tokens
210
  prompt_ids = self._tokenizer(prompt, return_tensors="pt",
211
  truncation=True, max_length=512)["input_ids"]
212
  n_prompt = prompt_ids.shape[1]
213
  n_total = inputs["input_ids"].shape[1]
214
-
215
- # Sum log-probs of choice tokens
216
  log_probs = torch.nn.functional.log_softmax(logits[0], dim=-1)
217
  choice_log_prob = 0.0
218
  for pos in range(n_prompt, n_total):
219
  token_id = inputs["input_ids"][0, pos].item()
220
  choice_log_prob += log_probs[pos - 1, token_id].item()
221
-
222
- # Length-normalize
223
  n_choice_tokens = max(n_total - n_prompt, 1)
224
  scores.append(choice_log_prob / n_choice_tokens)
225
-
226
  return scores
227
 
228
- def _score_choices_offline(self, sample: TaskSample) -> Tuple[List[float], List[float], Dict]:
229
  """
230
- Offline scoring: no LLM, use Tensegrity cognitive layer.
231
-
232
- Baseline: uniform random (represents an LLM with no reasoning)
233
- Grafted: Tensegrity processes the prompt and scores choices via posteriors
234
-
235
- Returns (baseline_scores, grafted_scores, graft_info)
236
  """
237
  from tensegrity.broca.controller import CognitiveController
238
 
239
  n = len(sample.choices)
240
- # Baseline: uniform scores (random baseline)
241
- rng = np.random.RandomState(hash(sample.id) % 2**31)
242
- baseline_scores = rng.randn(n).tolist()
243
-
244
- # Grafted: Tensegrity processes the prompt as observation
245
  controller = CognitiveController(
246
  n_hypotheses=n,
247
  hypothesis_labels=[f"choice_{i}" for i in range(n)],
248
  use_llm=False,
249
  )
250
-
251
- # Feed the prompt as an observation, using choice keywords for grounding
252
- # Inject choice content into the hypothesis labels for the template parser
253
  for i, hyp in enumerate(controller.belief_state.hypotheses):
254
- hyp.description = sample.choices[i][:50] # First 50 chars as label
255
 
256
- result = controller.step(sample.prompt)
257
 
258
- # Extract posteriors as scores
259
- posteriors = {h.description: h.probability
260
- for h in controller.belief_state.hypotheses}
261
- grafted_scores = [
262
  controller.belief_state.hypotheses[i].probability
263
  for i in range(n)
264
  ]
265
 
266
- # Entropy
267
- probs = np.array(grafted_scores)
268
  probs = probs[probs > 0]
269
  if len(probs) > 1:
270
  entropy = float(-np.sum(probs * np.log(probs + 1e-16)) / np.log(len(probs)))
271
  else:
272
  entropy = 0.0
273
 
274
- emitted = entropy < self.graft_entropy_gate
275
-
276
- graft_info = {
277
- "posteriors": posteriors,
278
- "entropy": entropy,
279
- "emitted": emitted,
280
- }
281
-
282
- return baseline_scores, grafted_scores, graft_info
283
 
284
  # ─── EVALUATION ─────────────────────────────────────────
285
 
286
  def evaluate_sample(self, sample: TaskSample) -> SampleResult:
287
- """Evaluate a single sample: baseline vs grafted."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  if self.mode == "local":
289
  self._init_model()
290
-
291
- t0 = time.time()
292
  baseline_scores = self._score_choices_local(sample.prompt, sample.choices)
293
- t_baseline = time.time() - t0
294
-
295
- # For grafted: build logit processor from Tensegrity beliefs
296
- # (simplified: use offline posteriors as static bias)
297
- t0 = time.time()
298
- _, grafted_offline, graft_info = self._score_choices_offline(sample)
299
- # Blend: 50% LLM score + 50% Tensegrity posterior
300
- grafted_scores = [
301
- 0.5 * b + 0.5 * g
302
- for b, g in zip(baseline_scores, grafted_offline)
303
- ]
304
- t_grafted = time.time() - t0 + t_baseline # Includes LLM time
305
-
306
- posteriors = graft_info["posteriors"]
307
- entropy = graft_info["entropy"]
308
- emitted = graft_info["emitted"]
309
-
310
- elif self.mode == "offline":
311
- t0 = time.time()
312
- baseline_scores, grafted_scores, graft_info = self._score_choices_offline(sample)
313
- t_elapsed = time.time() - t0
314
-
315
- t_baseline = t_elapsed / 2
316
- t_grafted = t_elapsed / 2
317
- posteriors = graft_info["posteriors"]
318
- entropy = graft_info["entropy"]
319
- emitted = graft_info["emitted"]
320
  else:
321
- raise ValueError(f"Unknown mode: {self.mode}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
 
323
  baseline_pred = int(np.argmax(baseline_scores))
324
  grafted_pred = int(np.argmax(grafted_scores))
325
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
326
  return SampleResult(
327
  sample_id=sample.id,
328
  task=sample.metadata.get("task", ""),
329
  gold=sample.gold,
 
330
  baseline_pred=baseline_pred,
331
  grafted_pred=grafted_pred,
332
- baseline_correct=(baseline_pred == sample.gold),
333
- grafted_correct=(grafted_pred == sample.gold),
334
  baseline_scores=baseline_scores,
335
  grafted_scores=grafted_scores,
336
- graft_posteriors=posteriors,
337
  graft_entropy=entropy,
338
- graft_emitted=emitted,
339
- wall_time_baseline=t_baseline,
340
- wall_time_grafted=t_grafted,
 
 
341
  )
342
 
343
  def evaluate_task(self, task_name: str,
344
  max_samples: Optional[int] = None,
345
  verbose: bool = False) -> TaskResult:
346
- """Evaluate all samples in a task."""
347
  config = TASK_REGISTRY[task_name]
348
  samples = load_task_samples(task_name, max_samples)
349
 
350
  if verbose:
351
- print(f" [{task_name}] Loading {len(samples)} samples...")
352
 
353
  results = []
354
  for i, sample in enumerate(samples):
@@ -357,65 +405,78 @@ class EvalRunner:
357
  if verbose and (i + 1) % 100 == 0:
358
  acc_b = sum(1 for x in results if x.baseline_correct) / len(results)
359
  acc_g = sum(1 for x in results if x.grafted_correct) / len(results)
360
- print(f" {i+1}/{len(samples)}: baseline={acc_b:.1%} grafted={acc_g:.1%}")
361
 
362
  n = len(results)
363
  if n == 0:
364
  return TaskResult(
365
- task=task_name, domain=config.domain, n_samples=0,
366
  baseline_accuracy=0, grafted_accuracy=0, delta=0,
367
  baseline_correct=0, grafted_correct=0,
368
- mean_graft_entropy=0, mean_graft_emitted_rate=0,
369
- mean_wall_time_baseline=0, mean_wall_time_grafted=0,
370
- speedup=1.0,
371
  )
372
 
373
  bl_correct = sum(1 for r in results if r.baseline_correct)
374
  gr_correct = sum(1 for r in results if r.grafted_correct)
375
- bl_acc = bl_correct / n
376
- gr_acc = gr_correct / n
377
 
378
- mean_bl_time = np.mean([r.wall_time_baseline for r in results])
379
- mean_gr_time = np.mean([r.wall_time_grafted for r in results])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
380
 
381
  return TaskResult(
382
  task=task_name,
383
  domain=config.domain,
384
  n_samples=n,
385
- baseline_accuracy=bl_acc,
386
- grafted_accuracy=gr_acc,
387
- delta=gr_acc - bl_acc,
 
388
  baseline_correct=bl_correct,
389
  grafted_correct=gr_correct,
 
 
 
 
390
  mean_graft_entropy=np.mean([r.graft_entropy for r in results]),
391
- mean_graft_emitted_rate=np.mean([r.graft_emitted for r in results]),
392
- mean_wall_time_baseline=mean_bl_time,
393
- mean_wall_time_grafted=mean_gr_time,
394
- speedup=mean_bl_time / max(mean_gr_time, 1e-9),
395
  )
396
 
397
  def run_benchmark(self, tasks: Optional[List[str]] = None,
398
  max_samples_per_task: Optional[int] = None,
399
  verbose: bool = True) -> BenchmarkResult:
400
- """
401
- Run the full benchmark across multiple tasks.
402
-
403
- Args:
404
- tasks: List of task names. None = all tasks.
405
- max_samples_per_task: Cap per task (for fast dev runs).
406
- verbose: Print progress.
407
- """
408
  if tasks is None:
409
  tasks = list(TASK_REGISTRY.keys())
410
 
411
  if verbose:
412
  print(f"\n{'β–ˆ' * 60}")
413
  print(f" TENSEGRITY BENCHMARK")
414
- print(f" Model: {self.model_name}")
415
- print(f" Mode: {self.mode}")
416
- print(f" Tasks: {len(tasks)}")
 
417
  cap_str = str(max_samples_per_task) if max_samples_per_task else "all"
418
- print(f" Samples/task: {cap_str}")
419
  print(f"{'β–ˆ' * 60}")
420
 
421
  t_start = time.time()
@@ -430,44 +491,96 @@ class EvalRunner:
430
  task_results.append(tr)
431
  if verbose:
432
  sign = "+" if tr.delta >= 0 else ""
433
- print(f" β†’ baseline={tr.baseline_accuracy:.1%} "
434
- f"grafted={tr.grafted_accuracy:.1%} "
435
- f"Ξ”={sign}{tr.delta:.1%} "
436
- f"(n={tr.n_samples}, emit={tr.mean_graft_emitted_rate:.0%})")
 
437
  except Exception as e:
438
  logger.error(f"Task {task_name} failed: {e}")
439
  if verbose:
440
  print(f" βœ— FAILED: {e}")
 
441
 
442
  total_time = time.time() - t_start
443
 
444
- # Aggregate
445
  total_bl = sum(t.baseline_correct for t in task_results)
446
  total_gr = sum(t.grafted_correct for t in task_results)
447
  total_n = sum(t.n_samples for t in task_results)
448
 
449
- overall_bl = total_bl / max(total_n, 1)
450
- overall_gr = total_gr / max(total_n, 1)
 
 
 
 
451
 
452
  result = BenchmarkResult(
453
  model_name=self.model_name,
 
 
454
  tasks=task_results,
455
- overall_baseline_accuracy=overall_bl,
456
- overall_grafted_accuracy=overall_gr,
457
- overall_delta=overall_gr - overall_bl,
 
458
  total_samples=total_n,
459
  total_wall_time=total_time,
460
  timestamp=time.strftime("%Y-%m-%d %H:%M:%S"),
461
  )
462
 
463
  if verbose:
464
- print(f"\n{'═' * 68}")
465
  print(result.summary_table())
466
- print(f"\nTotal time: {total_time:.1f}s")
467
- print(f"{'═' * 68}")
 
 
 
 
468
 
469
  return result
470
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
471
  def save_results(self, result: BenchmarkResult, path: str):
472
  """Save benchmark results to JSON."""
473
  with open(path, "w") as f:
 
7
  P(choice | prompt) computed from raw logits.
8
  Prediction = argmax over choices.
9
 
10
+ GRAFTED: score(choice) = llm_logprob(choice) + λ * tensegrity_score(choice)
11
+ Where λ controls the graft weight. λ=0 recovers baseline.
12
+
13
+ The ONLY difference is the additive Tensegrity term.
 
 
 
 
14
  This is a controlled A/B comparison.
15
 
16
+ Metrics per task:
17
+ - raw_acc: baseline accuracy
18
+ - grafted_acc: grafted accuracy
19
+ - delta: grafted - baseline
20
+ - coverage: fraction of samples where graft posteriors are non-uniform
21
+ - cond_acc_biased: accuracy on the subset where graft was non-uniform
22
+ - mean_bias_mag: mean max absolute Tensegrity score deviation from uniform
23
+ - flip_rate: fraction of samples where baseline_pred != grafted_pred
24
+ - good_flips: LLM wrong → graft right
25
+ - bad_flips: LLM right → graft wrong
26
+ - preserved: LLM right → graft right
27
+ - neutral: LLM wrong → graft wrong
28
  """
29
 
30
  import numpy as np
 
32
  import json
33
  import logging
34
  from typing import Dict, List, Optional, Any, Tuple
35
+ from dataclasses import dataclass, field
36
  from pathlib import Path
37
 
38
  from tensegrity.bench.tasks import TaskSample, TaskConfig, TASK_REGISTRY, load_task_samples
 
47
  sample_id: str
48
  task: str
49
  gold: int
50
+ n_choices: int
51
  baseline_pred: int
52
  grafted_pred: int
53
  baseline_correct: bool
54
  grafted_correct: bool
55
  baseline_scores: List[float]
56
  grafted_scores: List[float]
57
+ tensegrity_scores: List[float] # Raw Tensegrity posteriors (pre-blend)
58
+ graft_entropy: float # Normalized entropy of Tensegrity posteriors
59
+ bias_applied: bool # Did Tensegrity posteriors differ from uniform?
60
+ bias_magnitude: float # Max absolute deviation from uniform
61
+ flip_type: str # "good_flip", "bad_flip", "preserved", "neutral", "no_flip"
62
+ lam: float # Ξ» used for this evaluation
63
+ wall_time: float
64
+
65
+
66
+ @dataclass
67
+ class FlipAccounting:
68
+ """Flip analysis for one task."""
69
+ good_flips: int = 0 # LLM wrong β†’ graft right
70
+ bad_flips: int = 0 # LLM right β†’ graft wrong
71
+ preserved: int = 0 # LLM right β†’ graft right
72
+ neutral: int = 0 # LLM wrong β†’ graft wrong (no change)
73
+ no_flip: int = 0 # Same prediction (subset of preserved + neutral)
74
+
75
+ @property
76
+ def total(self):
77
+ return self.good_flips + self.bad_flips + self.preserved + self.neutral
78
+
79
+ @property
80
+ def flip_rate(self):
81
+ return (self.good_flips + self.bad_flips) / max(self.total, 1)
82
+
83
+ @property
84
+ def good_bad_ratio(self):
85
+ if self.bad_flips == 0:
86
+ return float('inf') if self.good_flips > 0 else 0.0
87
+ return self.good_flips / self.bad_flips
88
+
89
+ def to_dict(self):
90
+ return {
91
+ "good_flips": self.good_flips,
92
+ "bad_flips": self.bad_flips,
93
+ "preserved": self.preserved,
94
+ "neutral": self.neutral,
95
+ "flip_rate": round(self.flip_rate, 4),
96
+ "good_bad_ratio": round(self.good_bad_ratio, 2) if self.good_bad_ratio != float('inf') else "inf",
97
+ }
98
 
99
 
100
  @dataclass
 
103
  task: str
104
  domain: str
105
  n_samples: int
106
+ lam: float
107
+ # Core accuracy
108
  baseline_accuracy: float
109
  grafted_accuracy: float
110
+ delta: float
111
  baseline_correct: int
112
  grafted_correct: int
113
+ # Graft diagnostics
114
+ coverage: float # Fraction where bias_applied=True
115
+ cond_acc_biased: float # Accuracy only on samples where bias was applied
116
+ cond_acc_unbiased: float # Accuracy only on samples where bias was NOT applied
117
+ mean_bias_magnitude: float
118
  mean_graft_entropy: float
119
+ # Flips
120
+ flips: FlipAccounting
121
+ # Timing
122
+ mean_wall_time: float
123
 
124
 
125
  @dataclass
126
  class BenchmarkResult:
127
  """Full benchmark result across all tasks."""
128
  model_name: str
129
+ mode: str
130
+ lam: float
131
  tasks: List[TaskResult]
132
  overall_baseline_accuracy: float
133
  overall_grafted_accuracy: float
134
  overall_delta: float
135
+ overall_flips: FlipAccounting
136
  total_samples: int
137
  total_wall_time: float
138
  timestamp: str = ""
 
140
  def to_dict(self) -> dict:
141
  return {
142
  "model": self.model_name,
143
+ "mode": self.mode,
144
+ "lambda": self.lam,
145
  "overall": {
146
  "baseline_accuracy": round(self.overall_baseline_accuracy, 4),
147
  "grafted_accuracy": round(self.overall_grafted_accuracy, 4),
148
  "delta": round(self.overall_delta, 4),
149
  "total_samples": self.total_samples,
150
  "wall_time_s": round(self.total_wall_time, 1),
151
+ "flips": self.overall_flips.to_dict(),
152
  },
153
  "tasks": [
154
  {
155
  "task": t.task,
156
  "domain": t.domain,
157
  "n": t.n_samples,
158
+ "lambda": t.lam,
159
  "baseline": round(t.baseline_accuracy, 4),
160
  "grafted": round(t.grafted_accuracy, 4),
161
  "delta": round(t.delta, 4),
162
+ "coverage": round(t.coverage, 3),
163
+ "cond_acc_biased": round(t.cond_acc_biased, 4),
164
+ "mean_bias_mag": round(t.mean_bias_magnitude, 4),
165
+ "mean_entropy": round(t.mean_graft_entropy, 3),
166
+ "flips": t.flips.to_dict(),
167
  }
168
  for t in self.tasks
169
  ],
 
171
 
172
  def summary_table(self) -> str:
173
  lines = []
174
+ lines.append(f"{'Task':<22} {'N':>5} {'Base':>7} {'Graft':>7} {'Ξ”':>7}"
175
+ f" {'Cov':>5} {'G/B':>6} {'Gβ†’βœ“':>4} {'Gβ†’βœ—':>4}")
176
+ lines.append("─" * 75)
177
  for t in sorted(self.tasks, key=lambda x: x.delta, reverse=True):
178
  sign = "+" if t.delta >= 0 else ""
179
+ gb = t.flips.good_bad_ratio
180
+ gb_str = f"{gb:.1f}" if gb != float('inf') else "∞"
181
  lines.append(
182
+ f"{t.task:<22} {t.n_samples:>5} {t.baseline_accuracy:>6.1%} "
183
+ f"{t.grafted_accuracy:>6.1%} {sign}{t.delta:>6.1%}"
184
+ f" {t.coverage:>4.0%} {gb_str:>6} {t.flips.good_flips:>4} {t.flips.bad_flips:>4}"
185
  )
186
+ lines.append("─" * 75)
187
  sign = "+" if self.overall_delta >= 0 else ""
188
+ gb = self.overall_flips.good_bad_ratio
189
+ gb_str = f"{gb:.1f}" if gb != float('inf') else "∞"
190
  lines.append(
191
+ f"{'OVERALL':<22} {self.total_samples:>5} {self.overall_baseline_accuracy:>6.1%} "
192
+ f"{self.overall_grafted_accuracy:>6.1%} {sign}{self.overall_delta:>6.1%}"
193
+ f" {'':>5} {gb_str:>6} "
194
+ f"{self.overall_flips.good_flips:>4} {self.overall_flips.bad_flips:>4}"
195
  )
196
  return "\n".join(lines)
197
 
 
202
 
203
  Modes:
204
  "local" β€” Uses transformers model with LogitsProcessor
205
+ "offline" β€” No LLM; baseline = random, grafted = Tensegrity posteriors
206
  (tests the cognitive layer in isolation)
207
+
208
+ λ parameter:
209
+ score(choice) = baseline_score(choice) + λ * tensegrity_score(choice)
210
+ λ=0 → pure baseline. λ>0 → graft contributes. Sweep to find optimal.
211
  """
212
 
213
  def __init__(self,
214
  model_name: str = "meta-llama/Llama-3.2-1B-Instruct",
215
  mode: str = "offline",
216
+ lam: float = 1.0,
 
217
  seed: int = 42):
218
+ """
219
+ Args:
220
+ model_name: HF model ID for local mode
221
+ mode: "offline" or "local"
222
+ lam: λ — graft weight. score = baseline + λ * tensegrity
223
+ seed: Random seed
224
+ """
225
  self.model_name = model_name
226
  self.mode = mode
227
+ self.lam = lam
 
228
  self.seed = seed
229
 
 
230
  self._model = None
231
  self._tokenizer = None
232
 
233
  def _init_model(self):
234
+ if self._model is not None or self.mode != "local":
 
235
  return
 
 
 
236
  from transformers import AutoTokenizer, AutoModelForCausalLM
237
+ import torch
238
  dtype, device_map, move_to = inference_load_settings()
239
  logger.info(f"Loading model {self.model_name}...")
240
  self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
 
248
  if move_to is not None:
249
  self._model = self._model.to(move_to)
250
  self._model.eval()
 
251
 
252
  # ─── SCORING ────────────────────────────────────────────
253
 
254
+ def _score_choices_local(self, prompt: str, choices: List[str]) -> List[float]:
255
+ """Score each choice by log P(choice | prompt)."""
 
 
 
 
 
 
256
  import torch
 
 
257
  scores = []
258
  for choice in choices:
259
  full_text = f"{prompt} {choice}"
 
261
  truncation=True, max_length=512)
262
  if hasattr(self._model, 'device'):
263
  inputs = {k: v.to(self._model.device) for k, v in inputs.items()}
 
264
  with torch.no_grad():
265
  outputs = self._model(**inputs)
266
+ logits = outputs.logits
 
 
267
  prompt_ids = self._tokenizer(prompt, return_tensors="pt",
268
  truncation=True, max_length=512)["input_ids"]
269
  n_prompt = prompt_ids.shape[1]
270
  n_total = inputs["input_ids"].shape[1]
 
 
271
  log_probs = torch.nn.functional.log_softmax(logits[0], dim=-1)
272
  choice_log_prob = 0.0
273
  for pos in range(n_prompt, n_total):
274
  token_id = inputs["input_ids"][0, pos].item()
275
  choice_log_prob += log_probs[pos - 1, token_id].item()
 
 
276
  n_choice_tokens = max(n_total - n_prompt, 1)
277
  scores.append(choice_log_prob / n_choice_tokens)
 
278
  return scores
279
 
280
+ def _get_tensegrity_scores(self, sample: TaskSample) -> Tuple[List[float], float]:
281
  """
282
+ Run Tensegrity cognitive layer on a sample.
283
+ Returns (posteriors_list, normalized_entropy).
 
 
 
 
284
  """
285
  from tensegrity.broca.controller import CognitiveController
286
 
287
  n = len(sample.choices)
 
 
 
 
 
288
  controller = CognitiveController(
289
  n_hypotheses=n,
290
  hypothesis_labels=[f"choice_{i}" for i in range(n)],
291
  use_llm=False,
292
  )
 
 
 
293
  for i, hyp in enumerate(controller.belief_state.hypotheses):
294
+ hyp.description = sample.choices[i][:50]
295
 
296
+ controller.step(sample.prompt)
297
 
298
+ posteriors = [
 
 
 
299
  controller.belief_state.hypotheses[i].probability
300
  for i in range(n)
301
  ]
302
 
303
+ probs = np.array(posteriors)
 
304
  probs = probs[probs > 0]
305
  if len(probs) > 1:
306
  entropy = float(-np.sum(probs * np.log(probs + 1e-16)) / np.log(len(probs)))
307
  else:
308
  entropy = 0.0
309
 
310
+ return posteriors, entropy
 
 
 
 
 
 
 
 
311
 
312
  # ─── EVALUATION ─────────────────────────────────────────
313
 
314
  def evaluate_sample(self, sample: TaskSample) -> SampleResult:
315
+ """Evaluate a single sample with full diagnostics."""
316
+ t0 = time.time()
317
+ n = len(sample.choices)
318
+ uniform = 1.0 / n
319
+
320
+ # Get Tensegrity scores
321
+ tensegrity_scores, entropy = self._get_tensegrity_scores(sample)
322
+
323
+ # Compute bias diagnostics
324
+ deviations = [abs(s - uniform) for s in tensegrity_scores]
325
+ bias_magnitude = max(deviations)
326
+ # bias_applied = posteriors are meaningfully non-uniform
327
+ bias_applied = bias_magnitude > 0.02 # More than 2% deviation from uniform
328
+
329
+ # Get baseline scores
330
  if self.mode == "local":
331
  self._init_model()
 
 
332
  baseline_scores = self._score_choices_local(sample.prompt, sample.choices)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333
  else:
334
+ # Offline: random baseline (seeded by sample ID for reproducibility)
335
+ rng = np.random.RandomState(hash(sample.id) % 2**31)
336
+ baseline_scores = rng.randn(n).tolist()
337
+
338
+ # Grafted: baseline + Ξ» * tensegrity
339
+ # Normalize tensegrity scores to be on comparable scale to baseline
340
+ # In offline mode, baseline is N(0,1), tensegrity is [0,1] probabilities
341
+ # In local mode, baseline is log-probs (~[-5, 0]), tensegrity is [0,1]
342
+ # Convert tensegrity to log-odds for better scale matching
343
+ tensegrity_logodds = [
344
+ np.log(max(s, 1e-9)) - np.log(uniform)
345
+ for s in tensegrity_scores
346
+ ]
347
+
348
+ grafted_scores = [
349
+ b + self.lam * t
350
+ for b, t in zip(baseline_scores, tensegrity_logodds)
351
+ ]
352
 
353
  baseline_pred = int(np.argmax(baseline_scores))
354
  grafted_pred = int(np.argmax(grafted_scores))
355
 
356
+ baseline_correct = (baseline_pred == sample.gold)
357
+ grafted_correct = (grafted_pred == sample.gold)
358
+
359
+ # Flip classification
360
+ if baseline_pred == grafted_pred:
361
+ flip_type = "preserved" if baseline_correct else "neutral"
362
+ elif not baseline_correct and grafted_correct:
363
+ flip_type = "good_flip"
364
+ elif baseline_correct and not grafted_correct:
365
+ flip_type = "bad_flip"
366
+ else:
367
+ flip_type = "neutral" # Both wrong, different wrong answers
368
+
369
+ wall_time = time.time() - t0
370
+
371
  return SampleResult(
372
  sample_id=sample.id,
373
  task=sample.metadata.get("task", ""),
374
  gold=sample.gold,
375
+ n_choices=n,
376
  baseline_pred=baseline_pred,
377
  grafted_pred=grafted_pred,
378
+ baseline_correct=baseline_correct,
379
+ grafted_correct=grafted_correct,
380
  baseline_scores=baseline_scores,
381
  grafted_scores=grafted_scores,
382
+ tensegrity_scores=tensegrity_scores,
383
  graft_entropy=entropy,
384
+ bias_applied=bias_applied,
385
+ bias_magnitude=bias_magnitude,
386
+ flip_type=flip_type,
387
+ lam=self.lam,
388
+ wall_time=wall_time,
389
  )
390
 
391
  def evaluate_task(self, task_name: str,
392
  max_samples: Optional[int] = None,
393
  verbose: bool = False) -> TaskResult:
394
+ """Evaluate all samples in a task with full flip accounting."""
395
  config = TASK_REGISTRY[task_name]
396
  samples = load_task_samples(task_name, max_samples)
397
 
398
  if verbose:
399
+ print(f" [{task_name}] Loaded {len(samples)} samples")
400
 
401
  results = []
402
  for i, sample in enumerate(samples):
 
405
  if verbose and (i + 1) % 100 == 0:
406
  acc_b = sum(1 for x in results if x.baseline_correct) / len(results)
407
  acc_g = sum(1 for x in results if x.grafted_correct) / len(results)
408
+ print(f" {i+1}/{len(samples)}: base={acc_b:.1%} graft={acc_g:.1%}")
409
 
410
  n = len(results)
411
  if n == 0:
412
  return TaskResult(
413
+ task=task_name, domain=config.domain, n_samples=0, lam=self.lam,
414
  baseline_accuracy=0, grafted_accuracy=0, delta=0,
415
  baseline_correct=0, grafted_correct=0,
416
+ coverage=0, cond_acc_biased=0, cond_acc_unbiased=0,
417
+ mean_bias_magnitude=0, mean_graft_entropy=0,
418
+ flips=FlipAccounting(), mean_wall_time=0,
419
  )
420
 
421
  bl_correct = sum(1 for r in results if r.baseline_correct)
422
  gr_correct = sum(1 for r in results if r.grafted_correct)
 
 
423
 
424
+ # Flip accounting
425
+ flips = FlipAccounting()
426
+ for r in results:
427
+ if r.flip_type == "good_flip":
428
+ flips.good_flips += 1
429
+ elif r.flip_type == "bad_flip":
430
+ flips.bad_flips += 1
431
+ elif r.flip_type == "preserved":
432
+ flips.preserved += 1
433
+ elif r.flip_type == "neutral":
434
+ flips.neutral += 1
435
+
436
+ # Coverage: fraction where bias was non-trivial
437
+ biased = [r for r in results if r.bias_applied]
438
+ coverage = len(biased) / n
439
+
440
+ # Conditional accuracy
441
+ cond_acc_biased = (sum(1 for r in biased if r.grafted_correct) / len(biased)) if biased else 0.0
442
+ unbiased = [r for r in results if not r.bias_applied]
443
+ cond_acc_unbiased = (sum(1 for r in unbiased if r.grafted_correct) / len(unbiased)) if unbiased else 0.0
444
 
445
  return TaskResult(
446
  task=task_name,
447
  domain=config.domain,
448
  n_samples=n,
449
+ lam=self.lam,
450
+ baseline_accuracy=bl_correct / n,
451
+ grafted_accuracy=gr_correct / n,
452
+ delta=(gr_correct - bl_correct) / n,
453
  baseline_correct=bl_correct,
454
  grafted_correct=gr_correct,
455
+ coverage=coverage,
456
+ cond_acc_biased=cond_acc_biased,
457
+ cond_acc_unbiased=cond_acc_unbiased,
458
+ mean_bias_magnitude=np.mean([r.bias_magnitude for r in results]),
459
  mean_graft_entropy=np.mean([r.graft_entropy for r in results]),
460
+ flips=flips,
461
+ mean_wall_time=np.mean([r.wall_time for r in results]),
 
 
462
  )
463
 
464
  def run_benchmark(self, tasks: Optional[List[str]] = None,
465
  max_samples_per_task: Optional[int] = None,
466
  verbose: bool = True) -> BenchmarkResult:
467
+ """Run the full benchmark across multiple tasks."""
 
 
 
 
 
 
 
468
  if tasks is None:
469
  tasks = list(TASK_REGISTRY.keys())
470
 
471
  if verbose:
472
  print(f"\n{'β–ˆ' * 60}")
473
  print(f" TENSEGRITY BENCHMARK")
474
+ print(f" Model: {self.model_name}")
475
+ print(f" Mode: {self.mode}")
476
+ print(f" Ξ»: {self.lam}")
477
+ print(f" Tasks: {len(tasks)}")
478
  cap_str = str(max_samples_per_task) if max_samples_per_task else "all"
479
+ print(f" N/task: {cap_str}")
480
  print(f"{'β–ˆ' * 60}")
481
 
482
  t_start = time.time()
 
491
  task_results.append(tr)
492
  if verbose:
493
  sign = "+" if tr.delta >= 0 else ""
494
+ gb = tr.flips.good_bad_ratio
495
+ gb_str = f"{gb:.1f}" if gb != float('inf') else "∞"
496
+ print(f" base={tr.baseline_accuracy:.1%} graft={tr.grafted_accuracy:.1%} "
497
+ f"Ξ”={sign}{tr.delta:.1%} cov={tr.coverage:.0%} "
498
+ f"flips={tr.flips.good_flips}↑/{tr.flips.bad_flips}↓ G/B={gb_str}")
499
  except Exception as e:
500
  logger.error(f"Task {task_name} failed: {e}")
501
  if verbose:
502
  print(f" βœ— FAILED: {e}")
503
+ import traceback; traceback.print_exc()
504
 
505
  total_time = time.time() - t_start
506
 
 
507
  total_bl = sum(t.baseline_correct for t in task_results)
508
  total_gr = sum(t.grafted_correct for t in task_results)
509
  total_n = sum(t.n_samples for t in task_results)
510
 
511
+ overall_flips = FlipAccounting()
512
+ for t in task_results:
513
+ overall_flips.good_flips += t.flips.good_flips
514
+ overall_flips.bad_flips += t.flips.bad_flips
515
+ overall_flips.preserved += t.flips.preserved
516
+ overall_flips.neutral += t.flips.neutral
517
 
518
  result = BenchmarkResult(
519
  model_name=self.model_name,
520
+ mode=self.mode,
521
+ lam=self.lam,
522
  tasks=task_results,
523
+ overall_baseline_accuracy=total_bl / max(total_n, 1),
524
+ overall_grafted_accuracy=total_gr / max(total_n, 1),
525
+ overall_delta=(total_gr - total_bl) / max(total_n, 1),
526
+ overall_flips=overall_flips,
527
  total_samples=total_n,
528
  total_wall_time=total_time,
529
  timestamp=time.strftime("%Y-%m-%d %H:%M:%S"),
530
  )
531
 
532
  if verbose:
533
+ print(f"\n{'═' * 75}")
534
  print(result.summary_table())
535
+ print(f"\n Ξ»={self.lam} Time={total_time:.1f}s")
536
+ print(f" Total flips: {overall_flips.good_flips}↑ good, "
537
+ f"{overall_flips.bad_flips}↓ bad, "
538
+ f"{overall_flips.preserved} preserved, "
539
+ f"{overall_flips.neutral} neutral")
540
+ print(f"{'═' * 75}")
541
 
542
  return result
543
 
544
+ def sweep_lambda(self, tasks: Optional[List[str]] = None,
545
+ lambdas: Optional[List[float]] = None,
546
+ max_samples_per_task: Optional[int] = None,
547
+ verbose: bool = True) -> List[BenchmarkResult]:
548
+ """
549
+ Sweep Ξ» to find optimal graft weight.
550
+
551
+ Args:
552
+ lambdas: Values to sweep. Default: [0, 0.1, 0.25, 0.5, 1.0, 2.0]
553
+ """
554
+ if lambdas is None:
555
+ lambdas = [0.0, 0.1, 0.25, 0.5, 1.0, 2.0]
556
+
557
+ if verbose:
558
+ print(f"\n{'β–ˆ' * 60}")
559
+ print(f" Ξ» SWEEP: {lambdas}")
560
+ print(f"{'β–ˆ' * 60}")
561
+
562
+ results = []
563
+ for lam_val in lambdas:
564
+ self.lam = lam_val
565
+ result = self.run_benchmark(tasks, max_samples_per_task, verbose=False)
566
+ results.append(result)
567
+
568
+ if verbose:
569
+ sign = "+" if result.overall_delta >= 0 else ""
570
+ gb = result.overall_flips.good_bad_ratio
571
+ gb_str = f"{gb:.1f}" if gb != float('inf') else "∞"
572
+ print(f" Ξ»={lam_val:<5} base={result.overall_baseline_accuracy:.1%} "
573
+ f"graft={result.overall_grafted_accuracy:.1%} "
574
+ f"Ξ”={sign}{result.overall_delta:.1%} G/B={gb_str} "
575
+ f"({result.overall_flips.good_flips}↑/{result.overall_flips.bad_flips}↓)")
576
+
577
+ if verbose:
578
+ # Find optimal Ξ»
579
+ best = max(results, key=lambda r: r.overall_delta)
580
+ print(f"\n Best Ξ» = {best.lam} β†’ Ξ” = {best.overall_delta:+.1%}")
581
+
582
+ return results
583
+
584
  def save_results(self, result: BenchmarkResult, path: str):
585
  """Save benchmark results to JSON."""
586
  with open(path, "w") as f: