Merge branch 'main' of https://huggingface.co/theapemachine/tensegrity
Browse files- tensegrity/bench/run.py +39 -20
- tensegrity/bench/runner.py +290 -177
tensegrity/bench/run.py
CHANGED
|
@@ -3,14 +3,17 @@
|
|
| 3 |
Tensegrity Benchmark CLI.
|
| 4 |
|
| 5 |
Usage:
|
| 6 |
-
# Quick
|
| 7 |
-
python -m tensegrity.bench.run --mode offline --max-samples
|
| 8 |
|
| 9 |
-
# Full offline benchmark
|
| 10 |
python -m tensegrity.bench.run --mode offline
|
| 11 |
|
|
|
|
|
|
|
|
|
|
| 12 |
# Local model benchmark (requires GPU):
|
| 13 |
-
python -m tensegrity.bench.run --mode local --model meta-llama/Llama-3.2-1B-Instruct
|
| 14 |
|
| 15 |
# Save results:
|
| 16 |
python -m tensegrity.bench.run --mode offline --output results.json
|
|
@@ -35,10 +38,12 @@ def main():
|
|
| 35 |
help="Comma-separated task names (default: all)")
|
| 36 |
parser.add_argument("--max-samples", type=int, default=None,
|
| 37 |
help="Max samples per task (default: all)")
|
| 38 |
-
parser.add_argument("--
|
| 39 |
-
help="
|
| 40 |
-
parser.add_argument("--
|
| 41 |
-
help="
|
|
|
|
|
|
|
| 42 |
parser.add_argument("--output", default=None,
|
| 43 |
help="Save results to JSON file")
|
| 44 |
parser.add_argument("--list-tasks", action="store_true",
|
|
@@ -62,22 +67,36 @@ def main():
|
|
| 62 |
runner = EvalRunner(
|
| 63 |
model_name=args.model,
|
| 64 |
mode=args.mode,
|
| 65 |
-
|
| 66 |
-
graft_entropy_gate=args.entropy_gate,
|
| 67 |
seed=args.seed,
|
| 68 |
)
|
| 69 |
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
else:
|
| 80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
|
| 83 |
if __name__ == "__main__":
|
|
|
|
| 3 |
Tensegrity Benchmark CLI.
|
| 4 |
|
| 5 |
Usage:
|
| 6 |
+
# Quick benchmark (offline, 50 samples/task):
|
| 7 |
+
python -m tensegrity.bench.run --mode offline --max-samples 50 --tasks copa,boolq,sciq
|
| 8 |
|
| 9 |
+
# Full offline benchmark:
|
| 10 |
python -m tensegrity.bench.run --mode offline
|
| 11 |
|
| 12 |
+
# λ sweep (find optimal graft weight):
|
| 13 |
+
python -m tensegrity.bench.run --sweep --max-samples 100 --tasks copa,sciq,truthfulqa
|
| 14 |
+
|
| 15 |
# Local model benchmark (requires GPU):
|
| 16 |
+
python -m tensegrity.bench.run --mode local --model meta-llama/Llama-3.2-1B-Instruct
|
| 17 |
|
| 18 |
# Save results:
|
| 19 |
python -m tensegrity.bench.run --mode offline --output results.json
|
|
|
|
| 38 |
help="Comma-separated task names (default: all)")
|
| 39 |
parser.add_argument("--max-samples", type=int, default=None,
|
| 40 |
help="Max samples per task (default: all)")
|
| 41 |
+
parser.add_argument("--lam", type=float, default=1.0,
|
| 42 |
+
help="λ — graft weight: score = baseline + λ*tensegrity (default: 1.0)")
|
| 43 |
+
parser.add_argument("--sweep", action="store_true",
|
| 44 |
+
help="Run λ sweep over [0, 0.1, 0.25, 0.5, 1.0, 2.0]")
|
| 45 |
+
parser.add_argument("--sweep-lambdas", default=None,
|
| 46 |
+
help="Custom λ values for sweep (comma-separated, e.g. 0,0.5,1,2,4)")
|
| 47 |
parser.add_argument("--output", default=None,
|
| 48 |
help="Save results to JSON file")
|
| 49 |
parser.add_argument("--list-tasks", action="store_true",
|
|
|
|
| 67 |
runner = EvalRunner(
|
| 68 |
model_name=args.model,
|
| 69 |
mode=args.mode,
|
| 70 |
+
lam=args.lam,
|
|
|
|
| 71 |
seed=args.seed,
|
| 72 |
)
|
| 73 |
|
| 74 |
+
if args.sweep:
|
| 75 |
+
lambdas = None
|
| 76 |
+
if args.sweep_lambdas:
|
| 77 |
+
lambdas = [float(x) for x in args.sweep_lambdas.split(",")]
|
| 78 |
+
results = runner.sweep_lambda(
|
| 79 |
+
tasks=tasks,
|
| 80 |
+
lambdas=lambdas,
|
| 81 |
+
max_samples_per_task=args.max_samples,
|
| 82 |
+
verbose=not args.quiet,
|
| 83 |
+
)
|
| 84 |
+
if args.output:
|
| 85 |
+
sweep_data = [r.to_dict() for r in results]
|
| 86 |
+
with open(args.output, "w") as f:
|
| 87 |
+
json.dump(sweep_data, f, indent=2)
|
| 88 |
+
print(f"\nSweep results saved to {args.output}")
|
| 89 |
else:
|
| 90 |
+
result = runner.run_benchmark(
|
| 91 |
+
tasks=tasks,
|
| 92 |
+
max_samples_per_task=args.max_samples,
|
| 93 |
+
verbose=not args.quiet,
|
| 94 |
+
)
|
| 95 |
+
if args.output:
|
| 96 |
+
runner.save_results(result, args.output)
|
| 97 |
+
print(f"\nResults saved to {args.output}")
|
| 98 |
+
elif not args.quiet:
|
| 99 |
+
print(f"\n{json.dumps(result.to_dict(), indent=2)}")
|
| 100 |
|
| 101 |
|
| 102 |
if __name__ == "__main__":
|
tensegrity/bench/runner.py
CHANGED
|
@@ -7,21 +7,24 @@ Two evaluation modes per sample:
|
|
| 7 |
P(choice | prompt) computed from raw logits.
|
| 8 |
Prediction = argmax over choices.
|
| 9 |
|
| 10 |
-
GRAFTED:
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
biased scores.
|
| 15 |
-
|
| 16 |
-
Both modes use identical prompts, identical model, identical decoding.
|
| 17 |
-
The ONLY difference is the presence/absence of the logit-bias graft.
|
| 18 |
This is a controlled A/B comparison.
|
| 19 |
|
| 20 |
-
Metrics:
|
| 21 |
-
-
|
| 22 |
-
-
|
| 23 |
-
- delta:
|
| 24 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
"""
|
| 26 |
|
| 27 |
import numpy as np
|
|
@@ -29,7 +32,7 @@ import time
|
|
| 29 |
import json
|
| 30 |
import logging
|
| 31 |
from typing import Dict, List, Optional, Any, Tuple
|
| 32 |
-
from dataclasses import dataclass, field
|
| 33 |
from pathlib import Path
|
| 34 |
|
| 35 |
from tensegrity.bench.tasks import TaskSample, TaskConfig, TASK_REGISTRY, load_task_samples
|
|
@@ -44,17 +47,54 @@ class SampleResult:
|
|
| 44 |
sample_id: str
|
| 45 |
task: str
|
| 46 |
gold: int
|
|
|
|
| 47 |
baseline_pred: int
|
| 48 |
grafted_pred: int
|
| 49 |
baseline_correct: bool
|
| 50 |
grafted_correct: bool
|
| 51 |
baseline_scores: List[float]
|
| 52 |
grafted_scores: List[float]
|
| 53 |
-
|
| 54 |
-
graft_entropy: float
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
|
| 60 |
@dataclass
|
|
@@ -63,26 +103,36 @@ class TaskResult:
|
|
| 63 |
task: str
|
| 64 |
domain: str
|
| 65 |
n_samples: int
|
|
|
|
|
|
|
| 66 |
baseline_accuracy: float
|
| 67 |
grafted_accuracy: float
|
| 68 |
-
delta: float
|
| 69 |
baseline_correct: int
|
| 70 |
grafted_correct: int
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
mean_graft_entropy: float
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
|
| 77 |
|
| 78 |
@dataclass
|
| 79 |
class BenchmarkResult:
|
| 80 |
"""Full benchmark result across all tasks."""
|
| 81 |
model_name: str
|
|
|
|
|
|
|
| 82 |
tasks: List[TaskResult]
|
| 83 |
overall_baseline_accuracy: float
|
| 84 |
overall_grafted_accuracy: float
|
| 85 |
overall_delta: float
|
|
|
|
| 86 |
total_samples: int
|
| 87 |
total_wall_time: float
|
| 88 |
timestamp: str = ""
|
|
@@ -90,23 +140,30 @@ class BenchmarkResult:
|
|
| 90 |
def to_dict(self) -> dict:
|
| 91 |
return {
|
| 92 |
"model": self.model_name,
|
|
|
|
|
|
|
| 93 |
"overall": {
|
| 94 |
"baseline_accuracy": round(self.overall_baseline_accuracy, 4),
|
| 95 |
"grafted_accuracy": round(self.overall_grafted_accuracy, 4),
|
| 96 |
"delta": round(self.overall_delta, 4),
|
| 97 |
"total_samples": self.total_samples,
|
| 98 |
"wall_time_s": round(self.total_wall_time, 1),
|
|
|
|
| 99 |
},
|
| 100 |
"tasks": [
|
| 101 |
{
|
| 102 |
"task": t.task,
|
| 103 |
"domain": t.domain,
|
| 104 |
"n": t.n_samples,
|
|
|
|
| 105 |
"baseline": round(t.baseline_accuracy, 4),
|
| 106 |
"grafted": round(t.grafted_accuracy, 4),
|
| 107 |
"delta": round(t.delta, 4),
|
| 108 |
-
"
|
| 109 |
-
"
|
|
|
|
|
|
|
|
|
|
| 110 |
}
|
| 111 |
for t in self.tasks
|
| 112 |
],
|
|
@@ -114,19 +171,27 @@ class BenchmarkResult:
|
|
| 114 |
|
| 115 |
def summary_table(self) -> str:
|
| 116 |
lines = []
|
| 117 |
-
lines.append(f"{'Task':<
|
| 118 |
-
|
|
|
|
| 119 |
for t in sorted(self.tasks, key=lambda x: x.delta, reverse=True):
|
| 120 |
sign = "+" if t.delta >= 0 else ""
|
|
|
|
|
|
|
| 121 |
lines.append(
|
| 122 |
-
f"{t.task:<
|
| 123 |
-
f"{t.grafted_accuracy:>
|
|
|
|
| 124 |
)
|
| 125 |
-
lines.append("β" *
|
| 126 |
sign = "+" if self.overall_delta >= 0 else ""
|
|
|
|
|
|
|
| 127 |
lines.append(
|
| 128 |
-
f"{'OVERALL':<
|
| 129 |
-
f"{self.overall_grafted_accuracy:>
|
|
|
|
|
|
|
| 130 |
)
|
| 131 |
return "\n".join(lines)
|
| 132 |
|
|
@@ -137,35 +202,39 @@ class EvalRunner:
|
|
| 137 |
|
| 138 |
Modes:
|
| 139 |
"local" — Uses transformers model with LogitsProcessor
|
| 140 |
-
"offline" — No LLM;
|
| 141 |
(tests the cognitive layer in isolation)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
"""
|
| 143 |
|
| 144 |
def __init__(self,
|
| 145 |
model_name: str = "meta-llama/Llama-3.2-1B-Instruct",
|
| 146 |
mode: str = "offline",
|
| 147 |
-
|
| 148 |
-
graft_entropy_gate: float = 0.85,
|
| 149 |
seed: int = 42):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
self.model_name = model_name
|
| 151 |
self.mode = mode
|
| 152 |
-
self.
|
| 153 |
-
self.graft_entropy_gate = graft_entropy_gate
|
| 154 |
self.seed = seed
|
| 155 |
|
| 156 |
-
# Lazy-loaded
|
| 157 |
self._model = None
|
| 158 |
self._tokenizer = None
|
| 159 |
|
| 160 |
def _init_model(self):
|
| 161 |
-
|
| 162 |
-
if self._model is not None:
|
| 163 |
return
|
| 164 |
-
if self.mode != "local":
|
| 165 |
-
return
|
| 166 |
-
|
| 167 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 168 |
-
|
| 169 |
dtype, device_map, move_to = inference_load_settings()
|
| 170 |
logger.info(f"Loading model {self.model_name}...")
|
| 171 |
self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
|
|
@@ -179,21 +248,12 @@ class EvalRunner:
|
|
| 179 |
if move_to is not None:
|
| 180 |
self._model = self._model.to(move_to)
|
| 181 |
self._model.eval()
|
| 182 |
-
logger.info("Model loaded.")
|
| 183 |
|
| 184 |
# βββ SCORING ββββββββββββββββββββββββββββββββββββββββββββ
|
| 185 |
|
| 186 |
-
def _score_choices_local(self, prompt: str, choices: List[str]
|
| 187 |
-
|
| 188 |
-
"""
|
| 189 |
-
Score each choice by computing log P(choice | prompt).
|
| 190 |
-
|
| 191 |
-
For each choice, concatenate prompt + choice, compute the
|
| 192 |
-
sum of log-probs over the choice tokens only.
|
| 193 |
-
"""
|
| 194 |
import torch
|
| 195 |
-
from transformers import LogitsProcessorList
|
| 196 |
-
|
| 197 |
scores = []
|
| 198 |
for choice in choices:
|
| 199 |
full_text = f"{prompt} {choice}"
|
|
@@ -201,154 +261,142 @@ class EvalRunner:
|
|
| 201 |
truncation=True, max_length=512)
|
| 202 |
if hasattr(self._model, 'device'):
|
| 203 |
inputs = {k: v.to(self._model.device) for k, v in inputs.items()}
|
| 204 |
-
|
| 205 |
with torch.no_grad():
|
| 206 |
outputs = self._model(**inputs)
|
| 207 |
-
logits = outputs.logits
|
| 208 |
-
|
| 209 |
-
# Get log-probs for the choice tokens
|
| 210 |
prompt_ids = self._tokenizer(prompt, return_tensors="pt",
|
| 211 |
truncation=True, max_length=512)["input_ids"]
|
| 212 |
n_prompt = prompt_ids.shape[1]
|
| 213 |
n_total = inputs["input_ids"].shape[1]
|
| 214 |
-
|
| 215 |
-
# Sum log-probs of choice tokens
|
| 216 |
log_probs = torch.nn.functional.log_softmax(logits[0], dim=-1)
|
| 217 |
choice_log_prob = 0.0
|
| 218 |
for pos in range(n_prompt, n_total):
|
| 219 |
token_id = inputs["input_ids"][0, pos].item()
|
| 220 |
choice_log_prob += log_probs[pos - 1, token_id].item()
|
| 221 |
-
|
| 222 |
-
# Length-normalize
|
| 223 |
n_choice_tokens = max(n_total - n_prompt, 1)
|
| 224 |
scores.append(choice_log_prob / n_choice_tokens)
|
| 225 |
-
|
| 226 |
return scores
|
| 227 |
|
| 228 |
-
def
|
| 229 |
"""
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
Baseline: uniform random (represents an LLM with no reasoning)
|
| 233 |
-
Grafted: Tensegrity processes the prompt and scores choices via posteriors
|
| 234 |
-
|
| 235 |
-
Returns (baseline_scores, grafted_scores, graft_info)
|
| 236 |
"""
|
| 237 |
from tensegrity.broca.controller import CognitiveController
|
| 238 |
|
| 239 |
n = len(sample.choices)
|
| 240 |
-
# Baseline: uniform scores (random baseline)
|
| 241 |
-
rng = np.random.RandomState(hash(sample.id) % 2**31)
|
| 242 |
-
baseline_scores = rng.randn(n).tolist()
|
| 243 |
-
|
| 244 |
-
# Grafted: Tensegrity processes the prompt as observation
|
| 245 |
controller = CognitiveController(
|
| 246 |
n_hypotheses=n,
|
| 247 |
hypothesis_labels=[f"choice_{i}" for i in range(n)],
|
| 248 |
use_llm=False,
|
| 249 |
)
|
| 250 |
-
|
| 251 |
-
# Feed the prompt as an observation, using choice keywords for grounding
|
| 252 |
-
# Inject choice content into the hypothesis labels for the template parser
|
| 253 |
for i, hyp in enumerate(controller.belief_state.hypotheses):
|
| 254 |
-
hyp.description = sample.choices[i][:50]
|
| 255 |
|
| 256 |
-
|
| 257 |
|
| 258 |
-
|
| 259 |
-
posteriors = {h.description: h.probability
|
| 260 |
-
for h in controller.belief_state.hypotheses}
|
| 261 |
-
grafted_scores = [
|
| 262 |
controller.belief_state.hypotheses[i].probability
|
| 263 |
for i in range(n)
|
| 264 |
]
|
| 265 |
|
| 266 |
-
|
| 267 |
-
probs = np.array(grafted_scores)
|
| 268 |
probs = probs[probs > 0]
|
| 269 |
if len(probs) > 1:
|
| 270 |
entropy = float(-np.sum(probs * np.log(probs + 1e-16)) / np.log(len(probs)))
|
| 271 |
else:
|
| 272 |
entropy = 0.0
|
| 273 |
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
graft_info = {
|
| 277 |
-
"posteriors": posteriors,
|
| 278 |
-
"entropy": entropy,
|
| 279 |
-
"emitted": emitted,
|
| 280 |
-
}
|
| 281 |
-
|
| 282 |
-
return baseline_scores, grafted_scores, graft_info
|
| 283 |
|
| 284 |
# βββ EVALUATION βββββββββββββββββββββββββββββββββββββββββ
|
| 285 |
|
| 286 |
def evaluate_sample(self, sample: TaskSample) -> SampleResult:
|
| 287 |
-
"""Evaluate a single sample
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
if self.mode == "local":
|
| 289 |
self._init_model()
|
| 290 |
-
|
| 291 |
-
t0 = time.time()
|
| 292 |
baseline_scores = self._score_choices_local(sample.prompt, sample.choices)
|
| 293 |
-
t_baseline = time.time() - t0
|
| 294 |
-
|
| 295 |
-
# For grafted: build logit processor from Tensegrity beliefs
|
| 296 |
-
# (simplified: use offline posteriors as static bias)
|
| 297 |
-
t0 = time.time()
|
| 298 |
-
_, grafted_offline, graft_info = self._score_choices_offline(sample)
|
| 299 |
-
# Blend: 50% LLM score + 50% Tensegrity posterior
|
| 300 |
-
grafted_scores = [
|
| 301 |
-
0.5 * b + 0.5 * g
|
| 302 |
-
for b, g in zip(baseline_scores, grafted_offline)
|
| 303 |
-
]
|
| 304 |
-
t_grafted = time.time() - t0 + t_baseline # Includes LLM time
|
| 305 |
-
|
| 306 |
-
posteriors = graft_info["posteriors"]
|
| 307 |
-
entropy = graft_info["entropy"]
|
| 308 |
-
emitted = graft_info["emitted"]
|
| 309 |
-
|
| 310 |
-
elif self.mode == "offline":
|
| 311 |
-
t0 = time.time()
|
| 312 |
-
baseline_scores, grafted_scores, graft_info = self._score_choices_offline(sample)
|
| 313 |
-
t_elapsed = time.time() - t0
|
| 314 |
-
|
| 315 |
-
t_baseline = t_elapsed / 2
|
| 316 |
-
t_grafted = t_elapsed / 2
|
| 317 |
-
posteriors = graft_info["posteriors"]
|
| 318 |
-
entropy = graft_info["entropy"]
|
| 319 |
-
emitted = graft_info["emitted"]
|
| 320 |
else:
|
| 321 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 322 |
|
| 323 |
baseline_pred = int(np.argmax(baseline_scores))
|
| 324 |
grafted_pred = int(np.argmax(grafted_scores))
|
| 325 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 326 |
return SampleResult(
|
| 327 |
sample_id=sample.id,
|
| 328 |
task=sample.metadata.get("task", ""),
|
| 329 |
gold=sample.gold,
|
|
|
|
| 330 |
baseline_pred=baseline_pred,
|
| 331 |
grafted_pred=grafted_pred,
|
| 332 |
-
baseline_correct=
|
| 333 |
-
grafted_correct=
|
| 334 |
baseline_scores=baseline_scores,
|
| 335 |
grafted_scores=grafted_scores,
|
| 336 |
-
|
| 337 |
graft_entropy=entropy,
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
|
|
|
|
|
|
| 341 |
)
|
| 342 |
|
| 343 |
def evaluate_task(self, task_name: str,
|
| 344 |
max_samples: Optional[int] = None,
|
| 345 |
verbose: bool = False) -> TaskResult:
|
| 346 |
-
"""Evaluate all samples in a task."""
|
| 347 |
config = TASK_REGISTRY[task_name]
|
| 348 |
samples = load_task_samples(task_name, max_samples)
|
| 349 |
|
| 350 |
if verbose:
|
| 351 |
-
print(f" [{task_name}]
|
| 352 |
|
| 353 |
results = []
|
| 354 |
for i, sample in enumerate(samples):
|
|
@@ -357,65 +405,78 @@ class EvalRunner:
|
|
| 357 |
if verbose and (i + 1) % 100 == 0:
|
| 358 |
acc_b = sum(1 for x in results if x.baseline_correct) / len(results)
|
| 359 |
acc_g = sum(1 for x in results if x.grafted_correct) / len(results)
|
| 360 |
-
print(f" {i+1}/{len(samples)}:
|
| 361 |
|
| 362 |
n = len(results)
|
| 363 |
if n == 0:
|
| 364 |
return TaskResult(
|
| 365 |
-
task=task_name, domain=config.domain, n_samples=0,
|
| 366 |
baseline_accuracy=0, grafted_accuracy=0, delta=0,
|
| 367 |
baseline_correct=0, grafted_correct=0,
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
)
|
| 372 |
|
| 373 |
bl_correct = sum(1 for r in results if r.baseline_correct)
|
| 374 |
gr_correct = sum(1 for r in results if r.grafted_correct)
|
| 375 |
-
bl_acc = bl_correct / n
|
| 376 |
-
gr_acc = gr_correct / n
|
| 377 |
|
| 378 |
-
|
| 379 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
|
| 381 |
return TaskResult(
|
| 382 |
task=task_name,
|
| 383 |
domain=config.domain,
|
| 384 |
n_samples=n,
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
|
|
|
| 388 |
baseline_correct=bl_correct,
|
| 389 |
grafted_correct=gr_correct,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 390 |
mean_graft_entropy=np.mean([r.graft_entropy for r in results]),
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
mean_wall_time_grafted=mean_gr_time,
|
| 394 |
-
speedup=mean_bl_time / max(mean_gr_time, 1e-9),
|
| 395 |
)
|
| 396 |
|
| 397 |
def run_benchmark(self, tasks: Optional[List[str]] = None,
|
| 398 |
max_samples_per_task: Optional[int] = None,
|
| 399 |
verbose: bool = True) -> BenchmarkResult:
|
| 400 |
-
"""
|
| 401 |
-
Run the full benchmark across multiple tasks.
|
| 402 |
-
|
| 403 |
-
Args:
|
| 404 |
-
tasks: List of task names. None = all tasks.
|
| 405 |
-
max_samples_per_task: Cap per task (for fast dev runs).
|
| 406 |
-
verbose: Print progress.
|
| 407 |
-
"""
|
| 408 |
if tasks is None:
|
| 409 |
tasks = list(TASK_REGISTRY.keys())
|
| 410 |
|
| 411 |
if verbose:
|
| 412 |
print(f"\n{'β' * 60}")
|
| 413 |
print(f" TENSEGRITY BENCHMARK")
|
| 414 |
-
print(f" Model:
|
| 415 |
-
print(f" Mode:
|
| 416 |
-
print(f"
|
|
|
|
| 417 |
cap_str = str(max_samples_per_task) if max_samples_per_task else "all"
|
| 418 |
-
print(f"
|
| 419 |
print(f"{'β' * 60}")
|
| 420 |
|
| 421 |
t_start = time.time()
|
|
@@ -430,44 +491,96 @@ class EvalRunner:
|
|
| 430 |
task_results.append(tr)
|
| 431 |
if verbose:
|
| 432 |
sign = "+" if tr.delta >= 0 else ""
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
f"
|
|
|
|
| 437 |
except Exception as e:
|
| 438 |
logger.error(f"Task {task_name} failed: {e}")
|
| 439 |
if verbose:
|
| 440 |
print(f" β FAILED: {e}")
|
|
|
|
| 441 |
|
| 442 |
total_time = time.time() - t_start
|
| 443 |
|
| 444 |
-
# Aggregate
|
| 445 |
total_bl = sum(t.baseline_correct for t in task_results)
|
| 446 |
total_gr = sum(t.grafted_correct for t in task_results)
|
| 447 |
total_n = sum(t.n_samples for t in task_results)
|
| 448 |
|
| 449 |
-
|
| 450 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 451 |
|
| 452 |
result = BenchmarkResult(
|
| 453 |
model_name=self.model_name,
|
|
|
|
|
|
|
| 454 |
tasks=task_results,
|
| 455 |
-
overall_baseline_accuracy=
|
| 456 |
-
overall_grafted_accuracy=
|
| 457 |
-
overall_delta=
|
|
|
|
| 458 |
total_samples=total_n,
|
| 459 |
total_wall_time=total_time,
|
| 460 |
timestamp=time.strftime("%Y-%m-%d %H:%M:%S"),
|
| 461 |
)
|
| 462 |
|
| 463 |
if verbose:
|
| 464 |
-
print(f"\n{'β' *
|
| 465 |
print(result.summary_table())
|
| 466 |
-
print(f"\
|
| 467 |
-
print(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 468 |
|
| 469 |
return result
|
| 470 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 471 |
def save_results(self, result: BenchmarkResult, path: str):
|
| 472 |
"""Save benchmark results to JSON."""
|
| 473 |
with open(path, "w") as f:
|
|
|
|
| 7 |
P(choice | prompt) computed from raw logits.
|
| 8 |
Prediction = argmax over choices.
|
| 9 |
|
| 10 |
+
GRAFTED: score(choice) = llm_logprob(choice) + λ * tensegrity_score(choice)
|
| 11 |
+
Where λ controls the graft weight. λ=0 recovers baseline.
|
| 12 |
+
|
| 13 |
+
The ONLY difference is the additive Tensegrity term.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
This is a controlled A/B comparison.
|
| 15 |
|
| 16 |
+
Metrics per task:
|
| 17 |
+
- raw_acc: baseline accuracy
|
| 18 |
+
- grafted_acc: grafted accuracy
|
| 19 |
+
- delta: grafted - baseline
|
| 20 |
+
- coverage: fraction of samples where graft posteriors are non-uniform
|
| 21 |
+
- cond_acc_biased: accuracy on the subset where graft was non-uniform
|
| 22 |
+
- mean_bias_mag: mean max absolute Tensegrity score deviation from uniform
|
| 23 |
+
- flip_rate: fraction of samples where baseline_pred != grafted_pred
|
| 24 |
+
- good_flips: LLM wrong → graft right
|
| 25 |
+
- bad_flips: LLM right → graft wrong
|
| 26 |
+
- preserved: LLM right → graft right
|
| 27 |
+
- neutral: LLM wrong → graft wrong
|
| 28 |
"""
|
| 29 |
|
| 30 |
import numpy as np
|
|
|
|
| 32 |
import json
|
| 33 |
import logging
|
| 34 |
from typing import Dict, List, Optional, Any, Tuple
|
| 35 |
+
from dataclasses import dataclass, field
|
| 36 |
from pathlib import Path
|
| 37 |
|
| 38 |
from tensegrity.bench.tasks import TaskSample, TaskConfig, TASK_REGISTRY, load_task_samples
|
|
|
|
| 47 |
sample_id: str
|
| 48 |
task: str
|
| 49 |
gold: int
|
| 50 |
+
n_choices: int
|
| 51 |
baseline_pred: int
|
| 52 |
grafted_pred: int
|
| 53 |
baseline_correct: bool
|
| 54 |
grafted_correct: bool
|
| 55 |
baseline_scores: List[float]
|
| 56 |
grafted_scores: List[float]
|
| 57 |
+
tensegrity_scores: List[float] # Raw Tensegrity posteriors (pre-blend)
|
| 58 |
+
graft_entropy: float # Normalized entropy of Tensegrity posteriors
|
| 59 |
+
bias_applied: bool # Did Tensegrity posteriors differ from uniform?
|
| 60 |
+
bias_magnitude: float # Max absolute deviation from uniform
|
| 61 |
+
flip_type: str # "good_flip", "bad_flip", "preserved", "neutral", "no_flip"
|
| 62 |
+
lam: float # λ used for this evaluation
|
| 63 |
+
wall_time: float
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
@dataclass
|
| 67 |
+
class FlipAccounting:
|
| 68 |
+
"""Flip analysis for one task."""
|
| 69 |
+
good_flips: int = 0 # LLM wrong → graft right
|
| 70 |
+
bad_flips: int = 0 # LLM right → graft wrong
|
| 71 |
+
preserved: int = 0 # LLM right → graft right
|
| 72 |
+
neutral: int = 0 # LLM wrong → graft wrong (no change)
|
| 73 |
+
no_flip: int = 0 # Same prediction (subset of preserved + neutral)
|
| 74 |
+
|
| 75 |
+
@property
|
| 76 |
+
def total(self):
|
| 77 |
+
return self.good_flips + self.bad_flips + self.preserved + self.neutral
|
| 78 |
+
|
| 79 |
+
@property
|
| 80 |
+
def flip_rate(self):
|
| 81 |
+
return (self.good_flips + self.bad_flips) / max(self.total, 1)
|
| 82 |
+
|
| 83 |
+
@property
|
| 84 |
+
def good_bad_ratio(self):
|
| 85 |
+
if self.bad_flips == 0:
|
| 86 |
+
return float('inf') if self.good_flips > 0 else 0.0
|
| 87 |
+
return self.good_flips / self.bad_flips
|
| 88 |
+
|
| 89 |
+
def to_dict(self):
|
| 90 |
+
return {
|
| 91 |
+
"good_flips": self.good_flips,
|
| 92 |
+
"bad_flips": self.bad_flips,
|
| 93 |
+
"preserved": self.preserved,
|
| 94 |
+
"neutral": self.neutral,
|
| 95 |
+
"flip_rate": round(self.flip_rate, 4),
|
| 96 |
+
"good_bad_ratio": round(self.good_bad_ratio, 2) if self.good_bad_ratio != float('inf') else "inf",
|
| 97 |
+
}
|
| 98 |
|
| 99 |
|
| 100 |
@dataclass
|
|
|
|
| 103 |
task: str
|
| 104 |
domain: str
|
| 105 |
n_samples: int
|
| 106 |
+
lam: float
|
| 107 |
+
# Core accuracy
|
| 108 |
baseline_accuracy: float
|
| 109 |
grafted_accuracy: float
|
| 110 |
+
delta: float
|
| 111 |
baseline_correct: int
|
| 112 |
grafted_correct: int
|
| 113 |
+
# Graft diagnostics
|
| 114 |
+
coverage: float # Fraction where bias_applied=True
|
| 115 |
+
cond_acc_biased: float # Accuracy only on samples where bias was applied
|
| 116 |
+
cond_acc_unbiased: float # Accuracy only on samples where bias was NOT applied
|
| 117 |
+
mean_bias_magnitude: float
|
| 118 |
mean_graft_entropy: float
|
| 119 |
+
# Flips
|
| 120 |
+
flips: FlipAccounting
|
| 121 |
+
# Timing
|
| 122 |
+
mean_wall_time: float
|
| 123 |
|
| 124 |
|
| 125 |
@dataclass
|
| 126 |
class BenchmarkResult:
|
| 127 |
"""Full benchmark result across all tasks."""
|
| 128 |
model_name: str
|
| 129 |
+
mode: str
|
| 130 |
+
lam: float
|
| 131 |
tasks: List[TaskResult]
|
| 132 |
overall_baseline_accuracy: float
|
| 133 |
overall_grafted_accuracy: float
|
| 134 |
overall_delta: float
|
| 135 |
+
overall_flips: FlipAccounting
|
| 136 |
total_samples: int
|
| 137 |
total_wall_time: float
|
| 138 |
timestamp: str = ""
|
|
|
|
| 140 |
def to_dict(self) -> dict:
|
| 141 |
return {
|
| 142 |
"model": self.model_name,
|
| 143 |
+
"mode": self.mode,
|
| 144 |
+
"lambda": self.lam,
|
| 145 |
"overall": {
|
| 146 |
"baseline_accuracy": round(self.overall_baseline_accuracy, 4),
|
| 147 |
"grafted_accuracy": round(self.overall_grafted_accuracy, 4),
|
| 148 |
"delta": round(self.overall_delta, 4),
|
| 149 |
"total_samples": self.total_samples,
|
| 150 |
"wall_time_s": round(self.total_wall_time, 1),
|
| 151 |
+
"flips": self.overall_flips.to_dict(),
|
| 152 |
},
|
| 153 |
"tasks": [
|
| 154 |
{
|
| 155 |
"task": t.task,
|
| 156 |
"domain": t.domain,
|
| 157 |
"n": t.n_samples,
|
| 158 |
+
"lambda": t.lam,
|
| 159 |
"baseline": round(t.baseline_accuracy, 4),
|
| 160 |
"grafted": round(t.grafted_accuracy, 4),
|
| 161 |
"delta": round(t.delta, 4),
|
| 162 |
+
"coverage": round(t.coverage, 3),
|
| 163 |
+
"cond_acc_biased": round(t.cond_acc_biased, 4),
|
| 164 |
+
"mean_bias_mag": round(t.mean_bias_magnitude, 4),
|
| 165 |
+
"mean_entropy": round(t.mean_graft_entropy, 3),
|
| 166 |
+
"flips": t.flips.to_dict(),
|
| 167 |
}
|
| 168 |
for t in self.tasks
|
| 169 |
],
|
|
|
|
| 171 |
|
| 172 |
def summary_table(self) -> str:
|
| 173 |
lines = []
|
| 174 |
+
lines.append(f"{'Task':<22} {'N':>5} {'Base':>7} {'Graft':>7} {'Δ':>7}"
|
| 175 |
+
f" {'Cov':>5} {'G/B':>6} {'Gββ':>4} {'Gββ':>4}")
|
| 176 |
+
lines.append("β" * 75)
|
| 177 |
for t in sorted(self.tasks, key=lambda x: x.delta, reverse=True):
|
| 178 |
sign = "+" if t.delta >= 0 else ""
|
| 179 |
+
gb = t.flips.good_bad_ratio
|
| 180 |
+
gb_str = f"{gb:.1f}" if gb != float('inf') else "β"
|
| 181 |
lines.append(
|
| 182 |
+
f"{t.task:<22} {t.n_samples:>5} {t.baseline_accuracy:>6.1%} "
|
| 183 |
+
f"{t.grafted_accuracy:>6.1%} {sign}{t.delta:>6.1%}"
|
| 184 |
+
f" {t.coverage:>4.0%} {gb_str:>6} {t.flips.good_flips:>4} {t.flips.bad_flips:>4}"
|
| 185 |
)
|
| 186 |
+
lines.append("β" * 75)
|
| 187 |
sign = "+" if self.overall_delta >= 0 else ""
|
| 188 |
+
gb = self.overall_flips.good_bad_ratio
|
| 189 |
+
gb_str = f"{gb:.1f}" if gb != float('inf') else "β"
|
| 190 |
lines.append(
|
| 191 |
+
f"{'OVERALL':<22} {self.total_samples:>5} {self.overall_baseline_accuracy:>6.1%} "
|
| 192 |
+
f"{self.overall_grafted_accuracy:>6.1%} {sign}{self.overall_delta:>6.1%}"
|
| 193 |
+
f" {'':>5} {gb_str:>6} "
|
| 194 |
+
f"{self.overall_flips.good_flips:>4} {self.overall_flips.bad_flips:>4}"
|
| 195 |
)
|
| 196 |
return "\n".join(lines)
|
| 197 |
|
|
|
|
| 202 |
|
| 203 |
Modes:
|
| 204 |
"local" — Uses transformers model with LogitsProcessor
|
| 205 |
+
"offline" — No LLM; baseline = random, grafted = Tensegrity posteriors
|
| 206 |
(tests the cognitive layer in isolation)
|
| 207 |
+
|
| 208 |
+
λ parameter:
|
| 209 |
+
score(choice) = baseline_score(choice) + λ * tensegrity_score(choice)
|
| 210 |
+
λ=0 → pure baseline. λ>0 → graft contributes. Sweep to find optimal.
|
| 211 |
"""
|
| 212 |
|
| 213 |
def __init__(self,
|
| 214 |
model_name: str = "meta-llama/Llama-3.2-1B-Instruct",
|
| 215 |
mode: str = "offline",
|
| 216 |
+
lam: float = 1.0,
|
|
|
|
| 217 |
seed: int = 42):
|
| 218 |
+
"""
|
| 219 |
+
Args:
|
| 220 |
+
model_name: HF model ID for local mode
|
| 221 |
+
mode: "offline" or "local"
|
| 222 |
+
lam: λ — graft weight. score = baseline + λ * tensegrity
|
| 223 |
+
seed: Random seed
|
| 224 |
+
"""
|
| 225 |
self.model_name = model_name
|
| 226 |
self.mode = mode
|
| 227 |
+
self.lam = lam
|
|
|
|
| 228 |
self.seed = seed
|
| 229 |
|
|
|
|
| 230 |
self._model = None
|
| 231 |
self._tokenizer = None
|
| 232 |
|
| 233 |
def _init_model(self):
|
| 234 |
+
if self._model is not None or self.mode != "local":
|
|
|
|
| 235 |
return
|
|
|
|
|
|
|
|
|
|
| 236 |
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 237 |
+
import torch
|
| 238 |
dtype, device_map, move_to = inference_load_settings()
|
| 239 |
logger.info(f"Loading model {self.model_name}...")
|
| 240 |
self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
|
|
|
|
| 248 |
if move_to is not None:
|
| 249 |
self._model = self._model.to(move_to)
|
| 250 |
self._model.eval()
|
|
|
|
| 251 |
|
| 252 |
# βββ SCORING ββββββββββββββββββββββββββββββββββββββββββββ
|
| 253 |
|
| 254 |
+
def _score_choices_local(self, prompt: str, choices: List[str]) -> List[float]:
|
| 255 |
+
"""Score each choice by log P(choice | prompt)."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
import torch
|
|
|
|
|
|
|
| 257 |
scores = []
|
| 258 |
for choice in choices:
|
| 259 |
full_text = f"{prompt} {choice}"
|
|
|
|
| 261 |
truncation=True, max_length=512)
|
| 262 |
if hasattr(self._model, 'device'):
|
| 263 |
inputs = {k: v.to(self._model.device) for k, v in inputs.items()}
|
|
|
|
| 264 |
with torch.no_grad():
|
| 265 |
outputs = self._model(**inputs)
|
| 266 |
+
logits = outputs.logits
|
|
|
|
|
|
|
| 267 |
prompt_ids = self._tokenizer(prompt, return_tensors="pt",
|
| 268 |
truncation=True, max_length=512)["input_ids"]
|
| 269 |
n_prompt = prompt_ids.shape[1]
|
| 270 |
n_total = inputs["input_ids"].shape[1]
|
|
|
|
|
|
|
| 271 |
log_probs = torch.nn.functional.log_softmax(logits[0], dim=-1)
|
| 272 |
choice_log_prob = 0.0
|
| 273 |
for pos in range(n_prompt, n_total):
|
| 274 |
token_id = inputs["input_ids"][0, pos].item()
|
| 275 |
choice_log_prob += log_probs[pos - 1, token_id].item()
|
|
|
|
|
|
|
| 276 |
n_choice_tokens = max(n_total - n_prompt, 1)
|
| 277 |
scores.append(choice_log_prob / n_choice_tokens)
|
|
|
|
| 278 |
return scores
|
| 279 |
|
| 280 |
+
def _get_tensegrity_scores(self, sample: TaskSample) -> Tuple[List[float], float]:
|
| 281 |
"""
|
| 282 |
+
Run Tensegrity cognitive layer on a sample.
|
| 283 |
+
Returns (posteriors_list, normalized_entropy).
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
"""
|
| 285 |
from tensegrity.broca.controller import CognitiveController
|
| 286 |
|
| 287 |
n = len(sample.choices)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
controller = CognitiveController(
|
| 289 |
n_hypotheses=n,
|
| 290 |
hypothesis_labels=[f"choice_{i}" for i in range(n)],
|
| 291 |
use_llm=False,
|
| 292 |
)
|
|
|
|
|
|
|
|
|
|
| 293 |
for i, hyp in enumerate(controller.belief_state.hypotheses):
|
| 294 |
+
hyp.description = sample.choices[i][:50]
|
| 295 |
|
| 296 |
+
controller.step(sample.prompt)
|
| 297 |
|
| 298 |
+
posteriors = [
|
|
|
|
|
|
|
|
|
|
| 299 |
controller.belief_state.hypotheses[i].probability
|
| 300 |
for i in range(n)
|
| 301 |
]
|
| 302 |
|
| 303 |
+
probs = np.array(posteriors)
|
|
|
|
| 304 |
probs = probs[probs > 0]
|
| 305 |
if len(probs) > 1:
|
| 306 |
entropy = float(-np.sum(probs * np.log(probs + 1e-16)) / np.log(len(probs)))
|
| 307 |
else:
|
| 308 |
entropy = 0.0
|
| 309 |
|
| 310 |
+
return posteriors, entropy
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
|
| 312 |
# βββ EVALUATION βββββββββββββββββββββββββββββββββββββββββ
|
| 313 |
|
| 314 |
def evaluate_sample(self, sample: TaskSample) -> SampleResult:
    """
    Evaluate a single sample with full diagnostics.

    The Tensegrity posteriors are combined with a baseline score per choice
    as ``baseline + self.lam * log(posterior / uniform)``; the argmax of the
    baseline and of the combined ("grafted") scores are compared with
    ``sample.gold``.

    Args:
        sample: Task sample providing ``id``, ``prompt``, ``choices``,
            ``gold`` and ``metadata``.

    Returns:
        A ``SampleResult`` holding both predictions, their correctness, the
        raw score lists, bias diagnostics, the flip classification and the
        wall-clock time spent on this sample.
    """
    # Function-scope import, matching the local-import style used elsewhere
    # in this class; only needed for the stable offline seed below.
    import zlib

    t0 = time.time()
    n = len(sample.choices)
    uniform = 1.0 / n

    # Get Tensegrity scores
    tensegrity_scores, entropy = self._get_tensegrity_scores(sample)

    # Bias diagnostics: largest absolute deviation of any posterior from
    # the uniform distribution.
    bias_magnitude = max(abs(s - uniform) for s in tensegrity_scores)
    # bias_applied = posteriors are meaningfully non-uniform
    bias_applied = bias_magnitude > 0.02  # More than 2% deviation from uniform

    # Get baseline scores
    if self.mode == "local":
        self._init_model()
        baseline_scores = self._score_choices_local(sample.prompt, sample.choices)
    else:
        # Offline: random baseline seeded by sample ID for reproducibility.
        # The builtin hash() of a str is salted per interpreter process
        # (PYTHONHASHSEED), so it would yield a different "baseline" on
        # every run; CRC32 of the ID is stable across runs and machines.
        seed = zlib.crc32(str(sample.id).encode("utf-8")) % 2**31
        rng = np.random.RandomState(seed)
        baseline_scores = rng.randn(n).tolist()

    # Grafted: baseline + lambda * tensegrity.
    # In offline mode the baseline is N(0,1); in local mode it is a
    # log-prob-like score, while the Tensegrity scores are probabilities in
    # [0, 1]. Express each posterior as a log-ratio against the uniform
    # prior so both terms live on a comparable (log) scale; the 1e-9 floor
    # keeps log() finite for zero posteriors.
    tensegrity_logodds = [
        np.log(max(s, 1e-9)) - np.log(uniform)
        for s in tensegrity_scores
    ]

    grafted_scores = [
        b + self.lam * t
        for b, t in zip(baseline_scores, tensegrity_logodds)
    ]

    baseline_pred = int(np.argmax(baseline_scores))
    grafted_pred = int(np.argmax(grafted_scores))

    baseline_correct = (baseline_pred == sample.gold)
    grafted_correct = (grafted_pred == sample.gold)

    # Flip classification
    if baseline_pred == grafted_pred:
        flip_type = "preserved" if baseline_correct else "neutral"
    elif not baseline_correct and grafted_correct:
        flip_type = "good_flip"
    elif baseline_correct and not grafted_correct:
        flip_type = "bad_flip"
    else:
        flip_type = "neutral"  # Both wrong, different wrong answers

    wall_time = time.time() - t0

    return SampleResult(
        sample_id=sample.id,
        task=sample.metadata.get("task", ""),
        gold=sample.gold,
        n_choices=n,
        baseline_pred=baseline_pred,
        grafted_pred=grafted_pred,
        baseline_correct=baseline_correct,
        grafted_correct=grafted_correct,
        baseline_scores=baseline_scores,
        grafted_scores=grafted_scores,
        tensegrity_scores=tensegrity_scores,
        graft_entropy=entropy,
        bias_applied=bias_applied,
        bias_magnitude=bias_magnitude,
        flip_type=flip_type,
        lam=self.lam,
        wall_time=wall_time,
    )
|
| 390 |
|
| 391 |
def evaluate_task(self, task_name: str,
|
| 392 |
max_samples: Optional[int] = None,
|
| 393 |
verbose: bool = False) -> TaskResult:
|
| 394 |
+
"""Evaluate all samples in a task with full flip accounting."""
|
| 395 |
config = TASK_REGISTRY[task_name]
|
| 396 |
samples = load_task_samples(task_name, max_samples)
|
| 397 |
|
| 398 |
if verbose:
|
| 399 |
+
print(f" [{task_name}] Loaded {len(samples)} samples")
|
| 400 |
|
| 401 |
results = []
|
| 402 |
for i, sample in enumerate(samples):
|
|
|
|
| 405 |
if verbose and (i + 1) % 100 == 0:
|
| 406 |
acc_b = sum(1 for x in results if x.baseline_correct) / len(results)
|
| 407 |
acc_g = sum(1 for x in results if x.grafted_correct) / len(results)
|
| 408 |
+
print(f" {i+1}/{len(samples)}: base={acc_b:.1%} graft={acc_g:.1%}")
|
| 409 |
|
| 410 |
n = len(results)
|
| 411 |
if n == 0:
|
| 412 |
return TaskResult(
|
| 413 |
+
task=task_name, domain=config.domain, n_samples=0, lam=self.lam,
|
| 414 |
baseline_accuracy=0, grafted_accuracy=0, delta=0,
|
| 415 |
baseline_correct=0, grafted_correct=0,
|
| 416 |
+
coverage=0, cond_acc_biased=0, cond_acc_unbiased=0,
|
| 417 |
+
mean_bias_magnitude=0, mean_graft_entropy=0,
|
| 418 |
+
flips=FlipAccounting(), mean_wall_time=0,
|
| 419 |
)
|
| 420 |
|
| 421 |
bl_correct = sum(1 for r in results if r.baseline_correct)
|
| 422 |
gr_correct = sum(1 for r in results if r.grafted_correct)
|
|
|
|
|
|
|
| 423 |
|
| 424 |
+
# Flip accounting
|
| 425 |
+
flips = FlipAccounting()
|
| 426 |
+
for r in results:
|
| 427 |
+
if r.flip_type == "good_flip":
|
| 428 |
+
flips.good_flips += 1
|
| 429 |
+
elif r.flip_type == "bad_flip":
|
| 430 |
+
flips.bad_flips += 1
|
| 431 |
+
elif r.flip_type == "preserved":
|
| 432 |
+
flips.preserved += 1
|
| 433 |
+
elif r.flip_type == "neutral":
|
| 434 |
+
flips.neutral += 1
|
| 435 |
+
|
| 436 |
+
# Coverage: fraction where bias was non-trivial
|
| 437 |
+
biased = [r for r in results if r.bias_applied]
|
| 438 |
+
coverage = len(biased) / n
|
| 439 |
+
|
| 440 |
+
# Conditional accuracy
|
| 441 |
+
cond_acc_biased = (sum(1 for r in biased if r.grafted_correct) / len(biased)) if biased else 0.0
|
| 442 |
+
unbiased = [r for r in results if not r.bias_applied]
|
| 443 |
+
cond_acc_unbiased = (sum(1 for r in unbiased if r.grafted_correct) / len(unbiased)) if unbiased else 0.0
|
| 444 |
|
| 445 |
return TaskResult(
|
| 446 |
task=task_name,
|
| 447 |
domain=config.domain,
|
| 448 |
n_samples=n,
|
| 449 |
+
lam=self.lam,
|
| 450 |
+
baseline_accuracy=bl_correct / n,
|
| 451 |
+
grafted_accuracy=gr_correct / n,
|
| 452 |
+
delta=(gr_correct - bl_correct) / n,
|
| 453 |
baseline_correct=bl_correct,
|
| 454 |
grafted_correct=gr_correct,
|
| 455 |
+
coverage=coverage,
|
| 456 |
+
cond_acc_biased=cond_acc_biased,
|
| 457 |
+
cond_acc_unbiased=cond_acc_unbiased,
|
| 458 |
+
mean_bias_magnitude=np.mean([r.bias_magnitude for r in results]),
|
| 459 |
mean_graft_entropy=np.mean([r.graft_entropy for r in results]),
|
| 460 |
+
flips=flips,
|
| 461 |
+
mean_wall_time=np.mean([r.wall_time for r in results]),
|
|
|
|
|
|
|
| 462 |
)
|
| 463 |
|
| 464 |
def run_benchmark(self, tasks: Optional[List[str]] = None,
|
| 465 |
max_samples_per_task: Optional[int] = None,
|
| 466 |
verbose: bool = True) -> BenchmarkResult:
|
| 467 |
+
"""Run the full benchmark across multiple tasks."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 468 |
if tasks is None:
|
| 469 |
tasks = list(TASK_REGISTRY.keys())
|
| 470 |
|
| 471 |
if verbose:
|
| 472 |
print(f"\n{'β' * 60}")
|
| 473 |
print(f" TENSEGRITY BENCHMARK")
|
| 474 |
+
print(f" Model: {self.model_name}")
|
| 475 |
+
print(f" Mode: {self.mode}")
|
| 476 |
+
print(f" Ξ»: {self.lam}")
|
| 477 |
+
print(f" Tasks: {len(tasks)}")
|
| 478 |
cap_str = str(max_samples_per_task) if max_samples_per_task else "all"
|
| 479 |
+
print(f" N/task: {cap_str}")
|
| 480 |
print(f"{'β' * 60}")
|
| 481 |
|
| 482 |
t_start = time.time()
|
|
|
|
| 491 |
task_results.append(tr)
|
| 492 |
if verbose:
|
| 493 |
sign = "+" if tr.delta >= 0 else ""
|
| 494 |
+
gb = tr.flips.good_bad_ratio
|
| 495 |
+
gb_str = f"{gb:.1f}" if gb != float('inf') else "β"
|
| 496 |
+
print(f" base={tr.baseline_accuracy:.1%} graft={tr.grafted_accuracy:.1%} "
|
| 497 |
+
f"Ξ={sign}{tr.delta:.1%} cov={tr.coverage:.0%} "
|
| 498 |
+
f"flips={tr.flips.good_flips}β/{tr.flips.bad_flips}β G/B={gb_str}")
|
| 499 |
except Exception as e:
|
| 500 |
logger.error(f"Task {task_name} failed: {e}")
|
| 501 |
if verbose:
|
| 502 |
print(f" β FAILED: {e}")
|
| 503 |
+
import traceback; traceback.print_exc()
|
| 504 |
|
| 505 |
total_time = time.time() - t_start
|
| 506 |
|
|
|
|
| 507 |
total_bl = sum(t.baseline_correct for t in task_results)
|
| 508 |
total_gr = sum(t.grafted_correct for t in task_results)
|
| 509 |
total_n = sum(t.n_samples for t in task_results)
|
| 510 |
|
| 511 |
+
overall_flips = FlipAccounting()
|
| 512 |
+
for t in task_results:
|
| 513 |
+
overall_flips.good_flips += t.flips.good_flips
|
| 514 |
+
overall_flips.bad_flips += t.flips.bad_flips
|
| 515 |
+
overall_flips.preserved += t.flips.preserved
|
| 516 |
+
overall_flips.neutral += t.flips.neutral
|
| 517 |
|
| 518 |
result = BenchmarkResult(
|
| 519 |
model_name=self.model_name,
|
| 520 |
+
mode=self.mode,
|
| 521 |
+
lam=self.lam,
|
| 522 |
tasks=task_results,
|
| 523 |
+
overall_baseline_accuracy=total_bl / max(total_n, 1),
|
| 524 |
+
overall_grafted_accuracy=total_gr / max(total_n, 1),
|
| 525 |
+
overall_delta=(total_gr - total_bl) / max(total_n, 1),
|
| 526 |
+
overall_flips=overall_flips,
|
| 527 |
total_samples=total_n,
|
| 528 |
total_wall_time=total_time,
|
| 529 |
timestamp=time.strftime("%Y-%m-%d %H:%M:%S"),
|
| 530 |
)
|
| 531 |
|
| 532 |
if verbose:
|
| 533 |
+
print(f"\n{'β' * 75}")
|
| 534 |
print(result.summary_table())
|
| 535 |
+
print(f"\n Ξ»={self.lam} Time={total_time:.1f}s")
|
| 536 |
+
print(f" Total flips: {overall_flips.good_flips}β good, "
|
| 537 |
+
f"{overall_flips.bad_flips}β bad, "
|
| 538 |
+
f"{overall_flips.preserved} preserved, "
|
| 539 |
+
f"{overall_flips.neutral} neutral")
|
| 540 |
+
print(f"{'β' * 75}")
|
| 541 |
|
| 542 |
return result
|
| 543 |
|
| 544 |
+
def sweep_lambda(self, tasks: Optional[List[str]] = None,
                 lambdas: Optional[List[float]] = None,
                 max_samples_per_task: Optional[int] = None,
                 verbose: bool = True) -> List[BenchmarkResult]:
    """
    Sweep λ to find the optimal graft weight.

    ``run_benchmark`` is executed once per value with ``self.lam``
    temporarily set to that value. The runner's original ``self.lam`` is
    restored afterwards, even if a run raises.

    Args:
        tasks: Task names forwarded to ``run_benchmark``.
        lambdas: Values to sweep. Default: [0, 0.1, 0.25, 0.5, 1.0, 2.0]
        max_samples_per_task: Per-task sample cap forwarded to
            ``run_benchmark``.
        verbose: Print one summary line per λ and the best λ at the end.

    Returns:
        One ``BenchmarkResult`` per swept value, in sweep order (empty if
        ``lambdas`` is empty).
    """
    if lambdas is None:
        lambdas = [0.0, 0.1, 0.25, 0.5, 1.0, 2.0]

    if verbose:
        print(f"\n{'β' * 60}")
        print(f" Ξ» SWEEP: {lambdas}")
        print(f"{'β' * 60}")

    results = []
    # The sweep works by overwriting self.lam; remember the configured value
    # so the runner is not left stuck on the last swept λ.
    original_lam = self.lam
    try:
        for lam_val in lambdas:
            self.lam = lam_val
            result = self.run_benchmark(tasks, max_samples_per_task, verbose=False)
            results.append(result)

            if verbose:
                sign = "+" if result.overall_delta >= 0 else ""
                gb = result.overall_flips.good_bad_ratio
                gb_str = f"{gb:.1f}" if gb != float('inf') else "β"
                print(f" Ξ»={lam_val:<5} base={result.overall_baseline_accuracy:.1%} "
                      f"graft={result.overall_grafted_accuracy:.1%} "
                      f"Ξ={sign}{result.overall_delta:.1%} G/B={gb_str} "
                      f"({result.overall_flips.good_flips}β/{result.overall_flips.bad_flips}β)")
    finally:
        self.lam = original_lam

    # Guard on `results`: max() of an empty sequence raises ValueError.
    if verbose and results:
        # Find optimal λ
        best = max(results, key=lambda r: r.overall_delta)
        print(f"\n Best Ξ» = {best.lam} β Ξ = {best.overall_delta:+.1%}")

    return results
|
| 583 |
+
|
| 584 |
def save_results(self, result: BenchmarkResult, path: str):
|
| 585 |
"""Save benchmark results to JSON."""
|
| 586 |
with open(path, "w") as f:
|