feat: Persistent causal arena, BoolQ binary task fix, SBERT-only ablation baseline

#3 by theapemachine - opened
scripts/ablation_sbert_only.py ADDED
@@ -0,0 +1,166 @@
+ #!/usr/bin/env python3
+ """
+ SBERT-Only Ablation Baseline.
+
+ This script answers the most important question about Tensegrity:
+ "Does the cognitive layer add value above SBERT-alone?"
+
+ It runs the same benchmark tasks but uses ONLY SBERT cosine similarity
+ to score choices: no NGC, no causal arena, no Hopfield memory, no
+ belief updates, no falsification. Just:
+
+     score(choice_i) = cosine_sim(sbert(prompt), sbert(prompt + choice_i))
+
+ This is the honest baseline the cognitive layer must beat. If the
+ cognitive layer's Δ over SBERT-alone is positive, the manifold is
+ doing real work. If it's zero, the manifold is expensive SBERT.
+
+ Usage:
+     python scripts/ablation_sbert_only.py --max-samples 100
+     python scripts/ablation_sbert_only.py --tasks copa,boolq,sciq
+ """
+ import sys
+ import os
+ import time
+ import json
+ import argparse
+ import hashlib
+ import logging
+
+ import numpy as np
+
+ logger = logging.getLogger(__name__)
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="SBERT-only ablation baseline")
+     parser.add_argument("--tasks", default=None, help="Comma-separated task names")
+     parser.add_argument("--max-samples", type=int, default=None, help="Max samples per task")
+     parser.add_argument("--sbert-model", default="all-MiniLM-L6-v2", help="SBERT model name")
+     parser.add_argument("--output", default=None, help="Save JSON results to file")
+     parser.add_argument("--seed", type=int, default=42)
+     args = parser.parse_args()
+
+     from tensegrity.bench.tasks import TASK_REGISTRY, load_task_samples
+
+     # Load SBERT
+     try:
+         from sentence_transformers import SentenceTransformer
+         sbert = SentenceTransformer(args.sbert_model)
+         print(f"Loaded SBERT: {args.sbert_model}")
+     except Exception as e:
+         print(f"FATAL: Could not load SBERT: {e}")
+         sys.exit(1)
+
+     tasks = args.tasks.split(",") if args.tasks else list(TASK_REGISTRY.keys())
+
+     print(f"\n{'█' * 60}")
+     print(f" SBERT-ONLY ABLATION BASELINE")
+     print(f" Model: {args.sbert_model}")
+     print(f" Tasks: {len(tasks)}")
+     print(f" N/task: {args.max_samples or 'all'}")
+     print(f"{'█' * 60}")
+
+     t_start = time.time()
+     all_results = []
+     total_correct_sbert = 0
+     total_correct_random = 0
+     total_n = 0
+
+     for task_name in tasks:
+         config = TASK_REGISTRY[task_name]
+         samples = load_task_samples(task_name, args.max_samples)
+         print(f"\n ▸ {task_name}: {config.description} ({len(samples)} samples)")
+
+         task_correct_sbert = 0
+         task_correct_random = 0
+         task_n = len(samples)
+
+         for sample in samples:
+             n = len(sample.choices)
+             if n == 0:
+                 continue
+
+             # SBERT-only scoring: cosine(prompt, prompt+choice)
+             texts = [sample.prompt] + [f"{sample.prompt} {c}" for c in sample.choices]
+             embs = sbert.encode(texts, show_progress_bar=False)
+             pe = embs[0]
+             pn = np.linalg.norm(pe)
+             scores = np.zeros(n)
+             if pn > 1e-8:
+                 for i in range(n):
+                     ce = embs[i + 1]
+                     cn = np.linalg.norm(ce)
+                     if cn > 1e-8:
+                         scores[i] = np.dot(pe, ce) / (pn * cn)
+
+             sbert_pred = int(np.argmax(scores))
+             if sbert_pred == sample.gold:
+                 task_correct_sbert += 1
+
+             # Random baseline for comparison
+             seed_bytes = hashlib.sha256(sample.id.encode("utf-8")).digest()
+             sample_seed = int.from_bytes(seed_bytes[:8], "big", signed=False) % (2**31)
+             rng = np.random.RandomState(sample_seed)
+             random_pred = int(np.argmax(rng.randn(n)))
+             if random_pred == sample.gold:
+                 task_correct_random += 1
+
+         sbert_acc = task_correct_sbert / max(task_n, 1)
+         random_acc = task_correct_random / max(task_n, 1)
+         chance = 1.0 / config.n_choices if config.n_choices > 0 else 0.25
+
+         total_correct_sbert += task_correct_sbert
+         total_correct_random += task_correct_random
+         total_n += task_n
+
+         result = {
+             "task": task_name, "domain": config.domain, "n": task_n,
+             "sbert_accuracy": round(sbert_acc, 4),
+             "random_accuracy": round(random_acc, 4),
+             "chance": round(chance, 4),
+             "sbert_over_chance": round(sbert_acc - chance, 4),
+         }
+         all_results.append(result)
+         print(f"   SBERT={sbert_acc:.1%} random={random_acc:.1%} "
+               f"chance={chance:.1%} SBERT-chance={sbert_acc-chance:+.1%}")
+
+     total_time = time.time() - t_start
+     overall_sbert = total_correct_sbert / max(total_n, 1)
+     overall_random = total_correct_random / max(total_n, 1)
+
+     print(f"\n{'═' * 75}")
+     print(f" SBERT-only overall: {overall_sbert:.1%} (random: {overall_random:.1%})")
+     print(f" Total: {total_n} samples, {total_time:.1f}s")
+     print(f"{'═' * 75}")
+
+     # Print comparison table
+     print(f"\n{'Task':<22} {'N':>5} {'SBERT':>7} {'Random':>7} {'Chance':>7} {'SBERT-Chance':>12}")
+     print("─" * 65)
+     for r in sorted(all_results, key=lambda x: x["sbert_over_chance"], reverse=True):
+         print(f"{r['task']:<22} {r['n']:>5} {r['sbert_accuracy']:>6.1%} "
+               f"{r['random_accuracy']:>6.1%} {r['chance']:>6.1%} "
+               f"{r['sbert_over_chance']:>+11.1%}")
+     print("─" * 65)
+     print(f"{'OVERALL':<22} {total_n:>5} {overall_sbert:>6.1%} {overall_random:>6.1%}")
+
+     output = {
+         "mode": "sbert_only_ablation",
+         "sbert_model": args.sbert_model,
+         "overall_sbert_accuracy": round(overall_sbert, 4),
+         "overall_random_accuracy": round(overall_random, 4),
+         "total_samples": total_n,
+         "wall_time_s": round(total_time, 1),
+         "tasks": all_results,
+     }
+
+     if args.output:
+         with open(args.output, "w") as f:
+             json.dump(output, f, indent=2)
+         print(f"\nResults saved to {args.output}")
+     else:
+         print(f"\n{json.dumps(output, indent=2)}")
+
+
+ if __name__ == "__main__":
+     main()
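
For reviewers who want to sanity-check the baseline without the bench harness, the scoring rule reduces to a few lines. Below is a minimal standalone sketch, assuming sentence-transformers and numpy are installed; the prompt and choices are made-up COPA-style strings, not benchmark data:

# Standalone sketch of the SBERT-only scoring rule (illustrative strings).
import numpy as np
from sentence_transformers import SentenceTransformer

sbert = SentenceTransformer("all-MiniLM-L6-v2")
prompt = "The man broke his toe. What was the cause?"
choices = ["He got a hole in his sock.", "He dropped a hammer on his foot."]

# Same rule as the script: cosine(prompt, prompt + choice) per choice.
texts = [prompt] + [f"{prompt} {c}" for c in choices]
embs = sbert.encode(texts, show_progress_bar=False)
pe = embs[0] / np.linalg.norm(embs[0])
scores = [float(np.dot(pe, e / np.linalg.norm(e))) for e in embs[1:]]
print(scores, "->", int(np.argmax(scores)))  # argmax = the SBERT-preferred choice
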
tensegrity/broca/controller.py CHANGED
@@ -395,6 +395,27 @@ class CognitiveController:
        n = len(self.belief_state.hypotheses) or self.agent.n_states
        features = np.zeros(n)

+         # Detect binary yes/no tasks. For these tasks, the template parser's
+         # keyword-based polarity detection is systematically wrong because
+         # passages paired with yes/no questions almost always contain negation
+         # words ("not", "doesn't") that have nothing to do with the answer.
+         # When we detect a binary yes/no task, we suppress the template
+         # parser's relation-based evidence entirely and let SBERT carry
+         # the signal. This fixes the BoolQ -12% regression.
+         active_labels = [
+             h.description.lower() for h in self.belief_state.hypotheses
+             if not h.description.startswith("_empty_")
+         ]
+         is_binary_yesno = (
+             len(active_labels) == 2
+             and any(l in ("yes", "no", "true", "false") for l in active_labels)
+         )
+         if is_binary_yesno:
+             # For binary yes/no: return zero vector (no template-parser evidence).
+             # SBERT sentence similarity in the canonical pipeline will provide
+             # the actual signal. The template parser does more harm than good here.
+             return features
+
        # Map entities and relations to hypothesis dimensions using the
        # known hypothesis labels. The LLM parser (or template fallback)
        # extracts entities that may match hypothesis names.
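
The gate itself is easy to exercise outside the controller. A minimal sketch, with hypothesis objects mocked via types.SimpleNamespace (only the .description attribute matters for this check; the surrounding CognitiveController state is not needed):

# Standalone sketch of the binary yes/no gate; hypotheses are mocked.
from types import SimpleNamespace

def is_binary_yesno(hypotheses):
    active = [
        h.description.lower() for h in hypotheses
        if not h.description.startswith("_empty_")
    ]
    return len(active) == 2 and any(
        label in ("yes", "no", "true", "false") for label in active
    )

boolq_like = [SimpleNamespace(description="yes"), SimpleNamespace(description="no")]
copa_like = [SimpleNamespace(description="He went to work."),
             SimpleNamespace(description="He stayed home.")]
print(is_binary_yesno(boolq_like))  # True  -> template-parser evidence suppressed
print(is_binary_yesno(copa_like))   # False -> normal evidence mapping runs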