theapemachine committed on
Commit
641ae8e
·
1 Parent(s): bf2a178

feat: Persistent causal arena, BoolQ binary task fix, SBERT-only ablation baseline (#3)

Browse files

- feat: controller.py (46feb31f0aaeaba437832e5cda23e0bd1f9f81cc)
- feat: canonical.py (5681f8c567cbed9915db4a076b8a342f041138f4)
- feat: ablation_sbert_only.py (d2eafd8354f0c1417fd358a686e7421c72602d0c)

scripts/ablation_sbert_only.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ SBERT-Only Ablation Baseline.
4
+
5
+ This script answers the most important question about Tensegrity:
6
+ "Does the cognitive layer add value above SBERT-alone?"
7
+
8
+ It runs the same benchmark tasks but uses ONLY SBERT cosine similarity
9
+ to score choices — no NGC, no causal arena, no Hopfield memory, no
10
+ belief updates, no falsification. Just:
11
+
12
+ score(choice_i) = cosine_sim(sbert(prompt), sbert(prompt + choice_i))
13
+
14
+ This is the honest baseline the cognitive layer must beat. If the
15
+ cognitive layer's Δ over SBERT-alone is positive, the manifold is
16
+ doing real work. If it's zero, the manifold is expensive SBERT.
17
+
18
+ Usage:
19
+ python scripts/ablation_sbert_only.py --max-samples 100
20
+ python scripts/ablation_sbert_only.py --tasks copa,boolq,sciq
21
+ """
22
+ import sys
23
+ import os
24
+ import time
25
+ import json
26
+ import argparse
27
+ import hashlib
28
+ import logging
29
+
30
+ import numpy as np
31
+
32
+ logger = logging.getLogger(__name__)
33
+
34
+
35
def _cosine_scores(prompt_emb, choice_embs):
    """Return the cosine similarity between the prompt and each choice embedding.

    Vectors whose norm is not above 1e-8 are treated as degenerate and score
    0.0 rather than triggering a division by (near) zero.
    """
    scores = np.zeros(len(choice_embs))
    prompt_norm = np.linalg.norm(prompt_emb)
    if not prompt_norm > 1e-8:
        return scores
    for i, choice_emb in enumerate(choice_embs):
        choice_norm = np.linalg.norm(choice_emb)
        if choice_norm > 1e-8:
            scores[i] = np.dot(prompt_emb, choice_emb) / (prompt_norm * choice_norm)
    return scores


def _random_prediction(sample_id, n_choices):
    """Pick a choice index pseudo-randomly, deterministically per sample id."""
    # The seed is derived from a hash of the sample id, so the random baseline
    # is reproducible across runs and independent of sample ordering.
    seed_bytes = hashlib.sha256(sample_id.encode("utf-8")).digest()
    sample_seed = int.from_bytes(seed_bytes[:8], "big", signed=False) % (2**31)
    rng = np.random.RandomState(sample_seed)
    return int(np.argmax(rng.randn(n_choices)))


def main():
    """Run the SBERT-only ablation baseline and report per-task accuracy.

    For every selected task, each sample's choices are scored by the cosine
    similarity between the SBERT embedding of the prompt and the embedding of
    ``prompt + " " + choice``; the arg-max is the prediction. A per-sample
    seeded random prediction is computed alongside for comparison. Results are
    printed as a table and either written to ``--output`` as JSON or dumped
    to stdout.

    Exits with status 1 if the SBERT model cannot be loaded or if an unknown
    task name is requested.
    """
    parser = argparse.ArgumentParser(description="SBERT-only ablation baseline")
    parser.add_argument("--tasks", default=None, help="Comma-separated task names")
    parser.add_argument("--max-samples", type=int, default=None, help="Max samples per task")
    parser.add_argument("--sbert-model", default="all-MiniLM-L6-v2", help="SBERT model name")
    parser.add_argument("--output", default=None, help="Save JSON results to file")
    # NOTE: --seed is accepted but has no effect; the random baseline is
    # seeded from a hash of each sample id (see _random_prediction).
    parser.add_argument("--seed", type=int, default=42,
                        help="Currently unused (random baseline is seeded per sample id)")
    args = parser.parse_args()

    from tensegrity.bench.tasks import TASK_REGISTRY, load_task_samples

    # Resolve and validate task names before the (slow) model load so that a
    # typo fails fast with a readable message instead of a KeyError mid-run.
    if args.tasks:
        tasks = [name.strip() for name in args.tasks.split(",") if name.strip()]
    else:
        tasks = list(TASK_REGISTRY.keys())
    unknown = [name for name in tasks if name not in TASK_REGISTRY]
    if unknown:
        print(f"FATAL: Unknown task(s): {', '.join(unknown)}. "
              f"Available: {', '.join(sorted(TASK_REGISTRY))}")
        sys.exit(1)

    # Load SBERT
    try:
        from sentence_transformers import SentenceTransformer
        sbert = SentenceTransformer(args.sbert_model)
        print(f"Loaded SBERT: {args.sbert_model}")
    except Exception as e:
        print(f"FATAL: Could not load SBERT: {e}")
        sys.exit(1)

    print(f"\n{'█' * 60}")
    print(f" SBERT-ONLY ABLATION BASELINE")
    print(f" Model: {args.sbert_model}")
    print(f" Tasks: {len(tasks)}")
    print(f" N/task: {args.max_samples or 'all'}")
    print(f"{'█' * 60}")

    t_start = time.time()
    all_results = []
    total_correct_sbert = 0
    total_correct_random = 0
    total_n = 0

    for task_name in tasks:
        config = TASK_REGISTRY[task_name]
        samples = load_task_samples(task_name, args.max_samples)
        print(f"\n ▸ {task_name}: {config.description} ({len(samples)} samples)")

        task_correct_sbert = 0
        task_correct_random = 0
        # Only samples that were actually scored count toward the denominator;
        # samples without choices cannot be answered and are tracked separately.
        task_n = 0
        task_skipped = 0

        for sample in samples:
            n = len(sample.choices)
            if n == 0:
                task_skipped += 1
                continue
            task_n += 1

            # SBERT-only scoring: cosine(prompt, prompt+choice)
            texts = [sample.prompt] + [f"{sample.prompt} {c}" for c in sample.choices]
            embs = sbert.encode(texts, show_progress_bar=False)
            scores = _cosine_scores(embs[0], embs[1:])

            if int(np.argmax(scores)) == sample.gold:
                task_correct_sbert += 1

            # Random baseline for comparison
            if _random_prediction(sample.id, n) == sample.gold:
                task_correct_random += 1

        sbert_acc = task_correct_sbert / max(task_n, 1)
        random_acc = task_correct_random / max(task_n, 1)
        chance = 1.0 / config.n_choices if config.n_choices > 0 else 0.25

        total_correct_sbert += task_correct_sbert
        total_correct_random += task_correct_random
        total_n += task_n

        result = {
            "task": task_name, "domain": config.domain, "n": task_n,
            "sbert_accuracy": round(sbert_acc, 4),
            "random_accuracy": round(random_acc, 4),
            "chance": round(chance, 4),
            "sbert_over_chance": round(sbert_acc - chance, 4),
            "skipped": task_skipped,
        }
        all_results.append(result)
        print(f" SBERT={sbert_acc:.1%} random={random_acc:.1%} "
              f"chance={chance:.1%} SBERT-chance={sbert_acc-chance:+.1%}")
        if task_skipped:
            print(f" (skipped {task_skipped} samples with no choices)")

    total_time = time.time() - t_start
    overall_sbert = total_correct_sbert / max(total_n, 1)
    overall_random = total_correct_random / max(total_n, 1)

    print(f"\n{'═' * 75}")
    print(f" SBERT-only overall: {overall_sbert:.1%} (random: {overall_random:.1%})")
    print(f" Total: {total_n} samples, {total_time:.1f}s")
    print(f"{'═' * 75}")

    # Print comparison table, best margin over chance first
    print(f"\n{'Task':<22} {'N':>5} {'SBERT':>7} {'Random':>7} {'Chance':>7} {'SBERT-Chance':>12}")
    print("─" * 65)
    for r in sorted(all_results, key=lambda x: x["sbert_over_chance"], reverse=True):
        print(f"{r['task']:<22} {r['n']:>5} {r['sbert_accuracy']:>6.1%} "
              f"{r['random_accuracy']:>6.1%} {r['chance']:>6.1%} "
              f"{r['sbert_over_chance']:>+11.1%}")
    print("─" * 65)
    print(f"{'OVERALL':<22} {total_n:>5} {overall_sbert:>6.1%} {overall_random:>6.1%}")

    output = {
        "mode": "sbert_only_ablation",
        "sbert_model": args.sbert_model,
        "overall_sbert_accuracy": round(overall_sbert, 4),
        "overall_random_accuracy": round(overall_random, 4),
        "total_samples": total_n,
        "wall_time_s": round(total_time, 1),
        "tasks": all_results,
    }

    if args.output:
        # Explicit encoding keeps the result file identical across platforms.
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(output, f, indent=2)
        print(f"\nResults saved to {args.output}")
    else:
        print(f"\n{json.dumps(output, indent=2)}")
163
+
164
+
165
+ if __name__ == "__main__":
166
+ main()
tensegrity/broca/controller.py CHANGED
@@ -395,6 +395,27 @@ class CognitiveController:
395
  n = len(self.belief_state.hypotheses) or self.agent.n_states
396
  features = np.zeros(n)
397
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
398
  # Map entities and relations to hypothesis dimensions using the
399
  # known hypothesis labels. The LLM parser (or template fallback)
400
  # extracts entities that may match hypothesis names.
 
395
  n = len(self.belief_state.hypotheses) or self.agent.n_states
396
  features = np.zeros(n)
397
 
398
+ # Detect binary yes/no tasks. For these tasks, the template parser's
399
+ # keyword-based polarity detection is systematically wrong because
400
+ # passages about questions almost always contain negation words
401
+ # ("not", "doesn't") that have nothing to do with the answer.
402
+ # When we detect a binary yes/no task, we suppress the template
403
+ # parser's relation-based evidence entirely and let SBERT carry
404
+ # the signal. This fixes the BoolQ -12% regression.
405
+ active_labels = [
406
+ h.description.lower() for h in self.belief_state.hypotheses
407
+ if not h.description.startswith("_empty_")
408
+ ]
409
+ is_binary_yesno = (
410
+ len(active_labels) == 2
411
+ and any(l in ("yes", "no", "true", "false") for l in active_labels)
412
+ )
413
+ if is_binary_yesno:
414
+ # For binary yes/no: return zero vector (no template-parser evidence).
415
+ # SBERT sentence similarity in the canonical pipeline will provide
416
+ # the actual signal. The template parser does more harm than good here.
417
+ return features
418
+
419
  # Map entities and relations to hypothesis dimensions using the
420
  # known hypothesis labels. The LLM parser (or template fallback)
421
  # extracts entities that may match hypothesis names.
tensegrity/pipeline/canonical.py CHANGED
@@ -226,6 +226,16 @@ class CanonicalPipeline:
226
  self._choice_model_names: List[str] = []
227
  self._last_derived_obs: List[Dict[str, int]] = []
228
 
 
 
 
 
 
 
 
 
 
 
229
  if self.persistent_state_path:
230
  self.load_state(self.persistent_state_path)
231
 
@@ -286,14 +296,15 @@ class CanonicalPipeline:
286
  self._scm_topologies = {}
287
  self._choice_model_names = []
288
  self._last_derived_obs = []
 
 
 
 
289
  for i, label in enumerate(labels[:len(sample.choices)]):
290
- scm = self._build_choice_scm(i, label)
291
  try:
292
  self.energy_arena.register(scm)
293
  self._choice_model_names.append(scm.name)
294
- # Project this SCM's DAG into the NGC layer hierarchy via
295
- # TopologyMapper. Horizontal causal edges are resolved through
296
- # virtual parents at higher levels (the "elevator shaft" fix).
297
  n_ngc_layers = len(self.controller.agent.field.ngc.layer_sizes)
298
  topology = self._topology_mapper.from_scm(scm, n_layers=n_ngc_layers)
299
  self._scm_topologies[scm.name] = topology
@@ -345,23 +356,46 @@ class CanonicalPipeline:
345
 
346
  # ---------- per-choice SCM (used by EnergyCausalArena) ----------
347
 
348
- def _build_choice_scm(self, choice_idx: int, label: str) -> StructuralCausalModel:
 
349
  """
350
- Build a tiny SCM for one choice:
351
 
 
352
  prompt_feature ──▶ choice_match ──▶ observation
353
 
354
  │ (lateral) coherence
355
 
356
- The DAG has both vertical and horizontal edges. The TopologyMapper
357
- is exactly what turns the lateral coherence link into a virtual parent
358
- in the NGC hierarchy, addressing the topological-mismatch critique.
 
 
359
  """
360
  scm = StructuralCausalModel(name=f"choice_{choice_idx}_{label}")
361
  scm.add_variable("prompt_feature", n_values=4, parents=[])
362
  scm.add_variable("coherence", n_values=4, parents=[])
363
  scm.add_variable("choice_match", n_values=4, parents=["prompt_feature"])
364
  scm.add_variable("observation", n_values=4, parents=["choice_match", "coherence"])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
365
  return scm
366
 
367
  # ---------- one-shot ingest (delegates to controller) ----------
@@ -1005,6 +1039,19 @@ class CanonicalPipeline:
1005
  except Exception as e:
1006
  logger.debug("feedback SCM update skipped: %s", e)
1007
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1008
  try:
1009
  self.controller.agent.experience_replay(n_episodes=3)
1010
  except Exception as e:
 
226
  self._choice_model_names: List[str] = []
227
  self._last_derived_obs: List[Dict[str, int]] = []
228
 
229
+ # --- Persistent causal knowledge ---
230
+ # Domain-level SCMs persist across items within a task. Instead of
231
+ # rebuilding every SCM from scratch per item (which gives uniform CPTs
232
+ # that contribute noise), we maintain a library of domain SCMs keyed
233
+ # by task domain. When a new item arrives, we look up existing SCMs
234
+ # for that domain and re-register them with accumulated experience.
235
+ # Per-choice ephemeral SCMs are still created, but the domain SCM
236
+ # provides a prior that shapes the per-choice energy competition.
237
+ self._domain_scm_library: Dict[str, StructuralCausalModel] = {}
238
+
239
  if self.persistent_state_path:
240
  self.load_state(self.persistent_state_path)
241
 
 
296
  self._scm_topologies = {}
297
  self._choice_model_names = []
298
  self._last_derived_obs = []
299
+
300
+ # Determine domain for persistent SCM lookup
301
+ domain = sample.metadata.get("domain", "general")
302
+
303
  for i, label in enumerate(labels[:len(sample.choices)]):
304
+ scm = self._build_choice_scm(i, label, domain=domain)
305
  try:
306
  self.energy_arena.register(scm)
307
  self._choice_model_names.append(scm.name)
 
 
 
308
  n_ngc_layers = len(self.controller.agent.field.ngc.layer_sizes)
309
  topology = self._topology_mapper.from_scm(scm, n_layers=n_ngc_layers)
310
  self._scm_topologies[scm.name] = topology
 
356
 
357
  # ---------- per-choice SCM (used by EnergyCausalArena) ----------
358
 
359
def _build_choice_scm(self, choice_idx: int, label: str,
                      domain: str = "general") -> StructuralCausalModel:
    """
    Build the SCM for one answer choice, seeded from the domain library.

    Every SCM built here has the same four variables (4 values each):
    ``prompt_feature`` and ``coherence`` are roots, ``choice_match`` has
    parent ``prompt_feature``, and ``observation`` has parents
    ``choice_match`` and ``coherence``.

    If ``self._domain_scm_library`` already holds a model under
    ``"domain_<domain>"``, each mechanism's CPT is overwritten with the
    domain model's CPT for the same variable whenever the shapes match.
    Otherwise a fresh domain model with the same structure is stored in
    the library for later calls, and the returned SCM keeps its initial
    CPTs.
    """
    # (variable name, parent names) in insertion order; shared by the
    # per-choice SCM and the domain SCM so the two structures stay aligned.
    dag_spec = (
        ("prompt_feature", ()),
        ("coherence", ()),
        ("choice_match", ("prompt_feature",)),
        ("observation", ("choice_match", "coherence")),
    )

    def _fresh_scm(scm_name):
        model = StructuralCausalModel(name=scm_name)
        for var_name, parent_names in dag_spec:
            model.add_variable(var_name, n_values=4, parents=list(parent_names))
        return model

    scm = _fresh_scm(f"choice_{choice_idx}_{label}")

    domain_key = f"domain_{domain}"
    if domain_key not in self._domain_scm_library:
        # First item seen for this domain: register an empty domain model
        # so later calls have something to seed from.
        self._domain_scm_library[domain_key] = _fresh_scm(domain_key)
        return scm

    # Seed the per-choice SCM with the CPTs accumulated in the domain model.
    prior = self._domain_scm_library[domain_key]
    for var_name, mech in scm.mechanisms.items():
        prior_mech = prior.mechanisms.get(var_name)
        if prior_mech is None:
            continue
        if mech.cpt.shape == prior_mech.cpt.shape:
            mech.cpt[:] = prior_mech.cpt
    return scm
400
 
401
  # ---------- one-shot ingest (delegates to controller) ----------
 
1039
  except Exception as e:
1040
  logger.debug("feedback SCM update skipped: %s", e)
1041
 
1042
+ # Update the persistent domain SCM with the gold-label observation.
1043
+ # This is what makes the causal arena accumulate experience: the
1044
+ # domain SCM's CPTs evolve with each feedback signal, and future
1045
+ # items in the same domain start with this accumulated knowledge.
1046
+ domain = sample.metadata.get("domain", "general")
1047
+ domain_key = f"domain_{domain}"
1048
+ domain_scm = self._domain_scm_library.get(domain_key)
1049
+ if domain_scm is not None and self._last_derived_obs:
1050
+ try:
1051
+ domain_scm.update_from_data([self._last_derived_obs[sample.gold]])
1052
+ except Exception as e:
1053
+ logger.debug("domain SCM update skipped: %s", e)
1054
+
1055
  try:
1056
  self.controller.agent.experience_replay(n_episodes=3)
1057
  except Exception as e: