feat: Persistent causal arena, BoolQ binary task fix, SBERT-only ablation baseline (#3)

- feat: controller.py (46feb31f0aaeaba437832e5cda23e0bd1f9f81cc)
- feat: canonical.py (5681f8c567cbed9915db4a076b8a342f041138f4)
- feat: ablation_sbert_only.py (d2eafd8354f0c1417fd358a686e7421c72602d0c)

Files changed (3) hide show

scripts/ablation_sbert_only.py +166 -0
tensegrity/broca/controller.py +21 -0
tensegrity/pipeline/canonical.py +56 -9

scripts/ablation_sbert_only.py ADDED Viewed

	@@ -0,0 +1,166 @@

+#!/usr/bin/env python3
+"""
+SBERT-Only Ablation Baseline.
+This script answers the most important question about Tensegrity:
+"Does the cognitive layer add value above SBERT-alone?"
+It runs the same benchmark tasks but uses ONLY SBERT cosine similarity
+to score choices — no NGC, no causal arena, no Hopfield memory, no
+belief updates, no falsification. Just:
+    score(choice_i) = cosine_sim(sbert(prompt), sbert(prompt + choice_i))
+This is the honest baseline the cognitive layer must beat. If the
+cognitive layer's Δ over SBERT-alone is positive, the manifold is
+doing real work. If it's zero, the manifold is expensive SBERT.
+Usage:
+    python scripts/ablation_sbert_only.py --max-samples 100
+    python scripts/ablation_sbert_only.py --tasks copa,boolq,sciq
+"""
+import sys
+import os
+import time
+import json
+import argparse
+import hashlib
+import logging
+import numpy as np
+logger = logging.getLogger(__name__)
+def _cosine_scores(prompt_emb, choice_embs):
+    """Score each choice embedding by cosine similarity to the prompt embedding.
+    A choice whose norm is not above 1e-8 keeps a score of 0.0, and if the
+    prompt norm itself is not above 1e-8 every score stays 0.0, so no
+    division by a near-zero norm ever happens.
+    """
+    scores = np.zeros(len(choice_embs))
+    prompt_norm = np.linalg.norm(prompt_emb)
+    if prompt_norm > 1e-8:
+        for i, choice_emb in enumerate(choice_embs):
+            choice_norm = np.linalg.norm(choice_emb)
+            if choice_norm > 1e-8:
+                scores[i] = np.dot(prompt_emb, choice_emb) / (prompt_norm * choice_norm)
+    return scores
+def _random_choice(sample_id, n_choices):
+    """Return a reproducible pseudo-random choice index for one sample.
+    The RNG seed is derived from the SHA-256 digest of the sample id, so the
+    same sample always gets the same random prediction across runs.
+    """
+    seed_bytes = hashlib.sha256(sample_id.encode("utf-8")).digest()
+    sample_seed = int.from_bytes(seed_bytes[:8], "big", signed=False) % (2**31)
+    rng = np.random.RandomState(sample_seed)
+    return int(np.argmax(rng.randn(n_choices)))
+def main():
+    """Run the SBERT-only ablation and report per-task and overall accuracy.
+    For every sample the predicted choice is the argmax of
+    cosine(sbert(prompt), sbert(prompt + " " + choice)). A deterministic
+    per-sample random prediction is scored alongside it for reference.
+    Samples without any choices are skipped and excluded from the accuracy
+    denominators. Results are printed as a table and emitted as JSON
+    (written to --output when given, otherwise printed to stdout).
+    """
+    parser = argparse.ArgumentParser(description="SBERT-only ablation baseline")
+    parser.add_argument("--tasks", default=None, help="Comma-separated task names")
+    parser.add_argument("--max-samples", type=int, default=None, help="Max samples per task")
+    parser.add_argument("--sbert-model", default="all-MiniLM-L6-v2", help="SBERT model name")
+    parser.add_argument("--output", default=None, help="Save JSON results to file")
+    # NOTE(review): --seed is parsed but never read below; the random baseline
+    # is seeded per sample from sample.id only. Kept for CLI compatibility.
+    parser.add_argument("--seed", type=int, default=42)
+    args = parser.parse_args()
+    from tensegrity.bench.tasks import TASK_REGISTRY, load_task_samples
+    # Resolve and validate the task list before the slow model load so that a
+    # typo fails fast with a readable message instead of a KeyError mid-run.
+    if args.tasks:
+        tasks = [name.strip() for name in args.tasks.split(",") if name.strip()]
+    else:
+        tasks = list(TASK_REGISTRY.keys())
+    unknown = [name for name in tasks if name not in TASK_REGISTRY]
+    if unknown:
+        parser.error(
+            f"unknown task(s): {', '.join(unknown)}; "
+            f"available: {', '.join(sorted(TASK_REGISTRY))}"
+        )
+    # Load SBERT
+    try:
+        from sentence_transformers import SentenceTransformer
+        sbert = SentenceTransformer(args.sbert_model)
+        print(f"Loaded SBERT: {args.sbert_model}")
+    except Exception as e:
+        # Top-level boundary: any load failure (missing package, bad model
+        # name, download error) is fatal for this script.
+        print(f"FATAL: Could not load SBERT: {e}", file=sys.stderr)
+        sys.exit(1)
+    print(f"\n{'█' * 60}")
+    print("  SBERT-ONLY ABLATION BASELINE")
+    print(f"  Model: {args.sbert_model}")
+    print(f"  Tasks: {len(tasks)}")
+    print(f"  N/task: {args.max_samples or 'all'}")
+    print(f"{'█' * 60}")
+    t_start = time.time()
+    all_results = []
+    total_correct_sbert = 0
+    total_correct_random = 0
+    total_n = 0
+    for task_name in tasks:
+        config = TASK_REGISTRY[task_name]
+        samples = load_task_samples(task_name, args.max_samples)
+        print(f"\n  ▸ {task_name}: {config.description} ({len(samples)} samples)")
+        task_correct_sbert = 0
+        task_correct_random = 0
+        # Count only samples that are actually scored; choice-less samples
+        # must not inflate the denominator.
+        task_n = 0
+        skipped = 0
+        for sample in samples:
+            n = len(sample.choices)
+            if n == 0:
+                skipped += 1
+                continue
+            task_n += 1
+            # SBERT-only scoring: cosine(prompt, prompt+choice)
+            texts = [sample.prompt] + [f"{sample.prompt} {c}" for c in sample.choices]
+            embs = sbert.encode(texts, show_progress_bar=False)
+            scores = _cosine_scores(embs[0], embs[1:])
+            if int(np.argmax(scores)) == sample.gold:
+                task_correct_sbert += 1
+            # Random baseline for comparison
+            if _random_choice(sample.id, n) == sample.gold:
+                task_correct_random += 1
+        if skipped:
+            print(f"    skipped {skipped} sample(s) with no choices")
+        sbert_acc = task_correct_sbert / max(task_n, 1)
+        random_acc = task_correct_random / max(task_n, 1)
+        chance = 1.0 / config.n_choices if config.n_choices > 0 else 0.25
+        total_correct_sbert += task_correct_sbert
+        total_correct_random += task_correct_random
+        total_n += task_n
+        result = {
+            "task": task_name, "domain": config.domain, "n": task_n,
+            "skipped": skipped,
+            "sbert_accuracy": round(sbert_acc, 4),
+            "random_accuracy": round(random_acc, 4),
+            "chance": round(chance, 4),
+            "sbert_over_chance": round(sbert_acc - chance, 4),
+        }
+        all_results.append(result)
+        print(f"    SBERT={sbert_acc:.1%}  random={random_acc:.1%}  "
+              f"chance={chance:.1%}  SBERT-chance={sbert_acc-chance:+.1%}")
+    total_time = time.time() - t_start
+    overall_sbert = total_correct_sbert / max(total_n, 1)
+    overall_random = total_correct_random / max(total_n, 1)
+    print(f"\n{'═' * 75}")
+    print(f"  SBERT-only overall: {overall_sbert:.1%}  (random: {overall_random:.1%})")
+    print(f"  Total: {total_n} samples, {total_time:.1f}s")
+    print(f"{'═' * 75}")
+    # Print comparison table, best margin over chance first
+    print(f"\n{'Task':<22} {'N':>5} {'SBERT':>7} {'Random':>7} {'Chance':>7} {'SBERT-Chance':>12}")
+    print("─" * 65)
+    for r in sorted(all_results, key=lambda x: x["sbert_over_chance"], reverse=True):
+        print(f"{r['task']:<22} {r['n']:>5} {r['sbert_accuracy']:>6.1%} "
+              f"{r['random_accuracy']:>6.1%} {r['chance']:>6.1%} "
+              f"{r['sbert_over_chance']:>+11.1%}")
+    print("─" * 65)
+    print(f"{'OVERALL':<22} {total_n:>5} {overall_sbert:>6.1%} {overall_random:>6.1%}")
+    output = {
+        "mode": "sbert_only_ablation",
+        "sbert_model": args.sbert_model,
+        "overall_sbert_accuracy": round(overall_sbert, 4),
+        "overall_random_accuracy": round(overall_random, 4),
+        "total_samples": total_n,
+        "wall_time_s": round(total_time, 1),
+        "tasks": all_results,
+    }
+    if args.output:
+        with open(args.output, "w", encoding="utf-8") as f:
+            json.dump(output, f, indent=2)
+        print(f"\nResults saved to {args.output}")
+    else:
+        print(f"\n{json.dumps(output, indent=2)}")
+if __name__ == "__main__":
+    main()

tensegrity/broca/controller.py CHANGED Viewed

@@ -395,6 +395,27 @@ class CognitiveController:
         n = len(self.belief_state.hypotheses) or self.agent.n_states
         features = np.zeros(n)
         # Map entities and relations to hypothesis dimensions using the
         # known hypothesis labels. The LLM parser (or template fallback)
         # extracts entities that may match hypothesis names.

         n = len(self.belief_state.hypotheses) or self.agent.n_states
         features = np.zeros(n)
+        # Detect binary yes/no tasks: exactly two active (non-"_empty_")
+        # hypotheses, at least one of which is labelled "yes", "no", "true",
+        # or "false". For these tasks, the template parser's keyword-based
+        # polarity detection is systematically wrong, because the passages
+        # that accompany such questions almost always contain negation words
+        # ("not", "doesn't") that have nothing to do with the answer.
+        # When we detect a binary yes/no task, we suppress the template
+        # parser's relation-based evidence entirely and let SBERT carry
+        # the signal. This fixes the BoolQ -12% regression.
+        active_labels = [
+            h.description.lower() for h in self.belief_state.hypotheses
+            if not h.description.startswith("_empty_")
+        ]
+        is_binary_yesno = (
+            len(active_labels) == 2
+            and any(l in ("yes", "no", "true", "false") for l in active_labels)
+        )
+        if is_binary_yesno:
+            # For binary yes/no: return zero vector (no template-parser evidence).
+            # SBERT sentence similarity in the canonical pipeline will provide
+            # the actual signal. The template parser does more harm than good here.
+            return features
         # Map entities and relations to hypothesis dimensions using the
         # known hypothesis labels. The LLM parser (or template fallback)
         # extracts entities that may match hypothesis names.

tensegrity/pipeline/canonical.py CHANGED Viewed

@@ -226,6 +226,16 @@ class CanonicalPipeline:
         self._choice_model_names: List[str] = []
         self._last_derived_obs: List[Dict[str, int]] = []
         if self.persistent_state_path:
             self.load_state(self.persistent_state_path)
@@ -286,14 +296,15 @@ class CanonicalPipeline:
         self._scm_topologies = {}
         self._choice_model_names = []
         self._last_derived_obs = []
         for i, label in enumerate(labels[:len(sample.choices)]):
-            scm = self._build_choice_scm(i, label)
             try:
                 self.energy_arena.register(scm)
                 self._choice_model_names.append(scm.name)
-                # Project this SCM's DAG into the NGC layer hierarchy via
-                # TopologyMapper. Horizontal causal edges are resolved through
-                # virtual parents at higher levels (the "elevator shaft" fix).
                 n_ngc_layers = len(self.controller.agent.field.ngc.layer_sizes)
                 topology = self._topology_mapper.from_scm(scm, n_layers=n_ngc_layers)
                 self._scm_topologies[scm.name] = topology
@@ -345,23 +356,46 @@ class CanonicalPipeline:
     # ---------- per-choice SCM (used by EnergyCausalArena) ----------
-    def _build_choice_scm(self, choice_idx: int, label: str) -> StructuralCausalModel:
         """
-        Build a tiny SCM for one choice:
             prompt_feature  ──▶  choice_match  ──▶  observation
                                                 ▲
                                                 │ (lateral) coherence
-        The DAG has both vertical and horizontal edges. The TopologyMapper
-        is exactly what turns the lateral coherence link into a virtual parent
-        in the NGC hierarchy, addressing the topological-mismatch critique.
         """
         scm = StructuralCausalModel(name=f"choice_{choice_idx}_{label}")
         scm.add_variable("prompt_feature", n_values=4, parents=[])
         scm.add_variable("coherence", n_values=4, parents=[])
         scm.add_variable("choice_match", n_values=4, parents=["prompt_feature"])
         scm.add_variable("observation", n_values=4, parents=["choice_match", "coherence"])
         return scm
     # ---------- one-shot ingest (delegates to controller) ----------
@@ -1005,6 +1039,19 @@ class CanonicalPipeline:
                 except Exception as e:
                     logger.debug("feedback SCM update skipped: %s", e)
         try:
             self.controller.agent.experience_replay(n_episodes=3)
         except Exception as e:

         self._choice_model_names: List[str] = []
         self._last_derived_obs: List[Dict[str, int]] = []
+        # --- Persistent causal knowledge ---
+        # Domain-level SCMs persist across items within a task. Instead of
+        # rebuilding every SCM from scratch per item (which gives uniform CPTs
+        # that contribute noise), we maintain a library of domain SCMs keyed
+        # by task domain. When a new item arrives, we look up the existing SCM
+        # for that domain and copy its accumulated CPTs into the new
+        # per-choice SCMs. Per-choice ephemeral SCMs are still created, but
+        # the domain SCM provides a prior that shapes the per-choice energy
+        # competition.
+        self._domain_scm_library: Dict[str, StructuralCausalModel] = {}
         if self.persistent_state_path:
             self.load_state(self.persistent_state_path)
         self._scm_topologies = {}
         self._choice_model_names = []
         self._last_derived_obs = []
+        # Determine domain for persistent SCM lookup
+        domain = sample.metadata.get("domain", "general")
         for i, label in enumerate(labels[:len(sample.choices)]):
+            scm = self._build_choice_scm(i, label, domain=domain)
             try:
                 self.energy_arena.register(scm)
                 self._choice_model_names.append(scm.name)
                 n_ngc_layers = len(self.controller.agent.field.ngc.layer_sizes)
                 topology = self._topology_mapper.from_scm(scm, n_layers=n_ngc_layers)
                 self._scm_topologies[scm.name] = topology
     # ---------- per-choice SCM (used by EnergyCausalArena) ----------
+    def _build_choice_scm(self, choice_idx: int, label: str,
+                          domain: str = "general") -> StructuralCausalModel:
         """
+        Build a per-choice SCM, seeded with persistent domain knowledge.
+        The structure is always:
             prompt_feature  ──▶  choice_match  ──▶  observation
                                                 ▲
                                                 │ (lateral) coherence
+        But CPTs are initialized from the domain SCM library if a matching
+        domain model exists. This means the per-choice SCMs start with
+        accumulated experience from prior items in the same domain, not
+        uniform Dirichlet priors. The domain model is the persistent
+        causal knowledge that survives across items.
+        Args:
+            choice_idx: Index of the choice; used in the SCM name
+                ``choice_<choice_idx>_<label>``.
+            label: Label of the choice; used in the SCM name.
+            domain: Domain name; the library is looked up under the key
+                ``domain_<domain>``.
+        Returns:
+            The newly built per-choice SCM with four 4-valued variables.
+        Side effects:
+            If the library has no entry for the domain, a fresh domain SCM
+            with the same four variables is created and stored in
+            ``self._domain_scm_library``; the returned SCM's CPTs are then
+            left exactly as constructed. When an entry exists, a CPT is
+            copied only for variables present in the domain SCM whose CPT
+            shape matches.
         """
         scm = StructuralCausalModel(name=f"choice_{choice_idx}_{label}")
         scm.add_variable("prompt_feature", n_values=4, parents=[])
         scm.add_variable("coherence", n_values=4, parents=[])
         scm.add_variable("choice_match", n_values=4, parents=["prompt_feature"])
         scm.add_variable("observation", n_values=4, parents=["choice_match", "coherence"])
+        # Seed from domain library if available
+        domain_key = f"domain_{domain}"
+        if domain_key in self._domain_scm_library:
+            domain_scm = self._domain_scm_library[domain_key]
+            # Copy accumulated CPTs in place (slice assignment), variable by variable
+            for var_name, mech in scm.mechanisms.items():
+                domain_mech = domain_scm.mechanisms.get(var_name)
+                if domain_mech is not None and mech.cpt.shape == domain_mech.cpt.shape:
+                    mech.cpt[:] = domain_mech.cpt
+        else:
+            # Create a new domain SCM (same variables as above) for future seeding
+            domain_scm = StructuralCausalModel(name=domain_key)
+            domain_scm.add_variable("prompt_feature", n_values=4, parents=[])
+            domain_scm.add_variable("coherence", n_values=4, parents=[])
+            domain_scm.add_variable("choice_match", n_values=4, parents=["prompt_feature"])
+            domain_scm.add_variable("observation", n_values=4, parents=["choice_match", "coherence"])
+            self._domain_scm_library[domain_key] = domain_scm
         return scm
     # ---------- one-shot ingest (delegates to controller) ----------
                 except Exception as e:
                     logger.debug("feedback SCM update skipped: %s", e)
+            # Update the persistent domain SCM with the gold-label observation.
+            # This is what makes the causal arena accumulate experience: the
+            # domain SCM's CPTs evolve with each feedback signal, and future
+            # items in the same domain start with this accumulated knowledge.
+            domain = sample.metadata.get("domain", "general")
+            domain_key = f"domain_{domain}"
+            domain_scm = self._domain_scm_library.get(domain_key)
+            if domain_scm is not None and self._last_derived_obs:
+                try:
+                    domain_scm.update_from_data([self._last_derived_obs[sample.gold]])
+                except Exception as e:
+                    logger.debug("domain SCM update skipped: %s", e)
         try:
             self.controller.agent.experience_replay(n_episodes=3)
         except Exception as e: