feat: Persistent causal arena, BoolQ binary task fix, SBERT-only ablation baseline

#3 by theapemachine - opened
scripts/ablation_sbert_only.py ADDED
@@ -0,0 +1,166 @@
+ #!/usr/bin/env python3
+ """
+ SBERT-Only Ablation Baseline.
+
+ This script answers the most important question about Tensegrity:
+ "Does the cognitive layer add value above SBERT-alone?"
+
+ It runs the same benchmark tasks but uses ONLY SBERT cosine similarity
+ to score choices: no NGC, no causal arena, no Hopfield memory, no
+ belief updates, no falsification. Just:
+
+     score(choice_i) = cosine_sim(sbert(prompt), sbert(prompt + choice_i))
+
+ This is the honest baseline the cognitive layer must beat. If the
+ cognitive layer's Δ over SBERT-alone is positive, the manifold is
+ doing real work. If it's zero, the manifold is expensive SBERT.
+
+ Usage:
+     python scripts/ablation_sbert_only.py --max-samples 100
+     python scripts/ablation_sbert_only.py --tasks copa,boolq,sciq
+ """
+ import sys
+ import os
+ import time
+ import json
+ import argparse
+ import hashlib
+ import logging
+
+ import numpy as np
+
+ logger = logging.getLogger(__name__)
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="SBERT-only ablation baseline")
+     parser.add_argument("--tasks", default=None, help="Comma-separated task names")
+     parser.add_argument("--max-samples", type=int, default=None, help="Max samples per task")
+     parser.add_argument("--sbert-model", default="all-MiniLM-L6-v2", help="SBERT model name")
+     parser.add_argument("--output", default=None, help="Save JSON results to file")
+     parser.add_argument("--seed", type=int, default=42)
+     args = parser.parse_args()
+
+     from tensegrity.bench.tasks import TASK_REGISTRY, load_task_samples
+
+     # Load SBERT
+     try:
+         from sentence_transformers import SentenceTransformer
+         sbert = SentenceTransformer(args.sbert_model)
+         print(f"Loaded SBERT: {args.sbert_model}")
+     except Exception as e:
+         print(f"FATAL: Could not load SBERT: {e}")
+         sys.exit(1)
+
+     tasks = args.tasks.split(",") if args.tasks else list(TASK_REGISTRY.keys())
+
+     print(f"\n{'█' * 60}")
+     print(f" SBERT-ONLY ABLATION BASELINE")
+     print(f" Model: {args.sbert_model}")
+     print(f" Tasks: {len(tasks)}")
+     print(f" N/task: {args.max_samples or 'all'}")
+     print(f"{'█' * 60}")
+
+     t_start = time.time()
+     all_results = []
+     total_correct_sbert = 0
+     total_correct_random = 0
+     total_n = 0
+
+     for task_name in tasks:
+         config = TASK_REGISTRY[task_name]
+         samples = load_task_samples(task_name, args.max_samples)
+         print(f"\n ▸ {task_name}: {config.description} ({len(samples)} samples)")
+
+         task_correct_sbert = 0
+         task_correct_random = 0
+         task_n = len(samples)
+
+         for sample in samples:
+             n = len(sample.choices)
+             if n == 0:
+                 continue
+
+             # SBERT-only scoring: cosine(prompt, prompt+choice)
+             texts = [sample.prompt] + [f"{sample.prompt} {c}" for c in sample.choices]
+             embs = sbert.encode(texts, show_progress_bar=False)
+             pe = embs[0]
+             pn = np.linalg.norm(pe)
+             scores = np.zeros(n)
+             if pn > 1e-8:
+                 for i in range(n):
+                     ce = embs[i + 1]
+                     cn = np.linalg.norm(ce)
+                     if cn > 1e-8:
+                         scores[i] = np.dot(pe, ce) / (pn * cn)
+
+             sbert_pred = int(np.argmax(scores))
+             if sbert_pred == sample.gold:
+                 task_correct_sbert += 1
+
+             # Random baseline for comparison
+             seed_bytes = hashlib.sha256(sample.id.encode("utf-8")).digest()
+             sample_seed = int.from_bytes(seed_bytes[:8], "big", signed=False) % (2**31)
+             rng = np.random.RandomState(sample_seed)
+             random_pred = int(np.argmax(rng.randn(n)))
+             if random_pred == sample.gold:
+                 task_correct_random += 1
+
+         sbert_acc = task_correct_sbert / max(task_n, 1)
+         random_acc = task_correct_random / max(task_n, 1)
+         chance = 1.0 / config.n_choices if config.n_choices > 0 else 0.25
+
+         total_correct_sbert += task_correct_sbert
+         total_correct_random += task_correct_random
+         total_n += task_n
+
+         result = {
+             "task": task_name, "domain": config.domain, "n": task_n,
+             "sbert_accuracy": round(sbert_acc, 4),
+             "random_accuracy": round(random_acc, 4),
+             "chance": round(chance, 4),
+             "sbert_over_chance": round(sbert_acc - chance, 4),
+         }
+         all_results.append(result)
+         print(f"   SBERT={sbert_acc:.1%} random={random_acc:.1%} "
+               f"chance={chance:.1%} SBERT-chance={sbert_acc-chance:+.1%}")
+
+     total_time = time.time() - t_start
+     overall_sbert = total_correct_sbert / max(total_n, 1)
+     overall_random = total_correct_random / max(total_n, 1)
+
+     print(f"\n{'═' * 75}")
+     print(f" SBERT-only overall: {overall_sbert:.1%} (random: {overall_random:.1%})")
+     print(f" Total: {total_n} samples, {total_time:.1f}s")
+     print(f"{'═' * 75}")
+
+     # Print comparison table
+     print(f"\n{'Task':<22} {'N':>5} {'SBERT':>7} {'Random':>7} {'Chance':>7} {'SBERT-Chance':>12}")
+     print("─" * 65)
+     for r in sorted(all_results, key=lambda x: x["sbert_over_chance"], reverse=True):
+         print(f"{r['task']:<22} {r['n']:>5} {r['sbert_accuracy']:>6.1%} "
+               f"{r['random_accuracy']:>6.1%} {r['chance']:>6.1%} "
+               f"{r['sbert_over_chance']:>+11.1%}")
+     print("─" * 65)
+     print(f"{'OVERALL':<22} {total_n:>5} {overall_sbert:>6.1%} {overall_random:>6.1%}")
+
+     output = {
+         "mode": "sbert_only_ablation",
+         "sbert_model": args.sbert_model,
+         "overall_sbert_accuracy": round(overall_sbert, 4),
+         "overall_random_accuracy": round(overall_random, 4),
+         "total_samples": total_n,
+         "wall_time_s": round(total_time, 1),
+         "tasks": all_results,
+     }
+
+     if args.output:
+         with open(args.output, "w") as f:
+             json.dump(output, f, indent=2)
+         print(f"\nResults saved to {args.output}")
+     else:
+         print(f"\n{json.dumps(output, indent=2)}")
+
+
+ if __name__ == "__main__":
+     main()
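
For reviewers who want to sanity-check the baseline without the bench harness, the scoring rule reduces to a few lines. Below is a minimal standalone sketch, assuming sentence-transformers and numpy are installed; the prompt and choices are made-up COPA-style strings, not benchmark data:

# Standalone sketch of the SBERT-only scoring rule (illustrative strings).
import numpy as np
from sentence_transformers import SentenceTransformer

sbert = SentenceTransformer("all-MiniLM-L6-v2")
prompt = "The man broke his toe. What was the cause?"
choices = ["He got a hole in his sock.", "He dropped a hammer on his foot."]

# Same rule as the script: cosine(prompt, prompt + choice) per choice.
texts = [prompt] + [f"{prompt} {c}" for c in choices]
embs = sbert.encode(texts, show_progress_bar=False)
pe = embs[0] / np.linalg.norm(embs[0])
scores = [float(np.dot(pe, e / np.linalg.norm(e))) for e in embs[1:]]
print(scores, "->", int(np.argmax(scores)))  # argmax = the SBERT-preferred choice
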
tensegrity/broca/controller.py CHANGED
@@ -395,6 +395,27 @@ class CognitiveController:
        n = len(self.belief_state.hypotheses) or self.agent.n_states
        features = np.zeros(n)

+         # Detect binary yes/no tasks. For these tasks, the template parser's
+         # keyword-based polarity detection is systematically wrong because
+         # passages paired with yes/no questions almost always contain negation
+         # words ("not", "doesn't") that have nothing to do with the answer.
+         # When we detect a binary yes/no task, we suppress the template
+         # parser's relation-based evidence entirely and let SBERT carry
+         # the signal. This fixes the BoolQ -12% regression.
+         active_labels = [
+             h.description.lower() for h in self.belief_state.hypotheses
+             if not h.description.startswith("_empty_")
+         ]
+         is_binary_yesno = (
+             len(active_labels) == 2
+             and any(l in ("yes", "no", "true", "false") for l in active_labels)
+         )
+         if is_binary_yesno:
+             # For binary yes/no: return zero vector (no template-parser evidence).
+             # SBERT sentence similarity in the canonical pipeline will provide
+             # the actual signal. The template parser does more harm than good here.
+             return features
+
        # Map entities and relations to hypothesis dimensions using the
        # known hypothesis labels. The LLM parser (or template fallback)
        # extracts entities that may match hypothesis names.
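
The gate itself is easy to exercise outside the controller. A minimal sketch, with hypothesis objects mocked via types.SimpleNamespace (only the .description attribute matters for this check; the surrounding CognitiveController state is not needed):

# Standalone sketch of the binary yes/no gate; hypotheses are mocked.
from types import SimpleNamespace

def is_binary_yesno(hypotheses):
    active = [
        h.description.lower() for h in hypotheses
        if not h.description.startswith("_empty_")
    ]
    return len(active) == 2 and any(
        label in ("yes", "no", "true", "false") for label in active
    )

boolq_like = [SimpleNamespace(description="yes"), SimpleNamespace(description="no")]
copa_like = [SimpleNamespace(description="He went to work."),
             SimpleNamespace(description="He stayed home.")]
print(is_binary_yesno(boolq_like))  # True  -> template-parser evidence suppressed
print(is_binary_yesno(copa_like))   # False -> normal evidence mapping runs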