#!/usr/bin/env python3
"""
SBERT-Only Ablation Baseline.
This script answers the most important question about Tensegrity:
"Does the cognitive layer add value above SBERT-alone?"
It runs the same benchmark tasks but uses ONLY SBERT cosine similarity
to score choices — no NGC, no causal arena, no Hopfield memory, no
belief updates, no falsification. Just:
score(choice_i) = cosine_sim(sbert(prompt), sbert(prompt + choice_i))
This is the honest baseline the cognitive layer must beat. If the
cognitive layer's Δ over SBERT-alone is positive, the manifold is
doing real work. If it's zero, the manifold is expensive SBERT.
Usage:
python scripts/ablation_sbert_only.py --max-samples 100
python scripts/ablation_sbert_only.py --tasks copa,boolq,sciq
"""
import sys
import os
import time
import json
import argparse
import hashlib
import logging
import numpy as np
logger = logging.getLogger(__name__)
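

# A minimal standalone sketch of the scoring rule described in the docstring,
# kept next to the imports so it can be reused in tests or notebooks. main()
# below keeps its original inline loop; the function name here is illustrative
# and is not part of the benchmark API.
def sbert_choice_scores(sbert, prompt, choices):
    """Return cosine_sim(sbert(prompt), sbert(prompt + choice)) for each choice."""
    texts = [prompt] + [f"{prompt} {c}" for c in choices]
    embs = np.asarray(sbert.encode(texts, show_progress_bar=False))
    prompt_emb = embs[0]
    prompt_norm = np.linalg.norm(prompt_emb)
    scores = np.zeros(len(choices))
    if prompt_norm <= 1e-8:
        return scores
    for i, choice_emb in enumerate(embs[1:]):
        choice_norm = np.linalg.norm(choice_emb)
        if choice_norm > 1e-8:
            scores[i] = float(np.dot(prompt_emb, choice_emb) / (prompt_norm * choice_norm))
    return scores
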
def main():
    parser = argparse.ArgumentParser(description="SBERT-only ablation baseline")
    parser.add_argument("--tasks", default=None, help="Comma-separated task names")
    parser.add_argument("--max-samples", type=int, default=None, help="Max samples per task")
    parser.add_argument("--sbert-model", default="all-MiniLM-L6-v2", help="SBERT model name")
    parser.add_argument("--output", default=None, help="Save JSON results to file")
    parser.add_argument("--seed", type=int, default=42)
    args = parser.parse_args()

    from tensegrity.bench.tasks import TASK_REGISTRY, load_task_samples
    # Load SBERT
    try:
        from sentence_transformers import SentenceTransformer
        sbert = SentenceTransformer(args.sbert_model)
        print(f"Loaded SBERT: {args.sbert_model}")
    except Exception as e:
        print(f"FATAL: Could not load SBERT: {e}")
        sys.exit(1)
    tasks = args.tasks.split(",") if args.tasks else list(TASK_REGISTRY.keys())

    print(f"\n{'█' * 60}")
    print(" SBERT-ONLY ABLATION BASELINE")
    print(f" Model: {args.sbert_model}")
    print(f" Tasks: {len(tasks)}")
    print(f" N/task: {args.max_samples or 'all'}")
    print(f"{'█' * 60}")

    t_start = time.time()
    all_results = []
    total_correct_sbert = 0
    total_correct_random = 0
    total_n = 0
    for task_name in tasks:
        config = TASK_REGISTRY[task_name]
        samples = load_task_samples(task_name, args.max_samples)
        print(f"\n ▸ {task_name}: {config.description} ({len(samples)} samples)")

        task_correct_sbert = 0
        task_correct_random = 0
        task_n = len(samples)

        for sample in samples:
            n = len(sample.choices)
            if n == 0:
                continue
            # SBERT-only scoring: cosine(sbert(prompt), sbert(prompt + choice))
            texts = [sample.prompt] + [f"{sample.prompt} {c}" for c in sample.choices]
            embs = sbert.encode(texts, show_progress_bar=False)
            pe = embs[0]
            pn = np.linalg.norm(pe)
            scores = np.zeros(n)
            if pn > 1e-8:
                for i in range(n):
                    ce = embs[i + 1]
                    cn = np.linalg.norm(ce)
                    if cn > 1e-8:
                        scores[i] = np.dot(pe, ce) / (pn * cn)
            sbert_pred = int(np.argmax(scores))
            if sbert_pred == sample.gold:
                task_correct_sbert += 1

            # Random baseline for comparison. The RNG is seeded from a hash of
            # the sample id, so the "random" prediction is deterministic and
            # reproducible regardless of task order or --max-samples.
            seed_bytes = hashlib.sha256(sample.id.encode("utf-8")).digest()
            sample_seed = int.from_bytes(seed_bytes[:8], "big", signed=False) % (2**31)
            rng = np.random.RandomState(sample_seed)
            random_pred = int(np.argmax(rng.randn(n)))
            if random_pred == sample.gold:
                task_correct_random += 1
        sbert_acc = task_correct_sbert / max(task_n, 1)
        random_acc = task_correct_random / max(task_n, 1)
        # Chance accuracy for the task; falls back to 0.25 when the registry
        # does not declare a positive choice count.
        chance = 1.0 / config.n_choices if config.n_choices > 0 else 0.25

        total_correct_sbert += task_correct_sbert
        total_correct_random += task_correct_random
        total_n += task_n

        result = {
            "task": task_name, "domain": config.domain, "n": task_n,
            "sbert_accuracy": round(sbert_acc, 4),
            "random_accuracy": round(random_acc, 4),
            "chance": round(chance, 4),
            "sbert_over_chance": round(sbert_acc - chance, 4),
        }
        all_results.append(result)
        print(f" SBERT={sbert_acc:.1%} random={random_acc:.1%} "
              f"chance={chance:.1%} SBERT-chance={sbert_acc-chance:+.1%}")
    total_time = time.time() - t_start
    overall_sbert = total_correct_sbert / max(total_n, 1)
    overall_random = total_correct_random / max(total_n, 1)

    print(f"\n{'═' * 75}")
    print(f" SBERT-only overall: {overall_sbert:.1%} (random: {overall_random:.1%})")
    print(f" Total: {total_n} samples, {total_time:.1f}s")
    print(f"{'═' * 75}")

    # Print comparison table
    print(f"\n{'Task':<22} {'N':>5} {'SBERT':>7} {'Random':>7} {'Chance':>7} {'SBERT-Chance':>12}")
    print("─" * 65)
    for r in sorted(all_results, key=lambda x: x["sbert_over_chance"], reverse=True):
        print(f"{r['task']:<22} {r['n']:>5} {r['sbert_accuracy']:>6.1%} "
              f"{r['random_accuracy']:>6.1%} {r['chance']:>6.1%} "
              f"{r['sbert_over_chance']:>+11.1%}")
    print("─" * 65)
    print(f"{'OVERALL':<22} {total_n:>5} {overall_sbert:>6.1%} {overall_random:>6.1%}")
    output = {
        "mode": "sbert_only_ablation",
        "sbert_model": args.sbert_model,
        "overall_sbert_accuracy": round(overall_sbert, 4),
        "overall_random_accuracy": round(overall_random, 4),
        "total_samples": total_n,
        "wall_time_s": round(total_time, 1),
        "tasks": all_results,
    }
    if args.output:
        with open(args.output, "w") as f:
            json.dump(output, f, indent=2)
        print(f"\nResults saved to {args.output}")
    else:
        print(f"\n{json.dumps(output, indent=2)}")
if __name__ == "__main__":
    main()