"""SDR Composition Analysis v3 — using cached retina.npz.

Samples token codes from the cached retina and reports three statistics:

* pairwise Jaccard similarity between sampled tokens,
* "union generalization": how closely the union of two codes matches any
  third sampled token,
* how many active bits two distinct sampled tokens share.

The summary is printed and written to ``results_sdr_composition.json``
inside ``OUT_DIR``.
"""

import itertools
import json
import os
from pathlib import Path

import numpy as np

OUT_DIR = Path(__file__).resolve().parents[1] / "docs"
RETINA = Path.home() / ".cache" / "autoresearch" / "retina.npz"

SEED = 42
SAMPLE_N = 500                # tokens sampled for the set-based statistics
N_UNION_PAIRS = 100           # draws for the union-generalization experiment
N_INTERSECTION_PAIRS = 500    # draws for the shared-bit experiment
UNION_MATCH_THRESHOLD = 0.3   # best Jaccard above this counts as a "match"


def _jaccard(a, b):
    """Return |a & b| / |a | b|, defined as 0.0 when both sets are empty."""
    return len(a & b) / max(len(a | b), 1)


def _pairwise_jaccards(codes):
    """Return the Jaccard similarity of every unordered pair of codes.

    This is a plain Python loop over sets (not vectorized); pairs are
    visited in (i, j) order with i < j.
    """
    return np.array([_jaccard(a, b) for a, b in itertools.combinations(codes, 2)])


def _union_generalization(codes, idx, rng, n_draws):
    """Score the union of two sampled codes against every other sampled code.

    A draw that picks the same token twice is dropped rather than redrawn,
    so fewer than ``n_draws`` results may come back; the caller records the
    actual count.
    """
    n = len(codes)
    pair_results = []
    for _ in range(n_draws):
        i, j = rng.randint(n, size=2)
        if i == j:
            continue
        union = codes[i] | codes[j]
        best = max(_jaccard(union, codes[k]) for k in range(n) if k not in (i, j))
        pair_results.append(
            {"i": int(idx[i]), "j": int(idx[j]), "best_union_jaccard": float(best)}
        )
    return pair_results


def _intersection_sizes(codes, rng, n_draws):
    """Return the number of shared active bits for random pairs of codes.

    The two tokens of a pair are drawn without replacement: a token paired
    with itself shares all of its active bits, which would inflate the mean
    and pin the maximum at the per-token active count.
    """
    sizes = []
    for _ in range(n_draws):
        a, b = rng.choice(len(codes), size=2, replace=False)
        sizes.append(len(codes[a] & codes[b]))
    return sizes


def main(retina_path=RETINA, out_dir=OUT_DIR, sample_n=SAMPLE_N, seed=SEED):
    """Run the analysis, write the JSON summary, and return it as a dict.

    Args:
        retina_path: ``.npz`` archive holding a 2-D array under key ``"sdr"``
            (one row per token, one column per bit).
        out_dir: directory for ``results_sdr_composition.json``; created if
            it does not exist.
        sample_n: number of tokens to sample; clamped to the token count.
        seed: seed for the ``RandomState`` driving all sampling.

    Raises:
        ValueError: if the array has no bits or fewer than three tokens can
            be sampled (the union experiment needs a pair plus a third token).
    """
    print("[SDR] Loading retina...")
    # NpzFile holds the archive open; the context manager releases it.
    with np.load(retina_path) as data:
        sdr = data["sdr"]  # original note: (65536, 16384) bool
    n_tok, n_bits = sdr.shape

    sample_n = min(sample_n, n_tok)
    if sample_n < 3 or n_bits == 0:
        raise ValueError(
            f"need at least 3 tokens and 1 bit, got {n_tok} tokens x {n_bits} bits "
            f"(sample_n={sample_n})"
        )

    n_active = int(sdr.sum(axis=1).mean())
    print(f"[SDR] {n_tok} tokens x {n_bits} bits, ~{n_active} active/token "
          f"({n_active/n_bits*100:.2f}% density)")

    # Sample tokens and represent each as the set of its active bit indices.
    rng = np.random.RandomState(seed)
    idx = rng.choice(n_tok, sample_n, replace=False)
    codes = [set(np.flatnonzero(sdr[i]).tolist()) for i in idx]

    jaccards = _pairwise_jaccards(codes)
    print(f"[SDR] Jaccard: mean={jaccards.mean():.4f} median={np.median(jaccards):.4f} "
          f"P95={np.percentile(jaccards,95):.4f} any_overlap={(jaccards>0).mean()*100:.1f}%")

    # Union generalization: does the union of two codes resemble a third token?
    pair_results = _union_generalization(codes, idx, rng, N_UNION_PAIRS)
    best_scores = [p["best_union_jaccard"] for p in pair_results]
    if best_scores:
        mean_best = float(np.mean(best_scores))
        pct_match = (
            sum(1 for s in best_scores if s > UNION_MATCH_THRESHOLD)
            / len(best_scores) * 100
        )
    else:
        # Every draw was degenerate; report zeros instead of dividing by zero.
        mean_best = 0.0
        pct_match = 0.0
    print(f"[SDR] Union: mean_best={mean_best:.4f} pct_match_third_token={pct_match:.1f}%")

    # Intersection sparsity: for random pairs, how many bits do they share?
    inters = _intersection_sizes(codes, rng, N_INTERSECTION_PAIRS)
    print(f"[SDR] Intersection: mean={np.mean(inters):.1f} bits "
          f"median={np.median(inters):.1f} max={max(inters)}")

    results = {
        "pairwise_jaccard": {
            "mean": float(jaccards.mean()),
            "median": float(np.median(jaccards)),
            "p95": float(np.percentile(jaccards, 95)),
            "min": float(jaccards.min()),
            "max": float(jaccards.max()),
            "pct_with_any_overlap": float((jaccards > 0).mean() * 100),
        },
        "union_generalization": {
            "n_pairs": len(pair_results),
            "mean_best_union_jaccard": float(mean_best),
            "pct_union_matches_third_token": float(pct_match),
        },
        "intersection": {
            "mean_active_shared": float(np.mean(inters)),
            "median_active_shared": float(np.median(inters)),
            "max_active_shared": int(max(inters)),
        },
        "sparsity": {
            "n_tokens": int(n_tok),
            "sdr_dim": int(n_bits),
            "active_bits": int(n_active),
            "density_pct": float(n_active / n_bits * 100),
        },
    }

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)  # docs/ may not exist yet
    (out_dir / "results_sdr_composition.json").write_text(json.dumps(results, indent=2))
    print("[SDR] Saved results_sdr_composition.json")
    return results


if __name__ == "__main__":
    main()