feather-a10g-large-runtime / overlay /scripts /experiment_sdr_composition.py
icarus112's picture
Update Feather a10g-large training runtime image
e5cf7c3 verified
"""SDR Composition Analysis v3 — using cached retina.npz."""
import json, os
from pathlib import Path
import numpy as np
# Results go to the "docs" directory that sits next to this script's folder
# (parents[0] is the script's directory, parents[1] its parent).
OUT_DIR = Path(__file__).resolve().parents[1] / "docs"
# Cached retina archive; the SDR matrix is read from its "sdr" entry.
RETINA = Path.home() / ".cache" / "autoresearch" / "retina.npz"

print("[SDR] Loading retina...")
# np.load on an .npz returns an NpzFile that keeps the zip handle open until
# it is closed.  Indexing it reads the array into memory, so the handle can be
# released as soon as "sdr" has been fetched.
with np.load(RETINA) as data:
    sdr = data["sdr"]  # expected (65536, 16384) bool, one row per token
n_tok, n_bits = sdr.shape
# Mean number of set bits per row, truncated to an int for reporting.
n_active = int(sdr.sum(axis=1).mean())
print(f"[SDR] {n_tok} tokens x {n_bits} bits, ~{n_active} active/token ({n_active/n_bits*100:.2f}% density)")
# Sample up to 500 tokens for pairwise Jaccard.  The size is clamped to the
# number of tokens because RandomState.choice(..., replace=False) raises
# ValueError when asked for more items than the population contains.
rng = np.random.RandomState(42)  # fixed seed keeps the sample reproducible
sample_n = min(500, n_tok)
idx = rng.choice(n_tok, sample_n, replace=False)
# One set of active-bit positions per sampled token, in the order of `idx`.
codes = [set(np.where(sdr[i])[0]) for i in idx]

# Pairwise Jaccard |A & B| / |A | B| over every unordered pair of sampled
# tokens, computed with Python set operations.  max(..., 1) avoids a zero
# denominator when both codes are empty.
jaccards = np.array([
    len(codes[i] & codes[j]) / max(len(codes[i] | codes[j]), 1)
    for i in range(sample_n)
    for j in range(i + 1, sample_n)
])
print(
    f"[SDR] Jaccard: mean={jaccards.mean():.4f} median={np.median(jaccards):.4f} "
    f"P95={np.percentile(jaccards, 95):.4f} any_overlap={(jaccards > 0).mean() * 100:.1f}%"
)
# Union generalization: draw 100 random index pairs from the sample, merge the
# two codes of each pair, and record the highest Jaccard score the merged code
# reaches against any other sampled token.  Draws that pick the same index
# twice are dropped, so fewer than 100 pairs may be recorded.
pair_results = []
for _trial in range(100):
    draw = rng.randint(sample_n, size=2)
    first, second = draw[0], draw[1]
    if first != second:
        merged = codes[first] | codes[second]
        # Score the merged code against every sampled token except the two
        # that produced it.
        scores = [
            len(merged & codes[k]) / max(len(merged | codes[k]), 1)
            for k in range(sample_n)
            if k != first and k != second
        ]
        pair_results.append({
            "i": int(idx[first]),
            "j": int(idx[second]),
            "best_union_jaccard": float(max(scores)),
        })

best_scores = [entry["best_union_jaccard"] for entry in pair_results]
mean_best = np.mean(best_scores)
# Share of pairs (in percent) whose best score exceeds the 0.3 threshold.
n_matching = len([score for score in best_scores if score > 0.3])
pct_match = n_matching / len(pair_results) * 100
print(f"[SDR] Union: mean_best={mean_best:.4f} pct_match_third_token={pct_match:.1f}%")
# Intersection sparsity: for 500 random pairs of sampled tokens, how many
# active bits do the two codes share?
# The two indices are drawn without replacement so a token is never paired
# with itself; a self-pair would share all of its active bits and inflate the
# mean and, above all, the max.  Needs at least two sampled tokens.
inters = []
for _ in range(500):
    left, right = rng.choice(sample_n, size=2, replace=False)
    inters.append(len(codes[left] & codes[right]))
print(f"[SDR] Intersection: mean={np.mean(inters):.1f} bits median={np.median(inters):.1f} max={max(inters)}")
# Gather every reported statistic into one dict for serialisation; all values
# are cast to builtin int/float before being handed to json.dumps.
results = {
    "pairwise_jaccard": {
        "mean": float(jaccards.mean()),
        "median": float(np.median(jaccards)),
        "p95": float(np.percentile(jaccards, 95)),
        "min": float(jaccards.min()),
        "max": float(jaccards.max()),
        "pct_with_any_overlap": float((jaccards > 0).mean() * 100),
    },
    "union_generalization": {
        "n_pairs": len(pair_results),
        "mean_best_union_jaccard": float(mean_best),
        "pct_union_matches_third_token": float(pct_match),
    },
    "intersection": {
        "mean_active_shared": float(np.mean(inters)),
        "median_active_shared": float(np.median(inters)),
        "max_active_shared": int(max(inters)),
    },
    "sparsity": {
        "n_tokens": int(n_tok),
        "sdr_dim": int(n_bits),
        "active_bits": int(n_active),
        "density_pct": float(n_active / n_bits * 100),
    },
}
# write_text does not create missing parent directories, so make sure the
# output directory exists before writing.
OUT_DIR.mkdir(parents=True, exist_ok=True)
(OUT_DIR / "results_sdr_composition.json").write_text(json.dumps(results, indent=2))
print("[SDR] Saved results_sdr_composition.json")