thoughtworks
/

arithmetic-sorl

+"""
+Fourier analysis of SoRL abstract tokens on modular arithmetic.
+Tests Nanda's hypothesis: do abstract tokens encode Fourier components of (a+b) mod p?
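+Two questions, answered by the five analysis steps this script runs:
+  1. Assignment analysis  — for each (a,b) pair, which abstract token does the model assign?
+                            Does the assignment function cluster by (a+b) mod p?
+                            (steps 1, 2 and 4: purity scatter, DFT of mean token ID, (a, b) heatmap)
+  2. Embedding analysis   — do abstract token embeddings organize along sin/cos curves
+                            in Fourier frequency space?
+                            (steps 3 and 5: embedding SVD, DFT of mean assigned-token embedding)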
+Usage:
+    python -m arithmetic.modular.experiments.11_fourier_analysis.run \
+        --model_dir arithmetic/runs/mod_sorl_fourier/final \
+        --out_dir   arithmetic/modular/experiments/11_fourier_analysis/results
+"""
+import sys, json, argparse
+from pathlib import Path
+sys.path.insert(0, str(Path(__file__).resolve().parents[5]))
+import numpy as np
+import torch
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+from matplotlib.colors import Normalize
+from sorl.sorl_wrapper import SorlModelWrapper
+from sorl.sorl_trainer import sorl_search
+from arithmetic.modular.data.modular import (
+    generate_dataset, P, VOCAB_SIZE, PAD, PROMPT_LEN,
+)
+# ---------------------------------------------------------------------------
+# Load model
+# ---------------------------------------------------------------------------
+def load_model(model_dir: str, device: str) -> SorlModelWrapper:
+    model_dir = Path(model_dir)
+    with open(model_dir / "sorl_config.json") as f:
+        cfg = json.load(f)
+    from transformers import Qwen3Config
+    config = Qwen3Config(
+        hidden_size=cfg["n_embd"], num_hidden_layers=cfg["n_layer"],
+        num_attention_heads=cfg["n_head"], num_key_value_heads=cfg["n_head"],
+        intermediate_size=cfg["d_mlp"], vocab_size=VOCAB_SIZE,
+        max_position_embeddings=32,
+    )
+    model = SorlModelWrapper.from_scratch(config, [VOCAB_SIZE, cfg["abs_vocab"]], PAD)
+    model.load_state_dict(torch.load(model_dir / "model_state_dict.pt", map_location="cpu"))
+    return model.to(device).eval()
+# ---------------------------------------------------------------------------
+# Extract abstract token assignments for all (a, b) pairs
+# ---------------------------------------------------------------------------
+@torch.no_grad()
+def get_assignments(model, all_examples, K: int, device: str, batch_size: int = 256):
+    """
+    For every (a, b) pair return the abstract token IDs assigned at each abstract position.
+    Returns:
+        assignments: np.ndarray shape (N, n_abs_positions)  — token IDs
+        sums:        np.ndarray shape (N,)                  — (a+b) mod p
+        pairs:       list of (a, b) tuples
+    """
+    base_v = int(model.vocab_sizes[0].item())
+    all_assignments = []
+    all_sums = []
+    all_pairs = []
+    for start in range(0, len(all_examples), batch_size):
+        batch = all_examples[start:start + batch_size]
+        ids  = torch.tensor([e.tokens for e in batch], dtype=torch.long, device=device)
+        attn = torch.ones_like(ids)
+        pl   = torch.full((ids.shape[0],), PROMPT_LEN, dtype=torch.long, device=device)
+        best_data, _, _, _, _ = sorl_search(
+            model, ids, attn, pl, PAD,
+            n=1, K=K, max_iterations=2,
+            memory_span_abs=512, memory_span_traj=512,
+            temperature=0.0,
+        )
+        # Abstract positions: tokens >= base_v
+        for i, ex in enumerate(batch):
+            seq = best_data[i].cpu().tolist()
+            abs_tokens = [t - base_v for t in seq if t >= base_v]
+            all_assignments.append(abs_tokens)
+            all_sums.append((ex.a + ex.b) % P)
+            all_pairs.append((ex.a, ex.b))
+    max_len = max(len(a) for a in all_assignments)
+    padded = np.array([a + [-1] * (max_len - len(a)) for a in all_assignments])
+    return padded, np.array(all_sums), all_pairs
+# ---------------------------------------------------------------------------
+# Analysis 1: Assignment purity — does each abstract token cluster by sum?
+# ---------------------------------------------------------------------------
+def assignment_purity(assignments, sums, out_dir: Path, abs_vocab: int):
+    n_pos = assignments.shape[1]
+    fig, axes = plt.subplots(1, n_pos, figsize=(5 * n_pos, 4))
+    if n_pos == 1:
+        axes = [axes]
+    results = {}
+    for pos in range(n_pos):
+        col = assignments[:, pos]
+        valid = col >= 0
+        col_v = col[valid]
+        sums_v = sums[valid]
+        # For each token, what distribution over sums does it cover?
+        token_sum_dist = {}
+        for t in range(abs_vocab):
+            mask = col_v == t
+            if mask.sum() == 0:
+                continue
+            token_sum_dist[t] = sums_v[mask]
+        # Plot: x=token id, y=sum, scatter
+        ax = axes[pos]
+        for t, s in token_sum_dist.items():
+            ax.scatter([t] * len(s), s, alpha=0.1, s=2, color="steelblue")
+        ax.set_xlabel("Abstract token ID")
+        ax.set_ylabel("(a+b) mod p")
+        ax.set_title(f"Position {pos}: token vs sum")
+        # Compute mean sum per token (how ordered is it?)
+        means = {t: s.mean() for t, s in token_sum_dist.items()}
+        results[pos] = {"n_used": len(token_sum_dist), "means": means}
+        print(f"  Position {pos}: {len(token_sum_dist)} tokens used")
+    plt.tight_layout()
+    plt.savefig(out_dir / "assignment_scatter.png", dpi=120)
+    plt.close()
+    return results
+# ---------------------------------------------------------------------------
+# Analysis 2: Fourier structure in assignments
+# ---------------------------------------------------------------------------
+def fourier_of_assignments(assignments, sums, out_dir: Path, abs_vocab: int):
+    """
+    For each abstract position, treat the assignment function f(sum) as a
+    discrete signal over Z_p and compute its DFT. Strong peaks at specific
+    frequencies indicate Fourier structure.
+    """
+    n_pos = assignments.shape[1]
+    fig, axes = plt.subplots(1, n_pos, figsize=(5 * n_pos, 4))
+    if n_pos == 1:
+        axes = [axes]
+    for pos in range(n_pos):
+        col = assignments[:, pos]
+        valid = col >= 0
+        # Build signal: for each possible sum value s in 0..p-1,
+        # compute the average abstract token ID assigned
+        signal = np.zeros(P)
+        counts = np.zeros(P)
+        for tok, s in zip(col[valid], sums[valid]):
+            signal[s] += tok
+            counts[s] += 1
+        counts = np.maximum(counts, 1)
+        signal /= counts  # mean token ID per sum value
+        # DFT
+        freqs = np.abs(np.fft.rfft(signal))
+        ax = axes[pos]
+        ax.bar(range(len(freqs)), freqs)
+        ax.set_xlabel("Frequency k")
+        ax.set_ylabel("|DFT|")
+        ax.set_title(f"Position {pos}: DFT of mean-token-id(sum)")
+        top_k = np.argsort(freqs)[::-1][:5]
+        print(f"  Position {pos} top-5 frequencies: {top_k.tolist()} (magnitudes: {freqs[top_k].round(2).tolist()})")
+    plt.tight_layout()
+    plt.savefig(out_dir / "assignment_fourier.png", dpi=120)
+    plt.close()
+# ---------------------------------------------------------------------------
+# Analysis 3: Abstract token embeddings in Fourier space
+# ---------------------------------------------------------------------------
+def embedding_fourier(model, out_dir: Path):
+    """
+    Extract abstract token embedding vectors and check if they organize
+    along sin/cos curves for specific Fourier frequencies.
+    Analogous to Nanda's analysis of token embeddings via DFT.
+    """
+    base_v = int(model.vocab_sizes[0].item())
+    abs_v  = int(model.vocab_sizes[1].item())   # number of abstract token types
+    # Get embedding matrix for abstract tokens (skip placeholder at base_v)
+    embed = model.model.model.embed_tokens.weight  # (total_vocab, d_model)
+    abs_embeds = embed[base_v + 1: base_v + 1 + abs_v].detach().cpu().float().numpy()
+    # shape: (abs_v, d_model)
+    print(f"  Abstract embedding matrix: {abs_embeds.shape}")
+    # SVD to find dominant directions
+    U, S, Vt = np.linalg.svd(abs_embeds, full_matrices=False)
+    print(f"  Top-5 singular values: {S[:5].round(3).tolist()}")
+    # Plot singular values
+    fig, ax = plt.subplots(figsize=(6, 4))
+    ax.bar(range(len(S)), S)
+    ax.set_xlabel("Component")
+    ax.set_ylabel("Singular value")
+    ax.set_title("Abstract embedding SVD")
+    plt.tight_layout()
+    plt.savefig(out_dir / "embedding_svd.png", dpi=120)
+    plt.close()
+    # Plot top-2 components as scatter to see if they form a circle
+    n_actual = abs_embeds.shape[0]
+    if n_actual >= 3:
+        fig, ax = plt.subplots(figsize=(5, 5))
+        ax.scatter(U[:, 0], U[:, 1], c=list(range(n_actual)), cmap="hsv", s=60)
+        for i in range(n_actual):
+            ax.annotate(str(i), (U[i, 0], U[i, 1]), fontsize=7)
+        ax.set_title("Abstract tokens in top-2 SVD directions")
+        ax.set_xlabel("PC1")
+        ax.set_ylabel("PC2")
+        plt.tight_layout()
+        plt.savefig(out_dir / "embedding_pca.png", dpi=120)
+        plt.close()
+# ---------------------------------------------------------------------------
+# Analysis 4: Heatmap of token assignment over (a, b) grid
+# ---------------------------------------------------------------------------
+def assignment_heatmap(assignments, all_pairs, out_dir: Path):
+    n_pos = assignments.shape[1]
+    fig, axes = plt.subplots(1, n_pos, figsize=(5 * n_pos, 4))
+    if n_pos == 1:
+        axes = [axes]
+    for pos in range(n_pos):
+        grid = np.full((P, P), -1, dtype=float)
+        for (a, b), tok in zip(all_pairs, assignments[:, pos]):
+            if tok >= 0:
+                grid[a, b] = tok
+        ax = axes[pos]
+        im = ax.imshow(grid, origin="lower", cmap="tab20", aspect="auto")
+        ax.set_xlabel("b")
+        ax.set_ylabel("a")
+        ax.set_title(f"Position {pos}: token assignment grid")
+        plt.colorbar(im, ax=ax, fraction=0.046)
+    plt.tight_layout()
+    plt.savefig(out_dir / "assignment_heatmap.png", dpi=120)
+    plt.close()
+# ---------------------------------------------------------------------------
+# Analysis 5: Fourier analysis over EMBEDDING SPACE (not token ID)
+# ---------------------------------------------------------------------------
+def embedding_fourier_by_sum(model, assignments, sums, out_dir: Path):
+    """
+    For each abstract position, compute h(a,b) = embedding of assigned token.
+    Then DFT over (a+b) mod p to find dominant Fourier frequencies.
+    This is the correct Nanda-style analysis: not which token was assigned,
+    but what embedding vector the model placed at that position.
+    """
+    base_v = int(model.vocab_sizes[0].item())
+    embed = model.model.model.embed_tokens.weight.detach().cpu().float().numpy()
+    # embed[base_v + 1 + t] = embedding of abstract token t
+    n_pos = assignments.shape[1]
+    d_model = embed.shape[1]
+    fig, axes = plt.subplots(2, n_pos, figsize=(5 * n_pos, 8))
+    for pos in range(n_pos):
+        col = assignments[:, pos]  # (N,) token IDs (0-indexed within abs vocab)
+        valid = col >= 0
+        # Build (p, d_model) matrix: mean embedding per sum value
+        mean_emb = np.zeros((P, d_model))
+        counts = np.zeros(P)
+        for tok, s in zip(col[valid], sums[valid]):
+            emb = embed[base_v + tok]
+            mean_emb[s] += emb
+            counts[s] += 1
+        counts = np.maximum(counts, 1).reshape(-1, 1)
+        mean_emb /= counts  # (p, d_model)
+        # DFT over sum dimension for each embedding dim
+        freq_power = np.abs(np.fft.rfft(mean_emb, axis=0))  # (p//2+1, d_model)
+        total_power_per_freq = freq_power.sum(axis=1)        # (p//2+1,)
+        top_k = np.argsort(total_power_per_freq)[::-1][:10]
+        print(f"  Position {pos} top-10 frequencies (by total embedding power):")
+        print(f"    freqs:  {top_k.tolist()}")
+        print(f"    powers: {total_power_per_freq[top_k].round(1).tolist()}")
+        # Plot: total power per frequency
+        ax = axes[0, pos]
+        ax.bar(range(len(total_power_per_freq)), total_power_per_freq)
+        ax.set_xlabel("Frequency k")
+        ax.set_ylabel("Total |DFT| across dims")
+        ax.set_title(f"Pos {pos}: embedding DFT power")
+        # Zoom in on non-DC frequencies
+        ax2 = axes[1, pos]
+        ax2.bar(range(1, len(total_power_per_freq)), total_power_per_freq[1:])
+        ax2.set_xlabel("Frequency k (DC removed)")
+        ax2.set_ylabel("Total |DFT| across dims")
+        ax2.set_title(f"Pos {pos}: non-DC frequencies")
+        # Save per-dim frequency matrix for later
+        np.save(out_dir / f"freq_power_pos{pos}.npy", freq_power)
+    plt.tight_layout()
+    plt.savefig(out_dir / "embedding_freq_by_sum.png", dpi=120)
+    plt.close()
+    # Also: check if dominant non-DC freq is consistent across positions
+    print("\n  Summary: dominant non-DC frequency per position:")
+    for pos in range(n_pos):
+        freq_power = np.load(out_dir / f"freq_power_pos{pos}.npy")
+        total = freq_power.sum(axis=1)
+        top_nondc = np.argsort(total[1:])[::-1][:3] + 1
+        print(f"    pos {pos}: top-3 non-DC = {top_nondc.tolist()}, "
+              f"ratio to DC = {total[top_nondc[0]]/total[0]:.3f}")
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+def main():
+    p = argparse.ArgumentParser()
+    p.add_argument("--model_dir", default="arithmetic/runs/mod_sorl_fourier/final")
+    p.add_argument("--out_dir",   default="arithmetic/modular/experiments/11_fourier_analysis/results")
+    p.add_argument("--K",         type=int, default=1)
+    p.add_argument("--abs_vocab", type=int, default=30)
+    p.add_argument("--device",    default="cuda:0")
+    p.add_argument("--batch_size",type=int, default=256)
+    args = p.parse_args()
+    out_dir = Path(args.out_dir)
+    out_dir.mkdir(parents=True, exist_ok=True)
+    print("Loading model...")
+    model = load_model(args.model_dir, args.device)
+    print("Generating all (a,b) pairs...")
+    train_ex, test_ex = generate_dataset(p=P, seed=42)
+    all_ex = train_ex + test_ex
+    print(f"  Total examples: {len(all_ex)}")
+    print("Extracting abstract token assignments...")
+    assignments, sums, pairs = get_assignments(
+        model, all_ex, K=args.K, device=args.device, batch_size=args.batch_size
+    )
+    print(f"  Assignment matrix shape: {assignments.shape}")
+    np.save(out_dir / "assignments.npy", assignments)
+    np.save(out_dir / "sums.npy", sums)
+    print("\n--- Analysis 1: Assignment purity ---")
+    assignment_purity(assignments, sums, out_dir, args.abs_vocab)
+    print("\n--- Analysis 2: Fourier structure in assignments ---")
+    fourier_of_assignments(assignments, sums, out_dir, args.abs_vocab)
+    print("\n--- Analysis 3: Embedding Fourier analysis ---")
+    embedding_fourier(model, out_dir)
+    print("\n--- Analysis 4: Assignment heatmap over (a, b) grid ---")
+    assignment_heatmap(assignments, pairs, out_dir)
+    print("\n--- Analysis 5: Fourier analysis of abstract token EMBEDDINGS ---")
+    embedding_fourier_by_sum(model, assignments, sums, out_dir)
+    print(f"\nDone. Results in {out_dir}")
+if __name__ == "__main__":
+    main()