Prisma / scripts /representation_analysis.py

Initial commit

56e82ec 9 days ago

39 kB

	#!/usr/bin/env python3
	"""
	Representation analysis: CKA and Logit Lens for Prisma / Circuit Transformer.

	CKA (Centered Kernel Alignment):
	Measures representational similarity between all layer pairs.
	Produces a heatmap revealing mirror symmetry, phase transitions,
	and cross-model alignment.

	Logit Lens:
	Projects intermediate representations to vocabulary space at every layer.
	Reveals what the model "thinks" at each processing stage -- from raw
	tokens through the semantic bottleneck back to specific predictions.

	Also computes representation drift (cosine similarity between consecutive layers).

	Usage:
	# Full analysis (CKA + logit lens)
	python -m circuits.scripts.representation_analysis \\
	--checkpoint path/to/checkpoint.pt \\
	--data hf:HuggingFaceFW/fineweb-edu:sample-10BT:train

	# Cross-model CKA
	python -m circuits.scripts.representation_analysis \\
	--checkpoint path/to/prisma.pt --hf-model gpt2-medium \\
	--data hf:HuggingFaceFW/fineweb-edu:sample-10BT:train

	# CKA only (skip logit lens)
	python -m circuits.scripts.representation_analysis \\
	--checkpoint path/to/checkpoint.pt \\
	--data hf:HuggingFaceFW/fineweb-edu:sample-10BT:train \\
	--no-logit-lens
	"""

	import argparse
	import json
	import sys
	import os
	from pathlib import Path
	from collections import OrderedDict

	import numpy as np
	import torch
	import torch.nn as nn
	import torch.nn.functional as F

	import matplotlib
	matplotlib.use("Agg")
	import matplotlib.pyplot as plt


	# ---------------------------------------------------------------------------
	# Model loading
	# ---------------------------------------------------------------------------

	def load_prisma_model(checkpoint_path: str, device: str = "cpu"):
	"""Load a Prisma/Circuit checkpoint, return (model, config_dict, model_type)."""
	sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
	from circuits.config import CircuitConfig
	from circuits.model import CircuitTransformer
	from circuits.mirrored import MirroredConfig, MirroredTransformer

	ckpt = torch.load(checkpoint_path, map_location="cpu", weights_only=False)
	model_type = ckpt.get("model_type", "standard")
	config_dict = ckpt.get("config", {})

	if model_type == "mirrored":
	if config_dict.get("dual_gate_middle"):
	config_dict.pop("dual_gate_middle")
	config = MirroredConfig.from_dict(config_dict)
	model = MirroredTransformer(config)
	else:
	config = CircuitConfig.from_dict(config_dict)
	model = CircuitTransformer(config)

	state_dict = ckpt["model"]
	if any(k.startswith("_orig_mod.") for k in state_dict):
	state_dict = {k.removeprefix("_orig_mod."): v for k, v in state_dict.items()}
	model.load_state_dict(state_dict, strict=False)
	model.to(device).eval()

	return model, config_dict, model_type


	def load_hf_model(model_name: str, device: str = "cpu"):
	"""Load a HuggingFace causal LM."""
	from transformers import AutoModelForCausalLM
	model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, trust_remote_code=True)
	model.to(device).eval()
	return model


	# ---------------------------------------------------------------------------
	# Data loading
	# ---------------------------------------------------------------------------

	def load_data(data_source: str, tokenizer_name: str, num_samples: int = 32,
	context_length: int = 512, device: str = "cpu"):
	"""Load tokenized data. Returns (input_ids, tokenizer).

	Supports:
	- Memmap .bin files (from circuits training cache)
	- hf:dataset:config:split (streaming from HuggingFace)
	- Plain text files
	"""
	sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
	from circuits.data import get_tokenizer

	tokenizer = get_tokenizer(tokenizer_name)

	# Memmap binary file (already tokenized)
	if data_source.endswith(".bin"):
	import struct
	with open(data_source, 'rb') as f:
	n_chunks, seq_len = struct.unpack('II', f.read(8))
	data = np.memmap(data_source, dtype=np.int32, mode='r',
	offset=8, shape=(n_chunks, seq_len))
	n = min(num_samples, n_chunks)
	# Slice to requested context length
	cl = min(context_length, seq_len)
	input_ids = torch.from_numpy(data[:n, :cl].copy()).long().to(device)
	return input_ids, tokenizer

	# HuggingFace dataset
	if data_source.startswith("hf:"):
	from datasets import load_dataset
	parts = data_source[3:].split(":")
	ds_name = parts[0]
	ds_config = parts[1] if len(parts) > 1 else None
	ds_split = parts[2] if len(parts) > 2 else "train"
	dataset = load_dataset(ds_name, ds_config, split=ds_split, streaming=True)
	all_ids = []
	for item in dataset:
	text = item.get("text", "")
	if len(text) < 100:
	continue
	ids = tokenizer.encode(text)
	if len(ids) >= context_length:
	all_ids.append(ids[:context_length])
	if len(all_ids) >= num_samples:
	break
	if not all_ids:
	return None, tokenizer
	return torch.tensor(all_ids, device=device), tokenizer

	# Plain text file
	with open(data_source) as f:
	texts = [line.strip() for line in f if len(line.strip()) > 100]
	all_ids = []
	for text in texts:
	ids = tokenizer.encode(text)
	if len(ids) >= context_length:
	all_ids.append(ids[:context_length])
	if len(all_ids) >= num_samples:
	break
	if not all_ids:
	return None, tokenizer
	return torch.tensor(all_ids, device=device), tokenizer


	def tokenize_for_hf(texts: list, model_name: str, context_length: int = 512,
	device: str = "cpu"):
	"""Tokenize texts for an HF model. Returns (input_ids, tokenizer)."""
	from transformers import AutoTokenizer
	tokenizer = AutoTokenizer.from_pretrained(model_name,
	use_fast=False,
	trust_remote_code=True)
	if tokenizer.pad_token is None:
	tokenizer.pad_token = tokenizer.eos_token

	all_ids = []
	for text in texts:
	ids = tokenizer.encode(text, max_length=context_length, truncation=True)
	if len(ids) >= context_length:
	all_ids.append(ids[:context_length])
	elif len(ids) > 32:
	all_ids.append(ids + [tokenizer.eos_token_id] * (context_length - len(ids)))

	if not all_ids:
	return None, tokenizer

	return torch.tensor(all_ids, device=device), tokenizer


	# ---------------------------------------------------------------------------
	# Activation collection
	# ---------------------------------------------------------------------------

	def collect_mirrored_activations(model, input_ids, word_positions=None):
	"""Collect activations from MirroredTransformer at every processing stage."""
	activations = OrderedDict()

	with torch.no_grad():
	x = model.embed(input_ids)
	if model.embed_proj is not None:
	if model.embed_g3 is not None:
	g4 = F.silu(model.embed_g4(x))
	g3 = F.silu(model.embed_g3(x) * g4)
	x = model.embed_proj(x) * g3
	else:
	x = F.silu(model.embed_proj(x))
	x = x * model.embed_scale
	activations["embedding"] = x.detach().cpu()

	for i, block in enumerate(model.mirror_blocks):
	x, _ = block(x, word_positions=word_positions)
	activations[f"expand_{i}"] = x.detach().cpu()

	for i, block in enumerate(model.middle_blocks):
	x, _ = block(x, word_positions=word_positions)
	activations[f"middle_{i}"] = x.detach().cpu()

	for i in reversed(range(len(model.mirror_blocks))):
	x, _ = model.mirror_blocks[i](x, word_positions=word_positions)
	compress_idx = len(model.mirror_blocks) - 1 - i
	activations[f"compress_{compress_idx}"] = x.detach().cpu()

	x = model.norm(x)
	activations["final_norm"] = x.detach().cpu()

	return activations


	def collect_standard_activations(model, input_ids, word_positions=None):
	"""Collect activations from standard CircuitTransformer."""
	activations = OrderedDict()

	with torch.no_grad():
	x = model.embed(input_ids)
	if model.embed_proj is not None:
	x = F.silu(model.embed_proj(x))
	x = x * model.embed_scale
	activations["embedding"] = x.detach().cpu()

	for i, layer in enumerate(model.layers):
	x, _ = layer(x, word_positions=word_positions)
	activations[f"layer_{i}"] = x.detach().cpu()

	x = model.norm(x)
	activations["final_norm"] = x.detach().cpu()

	return activations


	def collect_hf_activations(model, input_ids):
	"""Hook-based activation collection for HuggingFace models."""
	activations = OrderedDict()
	hooks = []

	if hasattr(model, 'transformer'):
	# GPT-2 style
	blocks = model.transformer.h
	embed = model.transformer.wte
	final_norm = model.transformer.ln_f
	elif hasattr(model, 'model'):
	# Llama / Mistral style
	blocks = model.model.layers
	embed = model.model.embed_tokens
	final_norm = model.model.norm
	else:
	raise ValueError(f"Unsupported HF model: {type(model)}")

	def make_hook(name):
	def hook_fn(module, input, output):
	out = output[0] if isinstance(output, tuple) else output
	activations[name] = out.detach().cpu()
	return hook_fn

	hooks.append(embed.register_forward_hook(make_hook("embedding")))
	for i, block in enumerate(blocks):
	hooks.append(block.register_forward_hook(make_hook(f"layer_{i}")))
	hooks.append(final_norm.register_forward_hook(make_hook("final_norm")))

	with torch.no_grad():
	model(input_ids)

	for h in hooks:
	h.remove()

	return activations


	def collect_activations(model, model_type, config_dict, input_ids, device):
	"""Dispatch to the right collector based on model type."""
	word_positions = None
	word_rope_dims = config_dict.get("word_rope_dims", 0) if config_dict else 0

	if word_rope_dims > 0 and model_type in ("standard", "mirrored"):
	sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
	from circuits.data import get_tokenizer
	from circuits.layers import build_word_start_table, compute_word_positions
	tokenizer_name = config_dict.get("tokenizer_name", "gpt2")
	# Try to get tokenizer from the model's config
	tokenizer = get_tokenizer(tokenizer_name)
	word_start_table = build_word_start_table(tokenizer, len(tokenizer)).to(device)
	word_positions = compute_word_positions(input_ids, word_start_table)

	if model_type == "mirrored":
	return collect_mirrored_activations(model, input_ids, word_positions)
	elif model_type == "standard":
	return collect_standard_activations(model, input_ids, word_positions)
	else:
	return collect_hf_activations(model, input_ids)


	# ---------------------------------------------------------------------------
	# Linear CKA
	# ---------------------------------------------------------------------------

	def linear_cka(X: torch.Tensor, Y: torch.Tensor) -> float:
	"""Compute linear CKA between two [N, D] representation matrices.

	CKA(X, Y) = \|\|Yc^T Xc\|\|_F^2 / (\|\|Xc^T Xc\|\|_F * \|\|Yc^T Yc\|\|_F)
	"""
	X = X.float()
	Y = Y.float()

	# Center
	X = X - X.mean(0, keepdim=True)
	Y = Y - Y.mean(0, keepdim=True)

	N = X.shape[0]

	if N < min(X.shape[1], Y.shape[1]):
	# Kernel formulation (N < D): K=XX^T, L=YY^T — [N,N] matrices
	K = X @ X.T
	L = Y @ Y.T
	numerator = (K * L).sum()
	denominator = torch.sqrt((K * K).sum() * (L * L).sum())
	else:
	# Feature formulation (D <= N)
	XtY = X.T @ Y
	XtX = X.T @ X
	YtY = Y.T @ Y
	numerator = (XtY * XtY).sum()
	denominator = torch.sqrt((XtX * XtX).sum() * (YtY * YtY).sum())

	if denominator < 1e-10:
	return 0.0

	return (numerator / denominator).item()


	def compute_cka_matrix(activations: dict, subsample: int = 4) -> tuple:
	"""Compute CKA between all layer pairs. Returns (cka_matrix, layer_names)."""
	names = list(activations.keys())
	n_layers = len(names)

	# Flatten and subsample: [B, L, D] -> [N, D]
	flat_acts = {}
	for name, act in activations.items():
	act_sub = act[:, ::subsample, :]
	flat_acts[name] = act_sub.reshape(-1, act_sub.shape[-1])

	cka_matrix = np.zeros((n_layers, n_layers))

	for i in range(n_layers):
	cka_matrix[i, i] = 1.0
	for j in range(i + 1, n_layers):
	cka_val = linear_cka(flat_acts[names[i]], flat_acts[names[j]])
	cka_matrix[i, j] = cka_val
	cka_matrix[j, i] = cka_val
	if (i + 1) % 5 == 0 or i == n_layers - 1:
	print(f" CKA: {i+1}/{n_layers} rows computed")

	return cka_matrix, names


	def compute_cross_model_cka(acts_a: dict, acts_b: dict) -> tuple:
	"""Cross-model CKA using sample-level (avg-pooled) representations."""
	names_a = list(acts_a.keys())
	names_b = list(acts_b.keys())

	def pool(activations):
	return {name: act.mean(dim=1) for name, act in activations.items()}

	pooled_a = pool(acts_a)
	pooled_b = pool(acts_b)

	# Ensure same number of samples
	n_samples = min(
	next(iter(pooled_a.values())).shape[0],
	next(iter(pooled_b.values())).shape[0]
	)

	cka_matrix = np.zeros((len(names_a), len(names_b)))

	for i, na in enumerate(names_a):
	for j, nb in enumerate(names_b):
	cka_matrix[i, j] = linear_cka(pooled_a[na][:n_samples], pooled_b[nb][:n_samples])
	if (i + 1) % 5 == 0 or i == len(names_a) - 1:
	print(f" Cross-CKA: {i+1}/{len(names_a)} rows computed")

	return cka_matrix, names_a, names_b


	# ---------------------------------------------------------------------------
	# Logit Lens
	# ---------------------------------------------------------------------------

	def get_unembed_components(model, model_type):
	"""Extract (norm_module, unembed_weight) for logit lens projection."""
	if model_type in ("standard", "mirrored"):
	return model.norm, model.embed.weight
	elif hasattr(model, 'transformer'):
	return model.transformer.ln_f, model.transformer.wte.weight
	elif hasattr(model, 'model'):
	return model.model.norm, model.model.embed_tokens.weight
	else:
	raise ValueError(f"Unsupported model: {type(model)}")


	def compute_logit_lens(activations: dict, norm: nn.Module, unembed_weight: torch.Tensor,
	labels: torch.Tensor, device: str = "cpu",
	chunk_size: int = 2048) -> OrderedDict:
	"""Compute logit lens statistics at every layer.

	Projects intermediate hidden states through final norm + unembedding.
	Computes entropy, top-1 probability, correct token rank, and
	agreement with the final layer's predictions.

	Args:
	activations: OrderedDict[name] = [B, L, D]
	norm: final layer norm module
	unembed_weight: [V, D] unembedding matrix
	labels: [B, L-1] next-token labels (input_ids[:, 1:])
	device: computation device
	chunk_size: number of positions per batch for projection

	Returns:
	OrderedDict[name] = {entropy, top1_prob, correct_rank, ...}
	"""
	names = list(activations.keys())
	final_name = names[-1] # "final_norm"
	results = OrderedDict()

	unembed = unembed_weight.to(device)
	norm_mod = norm.to(device)
	labels_flat = labels.reshape(-1).to(device)

	def process_layer(name, act, apply_norm=True):
	"""Project one layer's activations and compute all metrics."""
	B, L, D = act.shape
	flat = act[:, :-1, :].reshape(-1, D) # [B*(L-1), D]
	N = flat.shape[0]

	all_entropy = []
	all_top1_prob = []
	all_correct_rank = []
	all_top1_idx = []

	for start in range(0, N, chunk_size):
	end = min(start + chunk_size, N)
	chunk = flat[start:end].to(device)
	chunk_labels = labels_flat[start:end]

	if apply_norm:
	chunk = norm_mod(chunk)

	logits = chunk @ unembed.T # [cs, V]
	log_probs = F.log_softmax(logits, dim=-1)
	probs = log_probs.exp()

	# Entropy
	entropy = -(probs * log_probs).sum(dim=-1)
	all_entropy.append(entropy.cpu())

	# Top-1 probability
	top1_prob = probs.max(dim=-1).values
	all_top1_prob.append(top1_prob.cpu())

	# Correct token rank
	correct_logits = logits.gather(1, chunk_labels.unsqueeze(1))
	rank = (logits > correct_logits).sum(dim=-1) + 1
	all_correct_rank.append(rank.cpu())

	# Top-1 index
	all_top1_idx.append(logits.argmax(dim=-1).cpu())

	entropy_t = torch.cat(all_entropy)
	top1_t = torch.cat(all_top1_prob)
	rank_t = torch.cat(all_correct_rank).float()
	top1_idx = torch.cat(all_top1_idx)

	return {
	"entropy": entropy_t.mean().item(),
	"entropy_std": entropy_t.std().item(),
	"top1_prob": top1_t.mean().item(),
	"correct_rank_mean": rank_t.mean().item(),
	"correct_rank_median": rank_t.median().item(),
	"log_rank_mean": rank_t.log().mean().item(),
	"_top1_idx": top1_idx,
	}

	# Process all layers
	for name in names:
	is_final = (name == final_name)
	act = activations[name]
	stats = process_layer(name, act, apply_norm=not is_final)
	results[name] = stats
	print(f" Logit lens: {name:20s} entropy={stats['entropy']:.2f} "
	f"top1={stats['top1_prob']:.4f} rank={stats['correct_rank_median']:.0f}")

	# Compute agreement with final layer
	final_top1 = results[final_name]["_top1_idx"]
	for name in names:
	layer_top1 = results[name]["_top1_idx"]
	agreement = (layer_top1 == final_top1).float().mean().item()
	results[name]["agreement_with_final"] = agreement

	# Clean up internal tensors
	for name in names:
	del results[name]["_top1_idx"]

	return results


	# ---------------------------------------------------------------------------
	# Representation drift
	# ---------------------------------------------------------------------------

	def compute_drift(activations: dict) -> OrderedDict:
	"""Cosine similarity between consecutive layers' representations."""
	names = list(activations.keys())
	drift = OrderedDict()

	for i in range(1, len(names)):
	prev = activations[names[i - 1]]
	curr = activations[names[i]]

	# Flatten to [N, D]
	prev_flat = prev.reshape(-1, prev.shape[-1])
	curr_flat = curr.reshape(-1, curr.shape[-1])

	# Mean cosine similarity
	cos = F.cosine_similarity(prev_flat, curr_flat, dim=-1)
	drift[names[i]] = {
	"cos_sim_mean": cos.mean().item(),
	"cos_sim_std": cos.std().item(),
	"l2_distance": (curr_flat - prev_flat).norm(dim=-1).mean().item(),
	}

	return drift


	# ---------------------------------------------------------------------------
	# Plotting
	# ---------------------------------------------------------------------------

	def _phase_color(name):
	"""Return color based on layer phase."""
	if "expand" in name:
	return "steelblue"
	elif "middle" in name:
	return "goldenrod"
	elif "compress" in name:
	return "coral"
	elif "embedding" in name:
	return "gray"
	elif "final" in name:
	return "gray"
	else:
	return "mediumpurple"


	def _layer_sort_key(name):
	"""Sort key for processing order."""
	order = {"embedding": -1, "final_norm": 9999}
	if name in order:
	return order[name]
	parts = name.split("_")
	phase = parts[0]
	idx = int(parts[1]) if len(parts) > 1 and parts[1].isdigit() else 0
	phase_offset = {"expand": 0, "middle": 1000, "compress": 2000, "layer": 0}
	return phase_offset.get(phase, 3000) + idx


	def _short_name(name):
	"""Shorten layer name for plot labels."""
	if name == "embedding":
	return "emb"
	if name == "final_norm":
	return "out"
	parts = name.split("_")
	if parts[0] == "expand":
	return f"E{parts[1]}"
	elif parts[0] == "middle":
	return f"M{parts[1]}"
	elif parts[0] == "compress":
	return f"C{parts[1]}"
	elif parts[0] == "layer":
	return f"L{parts[1]}"
	return name[:6]


	def plot_cka_self(cka_matrix: np.ndarray, names: list, output_dir: Path,
	model_label: str):
	"""Plot self-CKA heatmap."""
	n = len(names)
	short = [_short_name(n) for n in names]

	fig, ax = plt.subplots(figsize=(max(10, n * 0.35), max(8, n * 0.3)))
	fig.suptitle(f"{model_label} -- CKA Self-Similarity", fontsize=14)

	im = ax.imshow(cka_matrix, cmap="inferno", vmin=0, vmax=1, aspect="equal")

	# Phase separators
	for i, name in enumerate(names):
	if i > 0:
	prev = names[i - 1].split("_")[0]
	curr = name.split("_")[0]
	if prev != curr:
	ax.axhline(i - 0.5, color="white", linewidth=1.5, alpha=0.8)
	ax.axvline(i - 0.5, color="white", linewidth=1.5, alpha=0.8)

	ax.set_xticks(range(n))
	ax.set_xticklabels(short, rotation=90, fontsize=7)
	ax.set_yticks(range(n))
	ax.set_yticklabels(short, fontsize=7)

	plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label="CKA")
	plt.tight_layout()
	fig.savefig(output_dir / "cka_self.png", dpi=150)
	plt.close(fig)


	def plot_cka_cross(cka_matrix: np.ndarray, names_a: list, names_b: list,
	output_dir: Path, label_a: str, label_b: str):
	"""Plot cross-model CKA heatmap."""
	short_a = [_short_name(n) for n in names_a]
	short_b = [_short_name(n) for n in names_b]

	na, nb = len(names_a), len(names_b)
	fig, ax = plt.subplots(figsize=(max(10, nb * 0.35), max(8, na * 0.3)))
	fig.suptitle(f"Cross-CKA: {label_a} vs {label_b}", fontsize=14)

	im = ax.imshow(cka_matrix, cmap="inferno", vmin=0, vmax=1, aspect="auto")

	ax.set_xticks(range(nb))
	ax.set_xticklabels(short_b, rotation=90, fontsize=7)
	ax.set_xlabel(label_b)
	ax.set_yticks(range(na))
	ax.set_yticklabels(short_a, fontsize=7)
	ax.set_ylabel(label_a)

	plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04, label="CKA")
	plt.tight_layout()
	fig.savefig(output_dir / "cka_cross.png", dpi=150)
	plt.close(fig)


	def plot_logit_lens(lens_results: OrderedDict, output_dir: Path,
	model_label: str):
	"""Plot logit lens summary: entropy, confidence, rank, agreement."""
	names = list(lens_results.keys())
	sorted_names = sorted(names, key=_layer_sort_key)
	short = [_short_name(n) for n in sorted_names]
	colors = [_phase_color(n) for n in sorted_names]
	x = range(len(sorted_names))

	fig, axes = plt.subplots(2, 2, figsize=(16, 10))
	fig.suptitle(f"{model_label} -- Logit Lens", fontsize=14)

	# Entropy
	vals = [lens_results[n]["entropy"] for n in sorted_names]
	axes[0, 0].bar(x, vals, color=colors, alpha=0.85)
	axes[0, 0].set_ylabel("Entropy (nats)")
	axes[0, 0].set_title("Prediction entropy per layer")
	axes[0, 0].set_xticks(x)
	axes[0, 0].set_xticklabels(short, rotation=90, fontsize=7)

	# Top-1 probability
	vals = [lens_results[n]["top1_prob"] for n in sorted_names]
	axes[0, 1].bar(x, vals, color=colors, alpha=0.85)
	axes[0, 1].set_ylabel("Top-1 probability")
	axes[0, 1].set_title("Prediction confidence per layer")
	axes[0, 1].set_xticks(x)
	axes[0, 1].set_xticklabels(short, rotation=90, fontsize=7)

	# Correct rank (log scale)
	vals = [lens_results[n]["correct_rank_median"] for n in sorted_names]
	axes[1, 0].bar(x, vals, color=colors, alpha=0.85)
	axes[1, 0].set_ylabel("Median rank of correct token")
	axes[1, 0].set_yscale("log")
	axes[1, 0].set_title("When does the model find the answer?")
	axes[1, 0].set_xticks(x)
	axes[1, 0].set_xticklabels(short, rotation=90, fontsize=7)

	# Agreement with final layer
	vals = [lens_results[n]["agreement_with_final"] for n in sorted_names]
	axes[1, 1].bar(x, vals, color=colors, alpha=0.85)
	axes[1, 1].set_ylabel("Agreement with final layer")
	axes[1, 1].set_title("Convergence toward final prediction")
	axes[1, 1].set_ylim(0, 1.05)
	axes[1, 1].set_xticks(x)
	axes[1, 1].set_xticklabels(short, rotation=90, fontsize=7)

	plt.tight_layout()
	fig.savefig(output_dir / "logit_lens_summary.png", dpi=150)
	plt.close(fig)


	def plot_logit_lens_trajectory(activations: dict, norm: nn.Module,
	unembed_weight: torch.Tensor, input_ids: torch.Tensor,
	tokenizer, output_dir: Path, model_label: str,
	device: str = "cpu",
	n_positions: int = 6, n_layers: int = 10):
	"""Show top-5 predicted tokens at selected layers for a few positions.

	Picks positions spread across the first sample and shows how the
	model's prediction evolves through the network.
	"""
	names = sorted(activations.keys(), key=_layer_sort_key)

	# Select layers evenly spread across the network
	if len(names) > n_layers:
	indices = np.linspace(0, len(names) - 1, n_layers, dtype=int)
	selected_layers = [names[i] for i in indices]
	else:
	selected_layers = names

	# Select positions from the first sample
	seq_len = input_ids.shape[1]
	pos_indices = np.linspace(10, seq_len - 2, n_positions, dtype=int)

	unembed = unembed_weight.to(device)
	norm_mod = norm.to(device)
	final_name = names[-1]

	fig, axes = plt.subplots(n_positions, 1, figsize=(14, 3 * n_positions))
	if n_positions == 1:
	axes = [axes]
	fig.suptitle(f"{model_label} -- Token prediction trajectory", fontsize=14, y=1.02)

	for pos_idx, pos in enumerate(pos_indices):
	ax = axes[pos_idx]
	actual_token = tokenizer.decode([input_ids[0, pos + 1].item()])
	context = tokenizer.decode(input_ids[0, max(0, pos - 5):pos + 1].tolist())

	layer_labels = []
	top_tokens_per_layer = []

	for name in selected_layers:
	is_final = (name == final_name)
	hidden = activations[name][0, pos:pos + 1, :].to(device) # [1, D]
	if not is_final:
	hidden = norm_mod(hidden)
	logits = (hidden @ unembed.T).squeeze(0) # [V]
	probs = F.softmax(logits, dim=-1)
	top5_vals, top5_idx = probs.topk(5)

	tokens_str = []
	for val, idx in zip(top5_vals, top5_idx):
	tok = tokenizer.decode([idx.item()]).replace("\n", "\\n")
	tokens_str.append(f"{tok}({val:.2f})")

	layer_labels.append(_short_name(name))
	top_tokens_per_layer.append("\n".join(tokens_str))

	# Create a text table
	ax.set_xlim(-0.5, len(layer_labels) - 0.5)
	ax.set_ylim(-0.5, 5.5)
	ax.set_xticks(range(len(layer_labels)))
	ax.set_xticklabels(layer_labels, fontsize=8)
	ax.set_yticks([])

	for li, tokens_str in enumerate(top_tokens_per_layer):
	lines = tokens_str.split("\n")
	for rank, line in enumerate(lines):
	color = "darkgreen" if actual_token.strip() in line else "black"
	fontweight = "bold" if actual_token.strip() in line else "normal"
	ax.text(li, rank, line, ha="center", va="center", fontsize=7,
	color=color, fontweight=fontweight)

	ax.set_title(f'pos {pos}: "...{context}" -> [{actual_token.strip()}]',
	fontsize=9, loc="left")
	ax.invert_yaxis()
	ax.spines["top"].set_visible(False)
	ax.spines["right"].set_visible(False)
	ax.spines["left"].set_visible(False)

	plt.tight_layout()
	fig.savefig(output_dir / "logit_lens_trajectory.png", dpi=150, bbox_inches="tight")
	plt.close(fig)


	def plot_drift(drift: OrderedDict, output_dir: Path, model_label: str):
	"""Plot representation drift between consecutive layers."""
	names = list(drift.keys())
	sorted_names = sorted(names, key=_layer_sort_key)
	short = [_short_name(n) for n in sorted_names]
	colors = [_phase_color(n) for n in sorted_names]
	x = range(len(sorted_names))

	fig, axes = plt.subplots(1, 2, figsize=(14, 5))
	fig.suptitle(f"{model_label} -- Representation drift", fontsize=14)

	# Cosine similarity with previous layer
	vals = [drift[n]["cos_sim_mean"] for n in sorted_names]
	axes[0].bar(x, vals, color=colors, alpha=0.85)
	axes[0].set_ylabel("Cosine similarity with previous layer")
	axes[0].set_title("How much each layer preserves direction")
	axes[0].set_xticks(x)
	axes[0].set_xticklabels(short, rotation=90, fontsize=7)

	# L2 distance
	vals = [drift[n]["l2_distance"] for n in sorted_names]
	axes[1].bar(x, vals, color=colors, alpha=0.85)
	axes[1].set_ylabel("L2 distance from previous layer")
	axes[1].set_title("How much each layer changes magnitude")
	axes[1].set_xticks(x)
	axes[1].set_xticklabels(short, rotation=90, fontsize=7)

	plt.tight_layout()
	fig.savefig(output_dir / "representation_drift.png", dpi=150)
	plt.close(fig)


	# ---------------------------------------------------------------------------
	# Results saving
	# ---------------------------------------------------------------------------

	def save_results(cka_matrix, cka_names, lens_results, drift, cross_cka, output_dir):
	"""Save all numerical results to JSON."""
	out = {}

	if cka_matrix is not None:
	out["cka_self"] = {
	"names": cka_names,
	"matrix": cka_matrix.tolist(),
	}

	if lens_results:
	out["logit_lens"] = {name: data for name, data in lens_results.items()}

	if drift:
	out["drift"] = {name: data for name, data in drift.items()}

	if cross_cka is not None:
	matrix, names_a, names_b = cross_cka
	out["cka_cross"] = {
	"names_a": names_a,
	"names_b": names_b,
	"matrix": matrix.tolist(),
	}

	with open(output_dir / "results.json", "w") as f:
	json.dump(out, f, indent=2, default=str)


	# ---------------------------------------------------------------------------
	# Main
	# ---------------------------------------------------------------------------

	def main():
	parser = argparse.ArgumentParser(
	description="CKA and Logit Lens analysis for Prisma / Circuit Transformer")
	parser.add_argument("--checkpoint", type=str, required=True,
	help="Path to Prisma/Circuit checkpoint")
	parser.add_argument("--checkpoint-b", type=str, default=None,
	help="Second Prisma checkpoint for cross-model CKA")
	parser.add_argument("--hf-model", type=str, default=None,
	help="HuggingFace model for cross-model CKA (e.g. gpt2-medium)")
	parser.add_argument("--data", type=str, required=True,
	help="Data source (hf:dataset:config:split or file path)")
	parser.add_argument("--num-samples", type=int, default=32,
	help="Number of text samples (default: 32)")
	parser.add_argument("--context-length", type=int, default=512,
	help="Sequence length (default: 512)")
	parser.add_argument("--cka-subsample", type=int, default=4,
	help="Position subsampling for CKA (default: 4)")
	parser.add_argument("--no-logit-lens", action="store_true",
	help="Skip logit lens analysis")
	parser.add_argument("--no-cka", action="store_true",
	help="Skip CKA analysis")
	parser.add_argument("--output-dir", type=str, default=None,
	help="Output directory (default: auto)")
	parser.add_argument("--gpu", type=int, default=0, help="GPU index")
	args = parser.parse_args()

	device = f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu"
	print(f"Device: {device}")

	# Output directory
	if args.output_dir:
	output_dir = Path(args.output_dir)
	else:
	ckpt_name = Path(args.checkpoint).parent.name
	output_dir = Path("circuits/scripts/representation_output") / ckpt_name
	output_dir.mkdir(parents=True, exist_ok=True)
	print(f"Output: {output_dir}")

	# === Load model A ===
	print(f"\nLoading: {args.checkpoint}")
	model_a, config_a, model_type_a = load_prisma_model(args.checkpoint, device)
	label_a = Path(args.checkpoint).parent.name
	n_params = sum(p.numel() for p in model_a.parameters())
	print(f" Type: {model_type_a}, params: {n_params:,}")

	# === Load data ===
	ckpt_data = torch.load(args.checkpoint, map_location="cpu", weights_only=False)
	tokenizer_name = ckpt_data.get("tokenizer_name", config_a.get("tokenizer_name", "gpt2"))
	del ckpt_data

	print(f"\nLoading data ({args.num_samples} samples, ctx={args.context_length})...")
	result = load_data(
	args.data, tokenizer_name, args.num_samples, args.context_length, device
	)
	if result[0] is None:
	print("ERROR: No valid samples loaded.")
	return
	input_ids, tokenizer = result
	print(f" Data shape: {input_ids.shape}")

	# === Collect activations (model A) ===
	print(f"\nCollecting activations ({model_type_a})...")
	acts_a = collect_activations(model_a, model_type_a, config_a, input_ids, device)
	print(f" Collected {len(acts_a)} layers")

	# Free GPU memory
	del model_a
	if device.startswith("cuda"):
	torch.cuda.empty_cache()

	# === CKA (self) ===
	cka_matrix = None
	cka_names = None
	if not args.no_cka:
	print(f"\nComputing self-CKA (subsample={args.cka_subsample})...")
	cka_matrix, cka_names = compute_cka_matrix(acts_a, subsample=args.cka_subsample)
	plot_cka_self(cka_matrix, cka_names, output_dir, label_a)
	print(f" Saved: cka_self.png")

	# === Cross-model CKA ===
	cross_cka = None
	if not args.no_cka and (args.checkpoint_b or args.hf_model):
	if args.checkpoint_b:
	print(f"\nLoading comparison: {args.checkpoint_b}")
	model_b, config_b, model_type_b = load_prisma_model(args.checkpoint_b, device)
	label_b = Path(args.checkpoint_b).parent.name
	acts_b = collect_activations(model_b, model_type_b, config_b, input_ids, device)
	del model_b
	else:
	print(f"\nLoading HF model: {args.hf_model}")
	model_b = load_hf_model(args.hf_model, device)
	label_b = args.hf_model
	# Decode texts from our tokens and re-tokenize for HF model
	print(f" Re-tokenizing for {args.hf_model}...")
	raw_texts = [tokenizer.decode(input_ids[i].tolist()) for i in range(input_ids.shape[0])]
	input_ids_b, _ = tokenize_for_hf(
	raw_texts, args.hf_model, args.context_length, device
	)
	if input_ids_b is not None:
	print(f" HF data shape: {input_ids_b.shape}")
	acts_b = collect_hf_activations(model_b, input_ids_b)
	else:
	acts_b = None
	del model_b

	if device.startswith("cuda"):
	torch.cuda.empty_cache()

	if acts_b:
	print(f"\nComputing cross-model CKA...")
	cross_matrix, cross_names_a, cross_names_b = compute_cross_model_cka(acts_a, acts_b)
	cross_cka = (cross_matrix, cross_names_a, cross_names_b)
	plot_cka_cross(cross_matrix, cross_names_a, cross_names_b,
	output_dir, label_a, label_b)
	print(f" Saved: cka_cross.png")
	del acts_b

	# === Logit lens ===
	lens_results = None
	if not args.no_logit_lens:
	# Reload model for unembedding components (we deleted it for memory)
	print(f"\nReloading model for logit lens...")
	model_a, _, _ = load_prisma_model(args.checkpoint, device)
	norm, unembed_weight = get_unembed_components(model_a, model_type_a)

	labels = input_ids[:, 1:].cpu() # next-token labels

	print(f"Computing logit lens...")
	lens_results = compute_logit_lens(acts_a, norm, unembed_weight, labels, device)
	plot_logit_lens(lens_results, output_dir, label_a)
	print(f" Saved: logit_lens_summary.png")

	# Token trajectory visualization
	print(f" Generating token trajectories...")
	plot_logit_lens_trajectory(
	acts_a, norm, unembed_weight, input_ids.cpu(), tokenizer,
	output_dir, label_a, device
	)
	print(f" Saved: logit_lens_trajectory.png")

	del model_a
	if device.startswith("cuda"):
	torch.cuda.empty_cache()

	# === Representation drift ===
	print(f"\nComputing representation drift...")
	drift = compute_drift(acts_a)
	plot_drift(drift, output_dir, label_a)
	print(f" Saved: representation_drift.png")

	# === Save results ===
	save_results(cka_matrix, cka_names, lens_results, drift, cross_cka, output_dir)
	print(f"\nAll outputs saved to: {output_dir}")
	n_plots = len(list(output_dir.glob("*.png")))
	print(f" Plots: {n_plots} PNG files")
	print(f" Data: results.json")


	if __name__ == "__main__":
	main()