# reFlow / experiments.py
# reuAC: Add bilingual interpretability demo with 11 experiments (commit bf44358)
"""Experiment functions for the reFlow interpretability demo, adapted for Gradio."""
import torch
import torch.nn.functional as F
import numpy as np
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
# adjustText is an optional dependency used only to de-overlap scatter labels;
# fall back to a no-op lambda so plots still render without it.
try:
    from adjustText import adjust_text
except ImportError:
    adjust_text = lambda texts, **kwargs: None
from model_loader import get_model, get_cached_tensors
REAL_VOCAB = 50257
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _embed(model, ids):
result = model.transformer.wte(ids)
return result[0] if isinstance(result, tuple) else result
def _get_vocab_signals(model):
wte = model.transformer.wte
if hasattr(wte, '_apply_sparsity'):
return wte._apply_sparsity(wte.vocab_to_signals.weight.data)
return wte.vocab_to_signals.weight.data
def _forward_through_layers(model, ids):
    """Embed `ids` and run them through every transformer block (no final
    layer norm), with gradient tracking disabled."""
    with torch.no_grad():
        hidden = _embed(model, ids)
        freqs = model.freqs_cis[:ids.size(1)]
        for layer in model.transformer.h:
            hidden = layer(hidden, freqs)
    return hidden
def _get_logits_from_hidden(model, x_norm):
vocab_matrix = model.transformer.wte.get_dynamic_vocab_matrix()
return F.linear(x_norm, vocab_matrix)
def _gini(arr):
arr = np.sort(np.abs(arr))
n = len(arr)
if n == 0 or np.sum(arr) == 0:
return 0.0
index = np.arange(1, n + 1)
return (2 * np.sum(index * arr) / (n * np.sum(arr))) - (n + 1) / n
# ---------------------------------------------------------------------------
# 1. Semantic Galaxy (PCA)
# ---------------------------------------------------------------------------
# Preset word groups for the Semantic Galaxy experiment; each category maps
# to a checkbox in the Gradio UI and is plotted as one colored cluster.
DEFAULT_CLUSTERS = {
    "Countries": ["China", "France", "Germany", "Japan", "India", "Russia"],
    "Animals": ["cat", "dog", "fish", "bird", "horse", "bear"],
    "Numbers": ["one", "two", "three", "four", "five", "ten"],
    "Colors": ["red", "blue", "green", "black", "white", "yellow"],
    "Emotions": ["happy", "sad", "angry", "love", "fear", "hate"],
}
@torch.inference_mode()
def exp_semantic_galaxy(
    use_countries, use_animals, use_numbers, use_colors, use_emotions, custom_words
):
    """Project word recipe vectors to 2D with PCA and scatter-plot them,
    colored by cluster, with a silhouette score in the title.

    Args:
        use_countries..use_emotions: booleans toggling each default cluster.
        custom_words: optional comma-separated words forming a "Custom" cluster.

    Returns:
        A matplotlib Figure (a placeholder-text figure if < 3 valid words).
    """
    model, enc, device = get_model()
    W_v2s = _get_vocab_signals(model).cpu().numpy()
    # Build clusters from checkboxes (table-driven instead of five `if`s).
    toggles = {
        "Countries": use_countries,
        "Animals": use_animals,
        "Numbers": use_numbers,
        "Colors": use_colors,
        "Emotions": use_emotions,
    }
    clusters = {name: DEFAULT_CLUSTERS[name] for name, on in toggles.items() if on}
    # Optional user-supplied cluster.
    if custom_words and custom_words.strip():
        custom_list = [w.strip() for w in custom_words.split(",") if w.strip()]
        if custom_list:
            clusters["Custom"] = custom_list
    if not clusters:
        clusters = DEFAULT_CLUSTERS
    recipes, labels, words = [], [], []
    for cat, wl in clusters.items():
        for w in wl:
            tids = enc.encode(" " + w)
            # Only use words whose first token is a real (non-padding) token.
            if tids and tids[0] < REAL_VOCAB:
                recipes.append(W_v2s[tids[0]])
                labels.append(cat)
                words.append(w)
    if len(words) < 3:
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.text(0.5, 0.5, "Need at least 3 valid words", ha='center', va='center', fontsize=14)
        ax.axis('off')
        return fig
    recipes_arr = np.array(recipes)
    coords = PCA(n_components=2).fit_transform(recipes_arr)
    cluster_names = list(clusters.keys())
    label_ids = [cluster_names.index(l) for l in labels]
    # silhouette_score requires 2 <= n_labels <= n_samples - 1. The original
    # only checked the lower bound and raised ValueError when every cluster
    # was a singleton (n_labels == n_samples).
    n_label_values = len(set(label_ids))
    if 2 <= n_label_values <= len(words) - 1:
        sil = silhouette_score(recipes_arr, label_ids)
    else:
        sil = 0.0
    fig = plt.figure(figsize=(12, 9))
    color_map = dict(zip(cluster_names, sns.color_palette("Set2", len(clusters))))
    texts = []
    for i, w in enumerate(words):
        plt.scatter(coords[i, 0], coords[i, 1], color=color_map[labels[i]],
                    s=150, alpha=0.7, edgecolors='white', linewidths=0.5)
        texts.append(plt.text(coords[i, 0], coords[i, 1], w, fontsize=11))
    # Only invoke the real adjustText (the ImportError fallback is a lambda).
    if callable(adjust_text) and getattr(adjust_text, '__name__', '') != '<lambda>':
        adjust_text(texts, arrowprops=dict(arrowstyle="-", color='gray'))
    handles = [plt.Line2D([0], [0], marker='o', color='w',
                          markerfacecolor=color_map[l], markersize=12, label=l) for l in clusters]
    plt.legend(handles=handles, title="Clusters", fontsize=10)
    plt.title(f"reFlow Semantic Galaxy (PCA)\nSilhouette Score = {sil:.4f}",
              fontsize=14, fontweight='bold')
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.tight_layout()
    return fig
# ---------------------------------------------------------------------------
# 2. Semantic Algebra
# ---------------------------------------------------------------------------
@torch.inference_mode()
def exp_semantic_algebra(positive_words, negative_words):
    """Vector arithmetic over token recipes: sum(+words) - sum(-words),
    then rank the real vocabulary by cosine similarity to the result.

    Returns a formatted text report of the nearest tokens (input words
    themselves are excluded from the ranking).
    """
    model, enc, device = get_model()
    W_v2s = _get_vocab_signals(model)
    W_valid = W_v2s[:REAL_VOCAB]

    def _parse(csv_text):
        return [t.strip() for t in csv_text.split(",") if t.strip()]

    pos_list = _parse(positive_words)
    neg_list = _parse(negative_words)
    if not pos_list:
        return "Please enter at least one positive word."
    target_vec = torch.zeros(model.config.n_signals, device=device)
    exclude_ids = set()
    # Accumulate signed recipes; remember ids so we don't report the inputs.
    for sign, group in ((1.0, pos_list), (-1.0, neg_list)):
        for w in group:
            tids = enc.encode(" " + w)
            if tids and tids[0] < REAL_VOCAB:
                target_vec = target_vec + sign * W_v2s[tids[0]]
                exclude_ids.add(tids[0])
    sims = F.cosine_similarity(target_vec.unsqueeze(0), W_valid)
    for tid in exclude_ids:
        sims[tid] = -1.0
    top_vals, top_ids = torch.topk(sims, 20)
    expr = " + ".join(pos_list)
    if neg_list:
        expr += " - " + " - ".join(neg_list)
    rows = []
    for val, tok in zip(top_vals, top_ids):
        try:
            decoded = enc.decode([tok.item()]).strip()
            if len(decoded) >= 1:
                rows.append(f"#{len(rows)+1:2d} {decoded:<20s} cos={val.item():.4f}")
        except Exception:
            continue
        if len(rows) >= 15:
            break
    header = f"Expression: {expr}\n{'='*50}\nRank Word Similarity\n{'-'*50}\n"
    return header + "\n".join(rows)
# ---------------------------------------------------------------------------
# 3. Typo Resilience
# ---------------------------------------------------------------------------
@torch.inference_mode()
def exp_typo_resilience(sent_normal, sent_typo, sent_diff):
    """Bar-plot deep-signal cosine similarity of a sentence against a typo'd
    variant (same meaning) and an unrelated sentence."""
    model, enc, device = get_model()
    basis = model.transformer.wte.signal_basis.data

    def deep_signal(text):
        # Final hidden state of the last token, projected onto the signal basis.
        token_ids = torch.tensor(enc.encode(text), device=device).unsqueeze(0)
        hidden = _forward_through_layers(model, token_ids)
        final = model.transformer.ln_f(hidden[0, -1, :])
        return final @ basis.t()

    ref = deep_signal(sent_normal)
    typo = deep_signal(sent_typo)
    diff = deep_signal(sent_diff)
    sim_typo = F.cosine_similarity(ref.unsqueeze(0), typo.unsqueeze(0)).item()
    sim_diff = F.cosine_similarity(ref.unsqueeze(0), diff.unsqueeze(0)).item()
    fig, ax = plt.subplots(figsize=(8, 5))
    categories = ['Self\n(baseline)', 'Normal vs Typo\n(same meaning)', 'Normal vs Different\n(different meaning)']
    values = [1.0, sim_typo, sim_diff]
    palette = ['#2ecc71', '#f39c12', '#e74c3c']
    bars = ax.bar(categories, values, color=palette, alpha=0.8, edgecolor='black', width=0.5)
    for rect, val in zip(bars, values):
        ax.text(rect.get_x() + rect.get_width() / 2, rect.get_height() + 0.01,
                f'{val:.4f}', ha='center', fontsize=11, fontweight='bold')
    ax.set_ylim(0, 1.15)
    ax.set_ylabel("Cosine Similarity")
    ax.set_title("reFlow Typo Resilience - Deep Signal Similarity", fontsize=13, fontweight='bold')
    ax.grid(axis='y', alpha=0.3)
    plt.tight_layout()
    return fig
# ---------------------------------------------------------------------------
# 4. Sparsity Profile
# ---------------------------------------------------------------------------
@torch.inference_mode()
def exp_sparsity_profile(word_to_inspect):
    """Visualize how sparse the vocabulary recipes are.

    A signal counts as "active" for a word when its absolute amplitude
    exceeds mean + std over all amplitudes. Left panel: histogram of
    active-signal counts per word. Right panel: words activating each signal.

    Returns (figure, stats_text); stats_text optionally includes a
    per-word breakdown for `word_to_inspect`.
    """
    model, enc, device = get_model()
    recipes = _get_vocab_signals(model)[:REAL_VOCAB]
    n_signals = recipes.shape[1]
    abs_recipes = recipes.abs()
    threshold = abs_recipes.mean().item() + abs_recipes.std().item()
    active_mask = abs_recipes > threshold
    per_word = active_mask.sum(dim=1).cpu().numpy()
    per_signal = active_mask.sum(dim=0).cpu().numpy()
    mean_active = np.mean(per_word)
    fig, (ax_hist, ax_util) = plt.subplots(1, 2, figsize=(14, 5))
    # Histogram with integer-centered bins.
    edges = np.arange(per_word.min(), per_word.max() + 2) - 0.5
    ax_hist.hist(per_word, bins=edges, color='teal', alpha=0.7, edgecolor='black')
    ax_hist.axvline(x=mean_active, color='red', linestyle='--',
                    label=f'Mean: {mean_active:.1f}')
    ax_hist.set_title("Per-Word Sparsity (# Active Signals)")
    ax_hist.set_xlabel("Number of Active Signals")
    ax_hist.set_ylabel("Frequency")
    ax_hist.legend()
    # How many words light up each individual signal.
    ax_util.bar(range(n_signals), per_signal, color='coral', alpha=0.7, width=1.0)
    ax_util.set_title("Signal Utilization (# words activating each signal)")
    ax_util.set_xlabel("Signal Index")
    ax_util.set_ylabel("# Words")
    ax_util.axhline(y=np.mean(per_signal), color='red', linestyle='--',
                    label=f'Mean: {np.mean(per_signal):.0f}')
    ax_util.legend()
    plt.suptitle("reFlow Sparsity Profile", fontsize=14, fontweight='bold')
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    report = f"Threshold: {threshold:.4f} (mean + std)\n"
    report += f"Avg active signals per word: {mean_active:.1f} / {n_signals}\n"
    report += f"Global activation rate: {active_mask.float().mean().item():.2%}\n"
    if word_to_inspect and word_to_inspect.strip():
        query = word_to_inspect.strip()
        tids = enc.encode(" " + query)
        if tids and tids[0] < REAL_VOCAB:
            recipe = recipes[tids[0]]
            n_active = (recipe.abs() > threshold).sum().item()
            strongest = torch.argsort(recipe.abs(), descending=True)[:10]
            report += f"\n--- '{query}' ---\n"
            report += f"Active signals: {n_active}\n"
            report += f"Top 10 signal indices: {strongest.tolist()}\n"
            report += f"Top 10 amplitudes: {[f'{recipe[s].item():.4f}' for s in strongest]}\n"
        else:
            report += f"\n'{query}' not found in vocabulary.\n"
    return fig, report
# ---------------------------------------------------------------------------
# 5. Layer Evolution
# ---------------------------------------------------------------------------
@torch.inference_mode()
def exp_layer_evolution(prompt_text):
    """Decode the next-token distribution after every transformer block.

    Plots (left) the per-layer probability of the six strongest final
    candidates and (right) the per-layer entropy of the full distribution.
    """
    model, enc, device = get_model()
    # NOTE(review): this local was unused in the original implementation too;
    # _get_logits_from_hidden fetches the matrix itself each layer. Kept in
    # case get_dynamic_vocab_matrix() has side effects — verify.
    vocab_matrix = model.transformer.wte.get_dynamic_vocab_matrix()
    n_layers = len(model.transformer.h)
    ids = torch.tensor(enc.encode(prompt_text), device=device).unsqueeze(0)
    per_layer_probs, per_layer_entropy = [], []
    hidden = _embed(model, ids)
    freqs_cis = model.freqs_cis[:ids.size(1)]
    for block in model.transformer.h:
        hidden = block(hidden, freqs_cis)
        normed = model.transformer.ln_f(hidden[0, -1, :])
        probs = F.softmax(_get_logits_from_hidden(model, normed), dim=-1)
        per_layer_probs.append(probs.cpu().numpy())
        per_layer_entropy.append(-torch.sum(probs * torch.log(probs + 1e-9)).item())
    final_probs = per_layer_probs[-1][:REAL_VOCAB]
    top_idx = np.argsort(final_probs)[-6:]
    prob_flow = np.array([[p[i] for i in top_idx] for p in per_layer_probs])
    layers = np.arange(1, n_layers + 1)
    fig, (ax_prob, ax_ent) = plt.subplots(1, 2, figsize=(16, 5))
    palette = sns.color_palette("husl", len(top_idx))
    for i, idx in enumerate(top_idx):
        token_label = repr(enc.decode([idx])).strip("'")
        ax_prob.plot(layers, prob_flow[:, i], label=token_label, lw=2.5, color=palette[i])
    ax_prob.set_title(f"Probability Evolution: '{prompt_text}'", fontsize=11, fontweight='bold')
    ax_prob.set_xlabel("Layer")
    ax_prob.set_ylabel("Probability")
    ax_prob.yaxis.set_major_formatter(ticker.PercentFormatter(xmax=1.0, decimals=0))
    ax_prob.legend(fontsize=8, loc='upper left')
    ax_prob.grid(True, alpha=0.3)
    ax_ent.plot(layers, per_layer_entropy, color='#FF6B35', lw=2.5, marker='o', markersize=3)
    ax_ent.set_title(f"Entropy Decay: '{prompt_text}'", fontsize=11, fontweight='bold')
    ax_ent.set_xlabel("Layer")
    ax_ent.set_ylabel("Entropy (nats)")
    ax_ent.grid(True, alpha=0.3)
    predicted = enc.decode([np.argmax(final_probs)])
    plt.suptitle(f"reFlow Layer Evolution | Prediction: '{predicted}' (p={final_probs.max():.2%})",
                 fontsize=13, fontweight='bold')
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    return fig
# ---------------------------------------------------------------------------
# 6. Causal Ablation
# ---------------------------------------------------------------------------
@torch.inference_mode()
def exp_causal_ablation(prompt_text):
    """Measure how the model's top prediction collapses as its strongest
    contributing signals are zeroed out in power-of-two batches.

    Left panel: P(original prediction) vs number of ablated signals
    (log-log). Right panel: text summary including the top signal's
    "codebook" (tokens with the largest weight on that signal).

    Returns a matplotlib Figure.
    """
    model, enc, device = get_model()
    W_basis = model.transformer.wte.signal_basis.data
    W_v2s = _get_vocab_signals(model)
    ablation_steps = [1, 2, 4, 8, 16, 32, 64, 128]
    ids = torch.tensor(enc.encode(prompt_text), device=device).unsqueeze(0)
    x = _forward_through_layers(model, ids)
    x_norm = model.transformer.ln_f(x[0, -1, :])
    # Signal-space activations of the final position.
    sig_acts = x_norm @ W_basis.t()
    logits_base = sig_acts @ W_v2s[:REAL_VOCAB].t()
    probs_base = F.softmax(logits_base, dim=-1)
    pred_id = torch.argmax(probs_base).item()
    pred_word = enc.decode([pred_id])
    pred_prob = probs_base[pred_id].item()
    # Per-signal contribution to the predicted token's logit; the largest
    # contributors are ablated first.
    contribs = sig_acts * W_v2s[pred_id]
    sorted_sig_ids = torch.argsort(contribs, descending=True)
    steps, probs_list, new_preds = [], [], []
    for n_ablate in ablation_steps:
        if n_ablate > len(sorted_sig_ids):
            break
        ablated = sig_acts.clone()
        ablated[sorted_sig_ids[:n_ablate]] = 0.0
        logits_abl = ablated @ W_v2s[:REAL_VOCAB].t()
        probs_abl = F.softmax(logits_abl, dim=-1)
        new_pred_id = torch.argmax(probs_abl).item()
        steps.append(n_ablate)
        # Track both the original token's surviving probability and the
        # token that replaces it as argmax.
        probs_list.append(probs_abl[pred_id].item())
        new_preds.append(enc.decode([new_pred_id]))
    # Codebook for top signal
    top_sig = sorted_sig_ids[0].item()
    col = W_v2s[:REAL_VOCAB, top_sig]
    top_vals, top_ids = torch.topk(col, 8)  # NOTE(review): top_vals is unused
    cb_words = []
    for tid in top_ids:
        try:
            cb_words.append(enc.decode([tid.item()]).strip())
        except Exception:
            # Fall back to the raw id when a token fails to decode.
            cb_words.append(f"[{tid.item()}]")
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
    # Clamp to 1e-8 so the log-scale axis never receives zero.
    ax1.plot(steps, [max(p, 1e-8) for p in probs_list],
             'o-', color='#e74c3c', lw=2.5, markersize=6)
    ax1.axhline(y=pred_prob, color='blue', linestyle='--', alpha=0.5,
                label=f"Baseline: {pred_prob:.1%}")
    ax1.set_title(f"'{prompt_text}'\nPrediction: '{pred_word}'", fontsize=10, fontweight='bold')
    ax1.set_xlabel("# Signals Ablated")
    ax1.set_ylabel("P(original prediction)")
    ax1.set_yscale('log')
    ax1.yaxis.set_major_formatter(ticker.PercentFormatter(xmax=1.0, decimals=2))
    ax1.set_xscale('log', base=2)
    ax1.legend(fontsize=8)
    ax1.grid(True, alpha=0.3)
    # Text summary
    ax2.axis('off')
    summary = f"Baseline: '{pred_word}' (p={pred_prob:.2%})\n"
    summary += f"Key Signal: #{top_sig}\n"
    summary += f"Codebook: {', '.join(cb_words[:6])}\n\n"
    summary += "Ablation Results:\n" + "-"*40 + "\n"
    for s, p, nw in zip(steps, probs_list, new_preds):
        summary += f" {s:3d} signals removed -> p={p:.2%}, pred='{nw}'\n"
    ax2.text(0.05, 0.95, summary, transform=ax2.transAxes, fontsize=10,
             verticalalignment='top', fontfamily='monospace',
             bbox=dict(boxstyle='round', facecolor='lightyellow', alpha=0.8))
    plt.suptitle("reFlow Causal Ablation", fontsize=14, fontweight='bold')
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    return fig
# ---------------------------------------------------------------------------
# 7. Concept Inception
# ---------------------------------------------------------------------------
@torch.inference_mode()
def exp_concept_inception(prompt_text, target_word, alpha_max):
    """Steer the model toward `target_word` by adding alpha * target recipe
    to the final-layer signal activations; plot P(target) vs alpha.

    Binary search locates the smallest alpha whose argmax flips to the
    target (the "critical" steering strength).

    Returns:
        (figure, info_text)
    """
    model, enc, device = get_model()
    W_basis = model.transformer.wte.signal_basis.data
    W_v2s = _get_vocab_signals(model)
    tids = enc.encode(" " + target_word)
    # Guard against an un-encodable target instead of crashing with IndexError.
    if not tids:
        fig, ax = plt.subplots(figsize=(8, 5))
        ax.text(0.5, 0.5, f"Cannot encode target word '{target_word}'",
                ha='center', va='center', fontsize=12)
        ax.axis('off')
        return fig, f"Cannot encode target word '{target_word}'"
    tid = tids[0]
    target_recipe = W_v2s[tid]
    ids = torch.tensor(enc.encode(prompt_text), device=device).unsqueeze(0)
    x = _forward_through_layers(model, ids)
    x_norm = model.transformer.ln_f(x[0, -1, :])
    base_sig = x_norm @ W_basis.t()
    vocab_t = W_v2s[:REAL_VOCAB].t()

    def probs_at(alpha):
        # Next-token distribution after adding `alpha` units of the recipe.
        return F.softmax((base_sig + alpha * target_recipe) @ vocab_t, dim=-1)

    probs_base = probs_at(0.0)
    orig_word = enc.decode([torch.argmax(probs_base).item()])
    orig_prob = probs_base[tid].item()
    # Binary search for the critical alpha where the argmax flips to target.
    lo, hi = 0.0, float(alpha_max)
    critical_alpha = None
    if torch.argmax(probs_at(hi)).item() == tid:
        for _ in range(20):
            mid = (lo + hi) / 2
            if torch.argmax(probs_at(mid)).item() == tid:
                hi = mid
            else:
                lo = mid
        critical_alpha = hi
    # Curve range: 1.5x the flip point (when found), capped at alpha_max.
    # BUGFIX: use `is not None` rather than truthiness — a critical alpha
    # near 0.0 was previously treated as "not found".
    if critical_alpha is not None:
        alpha_range = min(float(alpha_max), critical_alpha * 1.5)
    else:
        alpha_range = float(alpha_max)
    alphas = np.linspace(0, alpha_range, 50)
    target_probs = [probs_at(a)[tid].item() for a in alphas]
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.plot(alphas, target_probs, 'o-', color='#9b59b6', lw=2, markersize=3)
    if critical_alpha is not None:
        ax.axvline(critical_alpha, color='red', linestyle='--',
                   label=f"Critical alpha={critical_alpha:.1f}")
    ax.axhline(y=orig_prob, color='gray', linestyle=':', alpha=0.5,
               label=f"Baseline P('{target_word}')={orig_prob:.1e}")
    ax.set_title(f"'{prompt_text}'\n'{orig_word}' -> '{target_word}'",
                 fontsize=11, fontweight='bold')
    ax.set_xlabel("Steering Strength (alpha)")
    ax.set_ylabel(f"P('{target_word}')")
    ax.yaxis.set_major_formatter(ticker.PercentFormatter(xmax=1.0, decimals=0))
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    info = f"Original prediction: '{orig_word}'\n"
    info += f"Target: '{target_word}'\n"
    if critical_alpha is not None:
        info += f"Critical flip point: alpha = {critical_alpha:.2f}\n"
    else:
        info += f"Target not reached within alpha <= {alpha_max}\n"
    return fig, info
# ---------------------------------------------------------------------------
# 8. Text Generation
# ---------------------------------------------------------------------------
@torch.inference_mode()
def exp_generate(prompt_text, num_samples, max_tokens, temperature, top_k, repetition_penalty):
    """Sample continuations of a prompt with temperature / top-k /
    repetition-penalty controls, returning all samples as one text blob."""
    model, enc, device = get_model()
    num_samples = int(num_samples)
    max_tokens = int(max_tokens)
    top_k = int(top_k) if top_k and top_k > 0 else None
    temperature = float(temperature)
    repetition_penalty = float(repetition_penalty)
    if not prompt_text.strip():
        return "Please enter a prompt."
    prompt_ids = torch.tensor(enc.encode(prompt_text), device=device).unsqueeze(0)
    # Replicate the prompt once per requested sample.
    prompt_ids = prompt_ids.expand(num_samples, -1).contiguous()
    samples = []
    for row in range(num_samples):
        seq = prompt_ids[row:row + 1]
        for _ in range(max_tokens):
            # Crop to the model's context window.
            window = seq if seq.size(1) <= model.config.block_size else seq[:, -model.config.block_size:]
            logits, _ = model(window)
            logits = logits[:, -1, :]
            # Repetition penalty: dampen every token already generated.
            if repetition_penalty != 1.0:
                for token_id in set(seq[0].tolist()):
                    if logits[0, token_id] > 0:
                        logits[0, token_id] /= repetition_penalty
                    else:
                        logits[0, token_id] *= repetition_penalty
            # Temperature scaling (clamped to avoid division by zero).
            logits = logits / max(temperature, 1e-8)
            # Top-k filtering.
            if top_k is not None and top_k > 0:
                kth = torch.topk(logits, min(top_k, logits.size(-1)))[0][:, [-1]]
                logits[logits < kth] = -float('Inf')
            next_id = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
            seq = torch.cat((seq, next_id), dim=1)
        samples.append(enc.decode(seq[0].tolist()))
    divider = "\n" + "=" * 60 + "\n"
    pieces = []
    for i, text in enumerate(samples):
        chunk = ""
        if num_samples > 1:
            chunk += f"--- Sample {i+1}/{num_samples} ---\n"
        chunk += text + "\n"
        if i < len(samples) - 1:
            chunk += divider
        pieces.append(chunk)
    return "".join(pieces)
# ---------------------------------------------------------------------------
# 9. Signal Basis Geometry
# ---------------------------------------------------------------------------
@torch.inference_mode()
def exp_basis_geometry():
    """Inspect the learned signal basis geometry.

    Left: normalized singular-value spectrum vs a random Gaussian matrix of
    the same shape (Shannon effective ranks in the title). Right: cosine
    similarity heatmap of the first 64 signal directions.

    Returns (figure, stats_text).
    """
    model, enc, device = get_model()
    basis = model.transformer.wte.signal_basis.data.cpu().float()
    n_signals, n_embd = basis.shape

    def _spectrum(mat):
        # Singular values plus the exp-entropy "effective rank" of the
        # normalized spectrum.
        s = torch.linalg.svd(mat, full_matrices=False)[1].numpy()
        p = s / s.sum()
        return s, np.exp(-np.sum(p * np.log(p + 1e-12)))

    S_np, effective_rank = _spectrum(basis)
    S_rand_np, effective_rank_rand = _spectrum(torch.randn_like(basis))
    show_n = min(64, n_signals)
    unit_rows = F.normalize(basis[:show_n], dim=1)
    cos_sim = (unit_rows @ unit_rows.t()).numpy()
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
    ax1.plot(S_np / S_np[0], 'b-', lw=2, label='Learned Basis')
    ax1.plot(S_rand_np / S_rand_np[0], 'r--', lw=1.5, label='Random Gaussian')
    ax1.set_title(f"Singular Value Spectrum\n(Eff. rank: learned={effective_rank:.0f}, random={effective_rank_rand:.0f})")
    ax1.set_xlabel("Component Index")
    ax1.set_ylabel("Normalized Singular Value")
    ax1.set_yscale('log')
    ax1.legend()
    ax1.grid(True, alpha=0.3)
    im = ax2.imshow(cos_sim, cmap='RdBu_r', vmin=-1, vmax=1, aspect='auto')
    ax2.set_title(f"Cosine Similarity (first {show_n} signals)")
    ax2.set_xlabel("Signal Index")
    ax2.set_ylabel("Signal Index")
    plt.colorbar(im, ax=ax2, fraction=0.046)
    plt.suptitle("reFlow Signal Basis Geometry", fontsize=14, fontweight='bold')
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    stats = f"Signal basis shape: ({n_signals}, {n_embd})\n"
    stats += f"Effective rank (learned): {effective_rank:.1f} / {min(n_signals, n_embd)}\n"
    stats += f"Effective rank (random): {effective_rank_rand:.1f} / {min(n_signals, n_embd)}\n"
    return fig, stats
# ---------------------------------------------------------------------------
# 10. Recipe Neighbors (Nearest Neighbor Lookup)
# ---------------------------------------------------------------------------
@torch.inference_mode()
def exp_recipe_neighbors(query_word, top_n):
    """List the top-N cosine-nearest vocabulary tokens for each query word
    (comma-separated input). Returns a plain-text report."""
    model, enc, device = get_model()
    unit = F.normalize(_get_vocab_signals(model)[:REAL_VOCAB], dim=1)
    top_n = int(top_n)
    queries = [q.strip() for q in query_word.split(",") if q.strip()]
    if not queries:
        return "Please enter at least one word."
    report = []
    for q in queries:
        tids = enc.encode(" " + q)
        if not tids or tids[0] >= REAL_VOCAB:
            report.append(f"'{q}' not found in vocabulary.\n\n")
            continue
        tid = tids[0]
        sims = unit[tid] @ unit.t()
        sims[tid] = -1  # never report the query token itself
        vals, neighbor_ids = torch.topk(sims, top_n)
        section = [f"Nearest neighbors for '{q}':\n" + "-" * 40 + "\n"]
        for rank, (v, nid) in enumerate(zip(vals, neighbor_ids)):
            try:
                neighbor = enc.decode([nid.item()]).strip()
            except Exception:
                neighbor = f"[{nid.item()}]"
            section.append(f" #{rank+1:2d} {neighbor:<20s} cos={v.item():.4f}\n")
        section.append("\n")
        report.append("".join(section))
    return "".join(report)
# ---------------------------------------------------------------------------
# 11. Task Crystallization
# ---------------------------------------------------------------------------
@torch.inference_mode()
def exp_task_crystallization(prompt_text, target_word, max_alpha, start_layer):
    """Find the layer after which steering can no longer flip the model's
    prediction to `target_word` (the "crystallization" boundary).

    A steering vector (target recipe minus baseline-prediction recipe,
    mapped back to embedding space through the signal basis) is injected
    into the last position's residual stream from a chosen layer onward;
    the scan repeats this for every start layer at a fixed working alpha.

    Returns:
        (figure_or_None, info_text) — figure is None when no alpha up to
        `max_alpha` flips the prediction at `start_layer`.
    """
    model, enc, device = get_model()
    W_basis = model.transformer.wte.signal_basis.data
    W_v2s = _get_vocab_signals(model)
    n_layers = len(model.transformer.h)
    start_layer = int(start_layer)
    max_alpha = float(max_alpha)
    target_tid = enc.encode(" " + target_word.strip())[0]
    ids = torch.tensor(enc.encode(prompt_text), device=device).unsqueeze(0)
    # Get baseline prediction
    x = _forward_through_layers(model, ids)
    x_norm = model.transformer.ln_f(x[0, -1, :])
    logits_base = _get_logits_from_hidden(model, x_norm)
    base_pred_id = torch.argmax(logits_base).item()
    base_pred = enc.decode([base_pred_id])
    # Find working alpha
    def continuous_steer(alpha, intercept_layer):
        # Inject alpha * (target - baseline) recipes into the last token's
        # hidden state after every block from `intercept_layer` on; layer 0
        # additionally steers the raw embedding before the first block.
        steer_vec = W_v2s[target_tid] - W_v2s[base_pred_id]
        x = _embed(model, ids)
        if intercept_layer == 0:
            x[:, -1, :] += (alpha * steer_vec) @ W_basis
        freqs_cis = model.freqs_cis[:ids.size(1)]
        for i, block in enumerate(model.transformer.h):
            x = block(x, freqs_cis)
            if i + 1 >= intercept_layer:
                x[:, -1, :] += (alpha * steer_vec) @ W_basis
        x_norm = model.transformer.ln_f(x[0, -1, :])
        logits = _get_logits_from_hidden(model, x_norm)
        probs = F.softmax(logits, dim=-1)
        pred_id = torch.argmax(logits).item()
        return probs[target_tid].item(), enc.decode([pred_id]).strip()
    # Find minimum alpha that works at start_layer
    working_alpha = None
    for a in np.arange(2.0, max_alpha, 2.0):
        _, pred = continuous_steer(a, start_layer)
        if pred.strip() == target_word.strip():
            working_alpha = a * 1.2  # 20% headroom above the first success
            break
    if working_alpha is None:
        return None, f"Cannot steer to '{target_word}' within alpha <= {max_alpha}"
    # Scan across layers: c_layer records the first start layer at which
    # steering fails to produce the target (stays n_layers if it never fails).
    layer_probs = []
    c_layer = n_layers
    for L in range(n_layers):
        p_target, pred = continuous_steer(working_alpha, L)
        layer_probs.append(p_target)
        if pred.strip() != target_word.strip() and c_layer == n_layers:
            c_layer = L
    # Plot
    fig, ax = plt.subplots(figsize=(10, 6))
    layers_x = np.arange(n_layers)
    ax.plot(layers_x, layer_probs, 'o-', color='#9b59b6', lw=2.5, markersize=4)
    if c_layer < n_layers:
        ax.scatter(c_layer, layer_probs[c_layer], color='red', s=150, marker='X', edgecolors='black', zorder=5)
        ax.axvline(c_layer, color='red', linestyle='--', alpha=0.5, label=f'Crystallization boundary: Layer {c_layer}')
    ax.set_title(f"Task Crystallization: '{prompt_text}' → '{target_word}'", fontsize=11, fontweight='bold')
    ax.set_xlabel("Intervention Start Layer")
    ax.set_ylabel(f"P('{target_word}')")
    ax.yaxis.set_major_formatter(ticker.PercentFormatter(xmax=1.0, decimals=0))
    ax.legend(fontsize=9)
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    info = f"Base prediction: '{base_pred}'\n"
    info += f"Target: '{target_word}'\n"
    info += f"Working alpha: {working_alpha:.1f}\n"
    info += f"Crystallization boundary: Layer {c_layer}\n"
    return fig, info