engram / scripts /paper_figures.py

feat: upload scripts

954cf8a verified 7 days ago

44.6 kB

	#!/usr/bin/env python3
	"""ENGRAM Research Paper — Figure Generation.

	Generates all 15 figures for the ENGRAM paper from results/ data files.
	Output: results/figures/*.pdf (LaTeX-compatible, 300 DPI)

	Usage:
	cd ENGRAM && python scripts/paper_figures.py
	python scripts/paper_figures.py --only fig02 # Single figure
	python scripts/paper_figures.py --list # List all figures
	"""

	from __future__ import annotations

	import argparse
	import json
	import sys
	from pathlib import Path
	from typing import Any

	import matplotlib
	matplotlib.use("Agg") # Non-interactive backend
	import matplotlib.pyplot as plt
	import matplotlib.ticker as ticker
	import numpy as np

	# ── Configuration ────────────────────────────────────────────────────────

	RESULTS_DIR = Path(__file__).parent.parent / "results"
	FIGURES_DIR = RESULTS_DIR / "figures"
	ABSOLUTE_DIR = RESULTS_DIR / "absolute"
	STRESS_DIR = RESULTS_DIR / "stress"

	# LaTeX-compatible style
	plt.rcParams.update({
	"font.family": "serif",
	"font.size": 11,
	"axes.labelsize": 12,
	"axes.titlesize": 13,
	"xtick.labelsize": 10,
	"ytick.labelsize": 10,
	"legend.fontsize": 10,
	"figure.dpi": 300,
	"savefig.dpi": 300,
	"savefig.bbox": "tight",
	"savefig.pad_inches": 0.1,
	"axes.grid": True,
	"grid.alpha": 0.3,
	"axes.spines.top": False,
	"axes.spines.right": False,
	})

	# Colorblind-safe palette
	COLORS = {
	"blue": "#4477AA",
	"orange": "#EE6677",
	"green": "#228833",
	"purple": "#AA3377",
	"cyan": "#66CCEE",
	"grey": "#BBBBBB",
	"red": "#CC3311",
	"teal": "#009988",
	"yellow": "#CCBB44",
	"indigo": "#332288",
	}

	PASS_COLOR = COLORS["green"]
	FAIL_COLOR = COLORS["red"]


	# ── Data Loading ─────────────────────────────────────────────────────────

	def load_json(path: Path) -> dict[str, Any]:
	"""Load JSON file and return parsed dict."""
	return json.loads(path.read_text())


	def save_figure(fig: plt.Figure, name: str) -> None:
	"""Save figure as PDF and PNG."""
	FIGURES_DIR.mkdir(parents=True, exist_ok=True)
	fig.savefig(FIGURES_DIR / f"{name}.pdf", format="pdf")
	fig.savefig(FIGURES_DIR / f"{name}.png", format="png")
	plt.close(fig)
	print(f" Saved: {name}.pdf + .png")


	# ── Figure 2: Frequency Combination Comparison ──────────────────────────

	def fig02_frequency_comparison() -> None:
	"""Bar chart: 6 frequency combos × recall and margin."""
	print("Fig 02: Frequency combination comparison...")
	data = load_json(ABSOLUTE_DIR / "multifreq_comparison.json")
	results = data["results"]

	combos = list(results.keys())
	recalls = [results[c]["recall"] * 100 for c in combos]
	margins = [results[c]["margin_mean"] * 1000 for c in combos] # ×1000
	failures = [results[c]["n_failures"] for c in combos]

	fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4.5))

	# Left: Recall
	x = np.arange(len(combos))
	bar_colors = [COLORS["green"] if c == "f0+f1" else COLORS["blue"] for c in combos]
	bars = ax1.bar(x, recalls, color=bar_colors, edgecolor="white", linewidth=0.5)
	ax1.set_xticks(x)
	ax1.set_xticklabels(combos, rotation=30, ha="right")
	ax1.set_ylabel("Recall@1 (%)")
	ax1.set_title("(a) Recall by Frequency Combination")
	ax1.set_ylim(60, 102)
	for bar, val, nf in zip(bars, recalls, failures):
	ax1.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.5,
	f"{val:.0f}%\n({nf} fail)", ha="center", va="bottom", fontsize=8)

	# Right: Mean margin
	bars2 = ax2.bar(x, margins, color=bar_colors, edgecolor="white", linewidth=0.5)
	ax2.set_xticks(x)
	ax2.set_xticklabels(combos, rotation=30, ha="right")
	ax2.set_ylabel("Mean Margin (×10³)")
	ax2.set_title("(b) Mean Discrimination Margin")
	for bar, val in zip(bars2, margins):
	ax2.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.05,
	f"{val:.1f}", ha="center", va="bottom", fontsize=8)

	fig.suptitle("Multi-Frequency Fingerprint Ablation (N=200)", fontsize=14, y=1.02)
	fig.tight_layout()
	save_figure(fig, "fig02_frequency_comparison")


	# ── Figure 3: Margin Power Law ──────────────────────────────────────────

	def fig03_margin_power_law() -> None:
	"""Log-log plot: margin vs N for f1 and f0+f1 with fitted power laws."""
	print("Fig 03: Margin power law...")
	f1_data = load_json(ABSOLUTE_DIR / "margin_compression_law.json")
	f0f1_data = load_json(ABSOLUTE_DIR / "multifreq_law.json")

	# f1 data
	f1_n = [int(n) for n in f1_data["results"].keys()]
	f1_margins = [f1_data["results"][str(n)]["mean_margin"] for n in f1_n]
	f1_alpha = f1_data["alpha"]
	f1_A = f1_data["A"]

	# f0+f1 data
	f0f1_n = [int(n) for n in f0f1_data["results"].keys()]
	f0f1_margins = [f0f1_data["results"][str(n)]["mean_margin"] for n in f0f1_n]
	f0f1_alpha = f0f1_data["alpha"]
	f0f1_A = f0f1_data["A"]

	fig, ax = plt.subplots(figsize=(7, 5))

	# Data points
	ax.scatter(f1_n, f1_margins, color=COLORS["orange"], s=60, zorder=5, label="f1 (data)")
	ax.scatter(f0f1_n, f0f1_margins, color=COLORS["blue"], s=60, zorder=5, label="f0+f1 (data)")

	# Fitted curves
	n_fit = np.linspace(3, 250, 200)
	f1_fit = f1_A * n_fit ** f1_alpha
	f0f1_fit = f0f1_A * n_fit ** f0f1_alpha

	ax.plot(n_fit, f1_fit, color=COLORS["orange"], linestyle="--", alpha=0.7,
	label=f"f1 fit: {f1_A:.4f}·N^{{{f1_alpha:.3f}}}")
	ax.plot(n_fit, f0f1_fit, color=COLORS["blue"], linestyle="--", alpha=0.7,
	label=f"f0+f1 fit: {f0f1_A:.4f}·N^{{{f0f1_alpha:.3f}}}")

	ax.set_xscale("log")
	ax.set_yscale("log")
	ax.set_xlabel("Corpus Size N")
	ax.set_ylabel("Mean Discrimination Margin")
	ax.set_title("Margin Power Law: Graceful Degradation")
	ax.legend(loc="upper right")
	ax.xaxis.set_major_formatter(ticker.ScalarFormatter())
	ax.set_xticks([5, 10, 20, 50, 100, 200])

	# Annotation
	ax.annotate(
	f"f0+f1: α={f0f1_alpha:.3f} (shallower)\nf1: α={f1_alpha:.3f}",
	xy=(100, f0f1_A * 100 ** f0f1_alpha), xytext=(30, 0.003),
	arrowprops={"arrowstyle": "->", "color": COLORS["grey"]},
	fontsize=9, bbox={"boxstyle": "round,pad=0.3", "facecolor": "wheat", "alpha": 0.5}
	)

	fig.tight_layout()
	save_figure(fig, "fig03_margin_power_law")


	# ── Figure 4: Recall vs N — Fourier vs FCDB ─────────────────────────────

	def fig04_recall_vs_n() -> None:
	"""Fourier f0+f1 recall vs FCDB recall across corpus sizes."""
	print("Fig 04: Recall vs N (Fourier vs FCDB)...")
	f0f1_data = load_json(ABSOLUTE_DIR / "multifreq_law.json")
	stress_data = load_json(STRESS_DIR / "STRESS_SUMMARY.json")

	# Fourier f0+f1
	fourier_n = [int(n) for n in f0f1_data["results"].keys()]
	fourier_recall = [f0f1_data["results"][str(n)]["recall"] * 100 for n in fourier_n]

	# FCDB cross-model
	fcdb_map = stress_data["recall_at_1_vs_n_fcdb"]
	fcdb_n = [int(n) for n in fcdb_map.keys()]
	fcdb_recall = [v * 100 for v in fcdb_map.values()]

	fig, ax = plt.subplots(figsize=(7, 5))

	ax.plot(fourier_n, fourier_recall, "o-", color=COLORS["blue"], linewidth=2,
	markersize=7, label="Fourier f0+f1 (same-model)", zorder=5)
	ax.plot(fcdb_n, fcdb_recall, "s--", color=COLORS["orange"], linewidth=2,
	markersize=7, label="FCDB (cross-model)", zorder=5)

	# Collapse annotation
	ax.axvline(x=100, color=COLORS["red"], linestyle=":", alpha=0.5)
	ax.annotate("FCDB collapse\n(N=100)", xy=(100, 30), xytext=(140, 50),
	arrowprops={"arrowstyle": "->", "color": COLORS["red"]},
	fontsize=9, color=COLORS["red"])

	ax.set_xlabel("Corpus Size N")
	ax.set_ylabel("Recall@1 (%)")
	ax.set_title("Retrieval Recall vs Corpus Size")
	ax.legend(loc="lower left")
	ax.set_ylim(-5, 105)
	ax.set_xlim(0, 210)

	fig.tight_layout()
	save_figure(fig, "fig04_recall_vs_n")


	# ── Figure 5: Cross-Model Strategy Comparison ───────────────────────────

	def fig05_cross_model_strategies() -> None:
	"""Horizontal bar chart: 9 cross-model methods × margin."""
	print("Fig 05: Cross-model strategy comparison...")

	strategies = [
	("CCA", -0.420, False),
	("Residual FCB", -0.382, False),
	("Procrustes", -0.104, False),
	("RR (K=20)", -0.066, False),
	("FCB+ridge", -0.017, False),
	("Contrastive", 0.001, True),
	("JCB", 0.011, True),
	("JCB+delta", 0.037, True),
	("FCDB", 0.124, True),
	]

	names = [s[0] for s in strategies]
	margins = [s[1] for s in strategies]
	colors = [PASS_COLOR if s[2] else FAIL_COLOR for s in strategies]

	fig, ax = plt.subplots(figsize=(8, 5))
	y_pos = np.arange(len(names))

	bars = ax.barh(y_pos, margins, color=colors, edgecolor="white", linewidth=0.5, height=0.7)
	ax.set_yticks(y_pos)
	ax.set_yticklabels(names)
	ax.set_xlabel("Retrieval Margin")
	ax.set_title("Cross-Model Transfer Strategies (Llama 3B → 8B)")
	ax.axvline(x=0, color="black", linewidth=0.8)

	# Value labels
	for bar, val in zip(bars, margins):
	x_offset = 0.005 if val >= 0 else -0.005
	ha = "left" if val >= 0 else "right"
	ax.text(val + x_offset, bar.get_y() + bar.get_height() / 2,
	f"{val:+.3f}", ha=ha, va="center", fontsize=9, fontweight="bold")

	# Legend
	from matplotlib.patches import Patch
	legend_elements = [Patch(facecolor=PASS_COLOR, label="PASS (margin > 0)"),
	Patch(facecolor=FAIL_COLOR, label="FAIL (margin ≤ 0)")]
	ax.legend(handles=legend_elements, loc="lower right")

	fig.tight_layout()
	save_figure(fig, "fig05_cross_model_strategies")


	# ── Figure 6: CKA Layer Similarity ──────────────────────────────────────

	def fig06_cka_layers() -> None:
	"""CKA similarity per layer: within-family vs cross-family."""
	print("Fig 06: CKA layer similarity...")
	within = load_json(ABSOLUTE_DIR / "FAMILY_CKA.json")
	cross = load_json(ABSOLUTE_DIR / "FAMILY_CKA_CROSS.json")

	within_cka = within["layer_ckas"]
	cross_cka = cross["layer_ckas"]
	layers = list(range(len(within_cka)))

	fig, ax = plt.subplots(figsize=(8, 4.5))

	ax.plot(layers, within_cka, "o-", color=COLORS["blue"], markersize=5, linewidth=1.5,
	label=f"Within-family (Llama 3B↔8B), μ={within['mean_cka']:.3f}")
	ax.plot(layers, cross_cka, "s--", color=COLORS["orange"], markersize=5, linewidth=1.5,
	label=f"Cross-family (Llama↔Qwen), μ={cross['mean_cka']:.3f}")

	ax.axhline(y=0.95, color=COLORS["grey"], linestyle=":", alpha=0.5, label="0.95 threshold")
	ax.set_xlabel("Layer Index")
	ax.set_ylabel("CKA Similarity")
	ax.set_title("Centered Kernel Alignment Across Layers")
	ax.legend(loc="lower left", fontsize=9)
	ax.set_ylim(0.85, 1.0)

	# Annotate min
	min_idx_w = int(np.argmin(within_cka))
	min_idx_c = int(np.argmin(cross_cka))
	ax.annotate(f"min={within_cka[min_idx_w]:.3f}", xy=(min_idx_w, within_cka[min_idx_w]),
	xytext=(min_idx_w + 2, within_cka[min_idx_w] - 0.01),
	fontsize=8, color=COLORS["blue"])
	ax.annotate(f"min={cross_cka[min_idx_c]:.3f}", xy=(min_idx_c, cross_cka[min_idx_c]),
	xytext=(min_idx_c + 2, cross_cka[min_idx_c] - 0.01),
	fontsize=8, color=COLORS["orange"])

	fig.tight_layout()
	save_figure(fig, "fig06_cka_layers")


	# ── Figure 7: Domain Confusion Before/After ──────────────────────────────

	def fig07_confusion_matrix() -> None:
	"""Heatmaps: f1 confusion vs f0+f1 confusion across domains."""
	print("Fig 07: Domain confusion matrix...")
	data = load_json(ABSOLUTE_DIR / "confusion_analysis.json")

	domains = sorted({
	k.split(" -> ")[0] for k in data["f1_confusion"].keys()
	} \| {
	k.split(" -> ")[1] for k in data["f1_confusion"].keys()
	})

	def build_matrix(confusion_dict: dict[str, int]) -> np.ndarray:
	n = len(domains)
	mat = np.zeros((n, n))
	for key, count in confusion_dict.items():
	src, dst = key.split(" -> ")
	if src in domains and dst in domains:
	i = domains.index(src)
	j = domains.index(dst)
	mat[i, j] = count
	return mat

	f1_mat = build_matrix(data["f1_confusion"])
	best_mat = build_matrix(data["best_confusion"])

	# Short domain labels
	short_labels = [d[:6] for d in domains]

	fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

	im1 = ax1.imshow(f1_mat, cmap="Reds", aspect="auto", interpolation="nearest")
	ax1.set_xticks(range(len(domains)))
	ax1.set_yticks(range(len(domains)))
	ax1.set_xticklabels(short_labels, rotation=45, ha="right", fontsize=8)
	ax1.set_yticklabels(short_labels, fontsize=8)
	ax1.set_title("(a) f1 Only — 28 Failures")
	ax1.set_xlabel("Confused With")
	ax1.set_ylabel("True Domain")
	fig.colorbar(im1, ax=ax1, shrink=0.8)

	im2 = ax2.imshow(best_mat, cmap="Blues", aspect="auto", interpolation="nearest")
	ax2.set_xticks(range(len(domains)))
	ax2.set_yticks(range(len(domains)))
	ax2.set_xticklabels(short_labels, rotation=45, ha="right", fontsize=8)
	ax2.set_yticklabels(short_labels, fontsize=8)
	ax2.set_title("(b) f0+f1 — 4 Failures")
	ax2.set_xlabel("Confused With")
	ax2.set_ylabel("True Domain")
	fig.colorbar(im2, ax=ax2, shrink=0.8)

	fig.suptitle("Domain Confusion Analysis (N=200)", fontsize=14, y=1.02)
	fig.tight_layout()
	save_figure(fig, "fig07_confusion_matrix")


	# ── Figure 8: Domain Recall Radar ────────────────────────────────────────

	def fig08_domain_recall_radar() -> None:
	"""Radar chart: per-domain recall with f0+f1."""
	print("Fig 08: Domain recall radar...")
	data = load_json(ABSOLUTE_DIR / "confusion_analysis.json")
	domain_recall = data["domain_recall"]

	categories = list(domain_recall.keys())
	values = [domain_recall[c] * 100 for c in categories]

	# Close the polygon
	values_closed = values + [values[0]]
	n = len(categories)
	angles = [i / n * 2 * np.pi for i in range(n)]
	angles_closed = angles + [angles[0]]

	fig, ax = plt.subplots(figsize=(6, 6), subplot_kw={"projection": "polar"})

	ax.plot(angles_closed, values_closed, "o-", color=COLORS["blue"], linewidth=2, markersize=6)
	ax.fill(angles_closed, values_closed, color=COLORS["blue"], alpha=0.15)

	ax.set_xticks(angles)
	ax.set_xticklabels([c.replace("_", "\n") for c in categories], fontsize=9)
	ax.set_ylim(80, 102)
	ax.set_yticks([85, 90, 95, 100])
	ax.set_yticklabels(["85%", "90%", "95%", "100%"], fontsize=8)
	ax.set_title("Per-Domain Recall@1 (f0+f1, N=200)", pad=20)

	# Annotate minimum
	min_idx = int(np.argmin(values))
	ax.annotate(f"{values[min_idx]:.0f}%",
	xy=(angles[min_idx], values[min_idx]),
	xytext=(angles[min_idx] + 0.2, values[min_idx] - 3),
	fontsize=9, fontweight="bold", color=COLORS["red"])

	fig.tight_layout()
	save_figure(fig, "fig08_domain_recall_radar")


	# ── Figure 9: HNSW Benchmark ────────────────────────────────────────────

	def fig09_hnsw_benchmark() -> None:
	"""Bar chart: HNSW vs brute-force latency."""
	print("Fig 09: HNSW benchmark...")
	data = load_json(ABSOLUTE_DIR / "HNSW_BENCH.json")

	fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(9, 4))

	# Latency comparison
	methods = ["Brute-Force", "HNSW"]
	latencies = [data["bf_latency_us"], data["hnsw_latency_us"]]
	colors = [COLORS["orange"], COLORS["blue"]]
	bars = ax1.bar(methods, latencies, color=colors, edgecolor="white", width=0.5)
	ax1.set_ylabel("Latency (μs)")
	ax1.set_title(f"(a) Search Latency — {data['speedup']:.1f}× Speedup")
	for bar, val in zip(bars, latencies):
	ax1.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 3,
	f"{val:.1f} μs", ha="center", va="bottom", fontsize=10)

	# Recall comparison
	recalls = [data["bruteforce_recall"] * 100, data["hnsw_recall"] * 100]
	bars2 = ax2.bar(methods, recalls, color=colors, edgecolor="white", width=0.5)
	ax2.set_ylabel("Recall@1 (%)")
	ax2.set_title("(b) Recall Preserved")
	ax2.set_ylim(98, 100.5)
	for bar, val in zip(bars2, recalls):
	ax2.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.05,
	f"{val:.1f}%", ha="center", va="bottom", fontsize=10)

	fig.suptitle("HNSW Index Benchmark (N=200)", fontsize=14, y=1.02)
	fig.tight_layout()
	save_figure(fig, "fig09_hnsw_benchmark")


	# ── Figure 10: INT8 Compression ──────────────────────────────────────────

	def fig10_int8_compression() -> None:
	"""Bar chart: FP16 vs INT8 comparison."""
	print("Fig 10: INT8 compression...")

	fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(9, 4))

	# Size comparison
	configs = ["591 tok", "6,403 tok"]
	fp16_sizes = [73.9, 800.4]
	int8_sizes = [37.5, 406.5]
	x = np.arange(len(configs))
	w = 0.35
	ax1.bar(x - w / 2, fp16_sizes, w, label="FP16", color=COLORS["orange"], edgecolor="white")
	ax1.bar(x + w / 2, int8_sizes, w, label="INT8", color=COLORS["blue"], edgecolor="white")
	ax1.set_xticks(x)
	ax1.set_xticklabels(configs)
	ax1.set_ylabel("File Size (MB)")
	ax1.set_title("(a) .eng File Size — 1.97× Compression")
	ax1.legend()

	# Quality metrics
	metrics = ["Cosine\nSimilarity", "Margin\n(FP16)", "Margin\n(INT8)"]
	values = [0.99998, 0.381, 0.262]
	bar_colors = [COLORS["green"], COLORS["blue"], COLORS["cyan"]]
	bars = ax2.bar(metrics, values, color=bar_colors, edgecolor="white", width=0.5)
	ax2.set_ylabel("Value")
	ax2.set_title("(b) Quality Preservation")
	for bar, val in zip(bars, values):
	ax2.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
	f"{val:.5f}" if val > 0.9 else f"{val:.3f}",
	ha="center", va="bottom", fontsize=9)

	fig.suptitle("INT8 Quantization Impact", fontsize=14, y=1.02)
	fig.tight_layout()
	save_figure(fig, "fig10_int8_compression")


	# ── Figure 12: Margin Distribution ───────────────────────────────────────

	def fig12_margin_distribution() -> None:
	"""Distribution comparison: f1 vs f0+f1 summary statistics."""
	print("Fig 12: Margin distribution...")
	data = load_json(ABSOLUTE_DIR / "multifreq_comparison.json")
	results = data["results"]

	fig, ax = plt.subplots(figsize=(7, 4.5))

	# We'll show key statistics as a visualization
	combos = ["f1", "f0+f1"]
	means = [results[c]["margin_mean"] * 1000 for c in combos]
	medians = [results[c]["margin_median"] * 1000 for c in combos]
	mins = [results[c]["margin_min"] * 1000 for c in combos]

	x = np.arange(len(combos))
	w = 0.25
	ax.bar(x - w, means, w, label="Mean", color=COLORS["blue"], edgecolor="white")
	ax.bar(x, medians, w, label="Median", color=COLORS["green"], edgecolor="white")
	ax.bar(x + w, mins, w, label="Min", color=COLORS["red"], edgecolor="white")

	ax.set_xticks(x)
	ax.set_xticklabels(combos, fontsize=12)
	ax.set_ylabel("Margin (×10³)")
	ax.set_title("Margin Statistics: f1 vs f0+f1 (N=200)")
	ax.legend()
	ax.axhline(y=0, color="black", linewidth=0.5)

	# Annotate improvement
	ax.annotate(
	f"+76% mean margin\n25/28 failures fixed",
	xy=(1, means[1]), xytext=(1.3, means[1] + 1),
	arrowprops={"arrowstyle": "->", "color": COLORS["green"]},
	fontsize=9, bbox={"boxstyle": "round,pad=0.3", "facecolor": "#e6ffe6", "alpha": 0.8}
	)

	fig.tight_layout()
	save_figure(fig, "fig12_margin_distribution")


	# ── Figure 13: FCDB Stability-Discrimination Tradeoff ────────────────────

	def fig13_fcdb_tradeoff() -> None:
	"""Dual-axis: basis stability vs retrieval margin vs corpus size."""
	print("Fig 13: FCDB stability-discrimination tradeoff...")

	# Data from PAPER_TABLE.md
	n_vals = [50, 100, 125, 200]
	stability = [0.82, 0.906, 0.983, 0.999] # subspace agreement
	margin = [0.124, None, None, 0.013] # Only measured at 50 and 200
	margin_n = [50, 200]
	margin_v = [0.124, 0.013]

	fig, ax1 = plt.subplots(figsize=(7, 5))
	ax2 = ax1.twinx()

	# Stability (left axis)
	line1 = ax1.plot(n_vals, stability, "o-", color=COLORS["blue"], linewidth=2,
	markersize=8, label="Basis Stability", zorder=5)
	ax1.set_xlabel("Corpus Size N")
	ax1.set_ylabel("Subspace Agreement", color=COLORS["blue"])
	ax1.tick_params(axis="y", labelcolor=COLORS["blue"])
	ax1.set_ylim(0.7, 1.05)

	# Margin (right axis)
	line2 = ax2.plot(margin_n, margin_v, "s--", color=COLORS["orange"], linewidth=2,
	markersize=8, label="Retrieval Margin", zorder=5)
	ax2.set_ylabel("Cross-Model Margin", color=COLORS["orange"])
	ax2.tick_params(axis="y", labelcolor=COLORS["orange"])
	ax2.set_ylim(-0.01, 0.15)

	# Threshold line
	ax1.axhline(y=0.99, color=COLORS["grey"], linestyle=":", alpha=0.5)
	ax1.annotate("Stable (≥0.99)", xy=(125, 0.99), fontsize=8, color=COLORS["grey"])

	# Combined legend
	lines = line1 + line2
	labels = [l.get_label() for l in lines]
	ax1.legend(lines, labels, loc="center left")

	ax1.set_title("FCDB Stability–Discrimination Tradeoff")
	fig.tight_layout()
	save_figure(fig, "fig13_fcdb_tradeoff")


	# ── Figure 14: TTFT Speedup ─────────────────────────────────────────────

	def fig14_ttft_speedup() -> None:
	"""Grouped bar chart: cold vs warm TTFT."""
	print("Fig 14: TTFT speedup...")

	configs = ["3B / 4K tok", "3B / 16K tok", "8B / 591 tok"]
	cold_ttft = [11439, 94592, 3508] # ms
	warm_ttft = [170, 1777, 116] # ms
	speedups = [67.2, 53.2, 30.8]

	fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4.5))

	x = np.arange(len(configs))
	w = 0.35
	ax1.bar(x - w / 2, cold_ttft, w, label="Cold TTFT", color=COLORS["orange"], edgecolor="white")
	ax1.bar(x + w / 2, warm_ttft, w, label="Warm TTFT", color=COLORS["blue"], edgecolor="white")
	ax1.set_xticks(x)
	ax1.set_xticklabels(configs, fontsize=9)
	ax1.set_ylabel("TTFT (ms)")
	ax1.set_title("(a) Time to First Token")
	ax1.set_yscale("log")
	ax1.legend()

	# Speedup bars
	bars = ax2.bar(configs, speedups, color=COLORS["green"], edgecolor="white", width=0.5)
	ax2.set_ylabel("Speedup (×)")
	ax2.set_title("(b) KV Cache Restoration Speedup")
	ax2.set_xticklabels(configs, fontsize=9)
	for bar, val in zip(bars, speedups):
	ax2.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.5,
	f"{val:.1f}×", ha="center", va="bottom", fontsize=10, fontweight="bold")

	fig.suptitle("KV Cache Warm Start Performance", fontsize=14, y=1.02)
	fig.tight_layout()
	save_figure(fig, "fig14_ttft_speedup")


	# ── Figure 15: EGR Overhead Scaling ──────────────────────────────────────

	def fig15_egr_overhead() -> None:
	"""Scatter/line: EGR overhead vs token count."""
	print("Fig 15: EGR overhead scaling...")

	tokens = [600, 6403, 600]
	overhead_ms = [30.6, 48.8, 84.0]
	labels = ["16 layers\n(8-24)", "16 layers\n(8-24)", "32 layers\n(all)"]
	colors_pts = [COLORS["blue"], COLORS["blue"], COLORS["orange"]]

	fig, ax = plt.subplots(figsize=(6, 4.5))

	for t, o, l, c in zip(tokens, overhead_ms, labels, colors_pts):
	ax.scatter(t, o, s=100, color=c, zorder=5, edgecolor="white", linewidth=1.5)
	ax.annotate(l, xy=(t, o), xytext=(t + 200, o + 2), fontsize=9)

	ax.set_xlabel("Context Length (tokens)")
	ax.set_ylabel("EGR Overhead (ms)")
	ax.set_title("Fingerprint Extraction Overhead")
	ax.set_xlim(0, 7000)
	ax.set_ylim(20, 95)

	# Reference lines
	ax.axhline(y=50, color=COLORS["grey"], linestyle=":", alpha=0.3)
	ax.text(100, 51, "50ms threshold", fontsize=8, color=COLORS["grey"])

	fig.tight_layout()
	save_figure(fig, "fig15_egr_overhead")


	# ── Figure 1: Architecture Diagram (Mermaid) ────────────────────────────

	def fig01_architecture_mermaid() -> None:
	"""Generate Mermaid flowchart for system architecture."""
	print("Fig 01: Architecture diagram (Mermaid)...")
	mermaid = """\
	%%{init: {'theme': 'base', 'themeVariables': {'primaryColor': '#4477AA', 'primaryTextColor': '#fff', 'primaryBorderColor': '#335588', 'lineColor': '#666', 'secondaryColor': '#EE6677', 'tertiaryColor': '#228833'}}}%%
	flowchart TD
	A[LLM Runtime<br/>llama.cpp] -->\|KV cache blob\| B[Blob Parser]
	B -->\|Layer keys K\| C[Fourier Fingerprint<br/>f0+f1 DFT]
	C -->\|2048-dim vector\| D{Storage}
	D -->\|.eng binary\| E[EIGENGRAM File<br/>v1.2 format]
	D -->\|HNSW index\| F[FAISS IndexHNSW<br/>M=32]

	G[Query Session] -->\|New KV cache\| C
	C -->\|Query fingerprint\| H[Geodesic Retrieval]
	F -->\|Top-k candidates\| H

	H --> I{Stage 0<br/>Prior Check}
	I -->\|chronic failure\| J[Skip / LOW]
	I -->\|ok\| K{Stage 1<br/>HNSW Search}
	K -->\|HIGH / MEDIUM\| L[Result]
	K -->\|below threshold\| M{Stage 2<br/>Trajectory}
	M -->\|interpolation\| N{Stage 3<br/>Constraints}
	N --> O{Stage 4<br/>Metadata}
	O --> L

	subgraph Confidence Tracking
	P[IndexC<br/>SQLite] ---\|update\| I
	L ---\|record\| P
	end

	style A fill:#4477AA,stroke:#335588,color:#fff
	style C fill:#228833,stroke:#1a6625,color:#fff
	style E fill:#EE6677,stroke:#cc5566,color:#fff
	style F fill:#66CCEE,stroke:#55aabb,color:#000
	style H fill:#AA3377,stroke:#882266,color:#fff
	"""
	mermaid_path = FIGURES_DIR / "fig01_architecture.mmd"
	mermaid_path.write_text(mermaid)
	print(f" Saved: fig01_architecture.mmd")


	# ── Figure 11: Retrieval Pipeline (Mermaid) ──────────────────────────────

	def fig11_retrieval_pipeline_mermaid() -> None:
	"""Generate Mermaid diagram for 4-stage geodesic retrieval."""
	print("Fig 11: Retrieval pipeline (Mermaid)...")
	mermaid = """\
	%%{init: {'theme': 'base'}}%%
	flowchart LR
	Q[Query<br/>Fingerprint] --> S0

	S0[Stage 0<br/>Prior Preemption<br/><i>IndexC chronic<br/>failure check</i>]
	S0 -->\|"pass"\| S1
	S0 -->\|"preempt"\| SKIP[SKIP<br/>confidence=LOW]

	S1[Stage 1<br/>HNSW Search<br/><i>cosine top-k</i>]
	S1 -->\|"margin > 0.005"\| HIGH[HIGH<br/>199/200 docs]
	S1 -->\|"margin 0.001-0.005"\| MED[MEDIUM]
	S1 -->\|"margin < 0.001"\| S2

	S2[Stage 2<br/>Trajectory<br/><i>interpolation<br/>w=0.3</i>]
	S2 --> S3

	S3[Stage 3<br/>Negative<br/>Constraints<br/><i>apophatic layer</i>]
	S3 --> S4

	S4[Stage 4<br/>Metadata<br/>Disambig<br/><i>domain + keywords<br/>+ norms</i>]
	S4 --> LOW[LOW<br/>1/200 docs<br/><i>doc_146</i>]

	style S0 fill:#66CCEE,stroke:#55aabb
	style S1 fill:#4477AA,stroke:#335588,color:#fff
	style S2 fill:#CCBB44,stroke:#aa9933
	style S3 fill:#EE6677,stroke:#cc5566,color:#fff
	style S4 fill:#AA3377,stroke:#882266,color:#fff
	style HIGH fill:#228833,stroke:#1a6625,color:#fff
	style MED fill:#CCBB44,stroke:#aa9933
	style LOW fill:#EE6677,stroke:#cc5566,color:#fff
	style SKIP fill:#BBBBBB,stroke:#999999
	"""
	mermaid_path = FIGURES_DIR / "fig11_retrieval_pipeline.mmd"
	mermaid_path.write_text(mermaid)
	print(f" Saved: fig11_retrieval_pipeline.mmd")


	# ── Consolidated Findings JSON ───────────────────────────────────────────

	def generate_findings() -> None:
	"""Consolidate all key metrics into a single findings.json."""
	print("Generating consolidated findings...")

	findings = {
	"title": "ENGRAM Protocol — Consolidated Research Findings",
	"date": "2026-04-03",
	"hardware": {
	"platform": "Apple M3, 24GB RAM",
	"gpu": "Metal (n_gpu_layers=-1)",
	"os": "macOS Darwin 25.4.0",
	"llama_cpp": "0.3.19",
	"faiss": "1.13.2",
	"torch": "2.11.0",
	},
	"same_model_retrieval": {
	"method": "Fourier f0+f1 fingerprint",
	"corpus_size": 200,
	"n_domains": 10,
	"recall_at_1": 0.98,
	"n_failures": 4,
	"mean_margin": 0.007201,
	"margin_power_law": {"A": 0.021342, "alpha": -0.2065},
	"f1_only_recall": 0.86,
	"f1_only_failures": 28,
	"improvement_over_f1": "25/28 failures fixed (+76% mean margin)",
	"ml_math_confusion_reduction": "81.5%",
	},
	"frequency_ablation": {
	"combos_tested": 6,
	"best": "f0+f1",
	"results": {
	"f1": {"recall": 0.86, "margin": 0.004087},
	"f2": {"recall": 0.715, "margin": 0.002196},
	"f1+f2": {"recall": 0.95, "margin": 0.004744},
	"f1+f2+f3": {"recall": 0.95, "margin": 0.004129},
	"f0+f1": {"recall": 0.98, "margin": 0.007201},
	"f1+f3": {"recall": 0.89, "margin": 0.003477},
	},
	},
	"hnsw_index": {
	"speedup": 5.65,
	"recall": 0.995,
	"latency_us": 51.83,
	"bruteforce_latency_us": 293.07,
	},
	"geodesic_retrieval": {
	"stages": 4,
	"final_recall": 1.0,
	"n_high": 0,
	"n_medium": 199,
	"n_low": 1,
	"hard_failure": "doc_146 (resolved by Stage 4 metadata)",
	},
	"int8_compression": {
	"ratio": 1.97,
	"cosine_similarity": 0.99998,
	"margin_fp16": 0.381,
	"margin_int8": 0.262,
	"margin_preserved": True,
	},
	"ttft_speedup": {
	"3b_4k": {"cold_ms": 11439, "warm_ms": 170, "speedup": 67.2},
	"3b_16k": {"cold_ms": 94592, "warm_ms": 1777, "speedup": 53.2},
	"8b_591": {"cold_ms": 3508, "warm_ms": 116, "speedup": 30.8},
	},
	"cross_model_transfer": {
	"n_strategies": 9,
	"best_method": "FCDB",
	"best_margin": 0.124,
	"results": {
	"CCA": {"margin": -0.420, "correct": False},
	"Residual_FCB": {"margin": -0.382, "correct": False},
	"Procrustes": {"margin": -0.104, "correct": False},
	"RR": {"margin": -0.066, "correct": False},
	"FCB_ridge": {"margin": -0.017, "correct": False},
	"Contrastive": {"margin": 0.001, "correct": True},
	"JCB": {"margin": 0.011, "correct": True},
	"JCB_delta": {"margin": 0.037, "correct": True},
	"FCDB": {"margin": 0.124, "correct": True},
	},
	"key_insight": "Cross-model transfer requires representing documents as directions from a shared reference point (Frechet mean), not positions in space",
	},
	"fcdb_scaling": {
	"v1_n50": {"stability": 0.82, "margin": 0.124},
	"v2_n200": {"stability": 0.999, "margin": 0.013},
	"collapse_n": 100,
	"tradeoff": "Larger corpus stabilizes basis but dilutes per-document signal",
	},
	"cka_analysis": {
	"within_family": {"models": "Llama 3B ↔ 8B", "mean_cka": 0.975, "f0f1_sim": 0.875},
	"cross_family": {"models": "Llama ↔ Qwen", "mean_cka": 0.927, "f0f1_sim": 0.259},
	"verdict": "Manifolds topologically isomorphic (CKA>0.92 all pairs)",
	},
	"domain_recall": {
	"computer_science": 1.0, "general_world": 0.95, "history": 1.0,
	"language_arts": 1.0, "ml_systems": 0.90, "mathematics": 1.0,
	"philosophy": 1.0, "medicine": 0.95, "biology": 1.0, "physics": 1.0,
	},
	"eigengram_format": {
	"version": "1.2",
	"architectures": ["llama", "gemma", "gemma4/ISWA", "phi", "qwen", "mistral"],
	"iswa_support": "Gemma 4 26B dual-cache (5+25 layers, 6144-dim fingerprint)",
	},
	}

	paper_dir = RESULTS_DIR / "paper"
	paper_dir.mkdir(parents=True, exist_ok=True)
	findings_path = paper_dir / "findings.json"
	findings_path.write_text(json.dumps(findings, indent=2))
	print(f" Saved: paper/findings.json")


	# ── LaTeX Tables ─────────────────────────────────────────────────────────

	def generate_latex_tables() -> None:
	"""Generate LaTeX table source for the paper."""
	print("Generating LaTeX tables...")

	tables = r"""\
	% ──────────────────────────────────────────────────────────────────────
	% Table 1: Multi-Frequency Ablation
	% ──────────────────────────────────────────────────────────────────────
	\begin{table}[t]
	\centering
	\caption{Multi-frequency fingerprint ablation at $N=200$. The f0+f1 combination
	achieves the highest recall and mean margin, fixing 25 of 28 single-frequency failures.}
	\label{tab:frequency-ablation}
	\begin{tabular}{lcccc}
	\toprule
	Frequencies & Recall@1 & Mean Margin & Min Margin & Failures \\
	\midrule
	$f_1$ & 86.0\% & 4.09$\times 10^{-3}$ & $-4.71\times 10^{-3}$ & 28 \\
	$f_2$ & 71.5\% & 2.20$\times 10^{-3}$ & $-5.85\times 10^{-3}$ & 57 \\
	$f_1 + f_2$ & 95.0\% & 4.74$\times 10^{-3}$ & $-2.68\times 10^{-3}$ & 10 \\
	$f_1 + f_2 + f_3$ & 95.0\% & 4.13$\times 10^{-3}$ & $-2.71\times 10^{-3}$ & 10 \\
	\rowcolor{green!10}
	$f_0 + f_1$ & \textbf{98.0\%} & \textbf{7.20}$\times 10^{-3}$ & $-4.09\times 10^{-3}$ & \textbf{4} \\
	$f_1 + f_3$ & 89.0\% & 3.48$\times 10^{-3}$ & $-4.08\times 10^{-3}$ & 22 \\
	\bottomrule
	\end{tabular}
	\end{table}

	% ──────────────────────────────────────────────────────────────────────
	% Table 2: Cross-Model Transfer Strategies
	% ──────────────────────────────────────────────────────────────────────
	\begin{table}[t]
	\centering
	\caption{Cross-model transfer strategies (Llama 3B $\to$ 8B). Nine methods tested;
	FCDB achieves the only reliable positive margin without requiring an adapter.}
	\label{tab:cross-model}
	\begin{tabular}{lccc}
	\toprule
	Method & Margin & Correct & Adapter \\
	\midrule
	CCA & $-0.420$ & \xmark & symmetric \\
	Residual FCB & $-0.382$ & \xmark & none \\
	Procrustes & $-0.104$ & \xmark & orthogonal \\
	Relative Repr. & $-0.066$ & \xmark & none \\
	FCB + ridge & $-0.017$ & \xmark & ridge \\
	\midrule
	Contrastive $\delta$ & $+0.001$ & \cmark & ridge \\
	JCB & $+0.011$ & \cmark & none \\
	JCB + $\delta$ & $+0.037$ & \cmark & none \\
	\rowcolor{green!10}
	\textbf{FCDB} & $\mathbf{+0.124}$ & \cmark & \textbf{none} \\
	\bottomrule
	\end{tabular}
	\end{table}

	% ──────────────────────────────────────────────────────────────────────
	% Table 3: TTFT Speedup
	% ──────────────────────────────────────────────────────────────────────
	\begin{table}[t]
	\centering
	\caption{KV cache warm-start performance. TTFT speedup ranges from 27--67$\times$
	depending on model size and context length.}
	\label{tab:ttft}
	\begin{tabular}{lccccc}
	\toprule
	Model & Tokens & Cold TTFT & Warm TTFT & Speedup & EGR (ms) \\
	\midrule
	Llama 3.2 3B & 4,002 & 11,439\,ms & 170\,ms & 67.2$\times$ & 9.5 \\
	Llama 3.2 3B & 16,382 & 94,592\,ms & 1,777\,ms & 53.2$\times$ & 9.5 \\
	Llama 3.1 8B & 591 & 3,508\,ms & 116\,ms & 30.8$\times$ & 30.6 \\
	\bottomrule
	\end{tabular}
	\end{table}

	% ──────────────────────────────────────────────────────────────────────
	% Table 4: INT8 Compression
	% ──────────────────────────────────────────────────────────────────────
	\begin{table}[t]
	\centering
	\caption{INT8 quantization results. Per-row symmetric quantization achieves
	1.97$\times$ compression with negligible quality loss (cos\_sim = 0.99998).}
	\label{tab:int8}
	\begin{tabular}{lcccc}
	\toprule
	Tokens & FP16 Size & INT8 Size & Ratio & $\cos(s_\text{fp16}, s_\text{int8})$ \\
	\midrule
	591 & 73.9\,MB & 37.5\,MB & 1.97$\times$ & 0.99998 \\
	6,403 & 800.4\,MB & 406.5\,MB & 1.97$\times$ & 0.99998 \\
	\bottomrule
	\end{tabular}
	\end{table}

	% ──────────────────────────────────────────────────────────────────────
	% Table 5: CKA Analysis
	% ──────────────────────────────────────────────────────────────────────
	\begin{table}[t]
	\centering
	\caption{Centered Kernel Alignment (CKA) between model families. High CKA values
	($>0.92$) confirm topological isomorphism of key manifolds across architectures.}
	\label{tab:cka}
	\begin{tabular}{lccc}
	\toprule
	Comparison & Mean CKA & f0+f1 Sim & Verdict \\
	\midrule
	Within-family (Llama 3B $\leftrightarrow$ 8B) & 0.975 & 0.875 & Isomorphic \\
	Cross-family (Llama $\leftrightarrow$ Qwen) & 0.927 & 0.259 & Isomorphic \\
	\bottomrule
	\end{tabular}
	\end{table}

	% ──────────────────────────────────────────────────────────────────────
	% Table 6: HNSW Benchmark
	% ──────────────────────────────────────────────────────────────────────
	\begin{table}[t]
	\centering
	\caption{HNSW index performance at $N=200$. The index provides 5.65$\times$
	speedup over brute-force with no recall loss.}
	\label{tab:hnsw}
	\begin{tabular}{lcc}
	\toprule
	Method & Latency ($\mu$s) & Recall@1 \\
	\midrule
	Brute-force & 293.1 & 99.5\% \\
	HNSW ($M=32$) & 51.8 & 99.5\% \\
	\midrule
	\textbf{Speedup} & \textbf{5.65$\times$} & --- \\
	\bottomrule
	\end{tabular}
	\end{table}

	% ──────────────────────────────────────────────────────────────────────
	% Table 7: Domain Recall
	% ──────────────────────────────────────────────────────────────────────
	\begin{table}[t]
	\centering
	\caption{Per-domain recall@1 with f0+f1 fingerprint at $N=200$.
	All domains achieve $\geq 90\%$ recall.}
	\label{tab:domain-recall}
	\begin{tabular}{lc}
	\toprule
	Domain & Recall@1 \\
	\midrule
	Biology & 100.0\% \\
	Computer Science & 100.0\% \\
	History & 100.0\% \\
	Language Arts & 100.0\% \\
	Mathematics & 100.0\% \\
	Philosophy & 100.0\% \\
	Physics & 100.0\% \\
	General World & 95.0\% \\
	Medicine & 95.0\% \\
	ML/Systems & 90.0\% \\
	\bottomrule
	\end{tabular}
	\end{table}

	% ──────────────────────────────────────────────────────────────────────
	% Table 8: Margin Power Law
	% ──────────────────────────────────────────────────────────────────────
	\begin{table}[t]
	\centering
	\caption{Margin scaling law parameters. Both fingerprint methods follow
	power-law decay $\bar{m} = A \cdot N^\alpha$ with no hard collapse point.}
	\label{tab:power-law}
	\begin{tabular}{lccc}
	\toprule
	Fingerprint & $A$ & $\alpha$ & Recall@200 \\
	\midrule
	$f_1$ & 0.0181 & $-0.277$ & 86.0\% \\
	$f_0 + f_1$ & 0.0213 & $-0.207$ & 98.0\% \\
	\bottomrule
	\end{tabular}
	\end{table}
	"""

	paper_dir = RESULTS_DIR / "paper"
	paper_dir.mkdir(parents=True, exist_ok=True)
	tables_path = paper_dir / "tables.tex"
	tables_path.write_text(tables)
	print(f" Saved: paper/tables.tex")


	# ── Registry ─────────────────────────────────────────────────────────────

	FIGURE_REGISTRY: dict[str, tuple[str, object]] = {
	"fig01": ("System Architecture (Mermaid)", fig01_architecture_mermaid),
	"fig02": ("Frequency Combination Comparison", fig02_frequency_comparison),
	"fig03": ("Margin Power Law", fig03_margin_power_law),
	"fig04": ("Recall vs N (Fourier vs FCDB)", fig04_recall_vs_n),
	"fig05": ("Cross-Model Strategy Comparison", fig05_cross_model_strategies),
	"fig06": ("CKA Layer Similarity", fig06_cka_layers),
	"fig07": ("Domain Confusion Matrix", fig07_confusion_matrix),
	"fig08": ("Domain Recall Radar", fig08_domain_recall_radar),
	"fig09": ("HNSW Benchmark", fig09_hnsw_benchmark),
	"fig10": ("INT8 Compression", fig10_int8_compression),
	"fig11": ("Retrieval Pipeline (Mermaid)", fig11_retrieval_pipeline_mermaid),
	"fig12": ("Margin Distribution", fig12_margin_distribution),
	"fig13": ("FCDB Tradeoff", fig13_fcdb_tradeoff),
	"fig14": ("TTFT Speedup", fig14_ttft_speedup),
	"fig15": ("EGR Overhead Scaling", fig15_egr_overhead),
	"findings": ("Consolidated Findings JSON", generate_findings),
	"tables": ("LaTeX Tables", generate_latex_tables),
	}


	def main() -> None:
	parser = argparse.ArgumentParser(description="Generate ENGRAM paper figures")
	parser.add_argument("--only", help="Generate only this figure (e.g., fig02)")
	parser.add_argument("--list", action="store_true", help="List all figures")
	args = parser.parse_args()

	if args.list:
	print("\nAvailable figures:")
	for key, (desc, _) in FIGURE_REGISTRY.items():
	print(f" {key:10s} {desc}")
	return

	FIGURES_DIR.mkdir(parents=True, exist_ok=True)
	print(f"\nOutput directory: {FIGURES_DIR}\n")

	if args.only:
	if args.only not in FIGURE_REGISTRY:
	print(f"Unknown figure: {args.only}")
	print(f"Available: {', '.join(FIGURE_REGISTRY.keys())}")
	sys.exit(1)
	desc, func = FIGURE_REGISTRY[args.only]
	func()
	else:
	for key, (desc, func) in FIGURE_REGISTRY.items():
	try:
	func()
	except Exception as e:
	print(f" ERROR generating {key}: {e}")

	print(f"\nDone. Figures saved to: {FIGURES_DIR}")


	if __name__ == "__main__":
	main()