TaoNet-mini-T2 / code /Taotern_SSM /scripts /generate_gamma_benchmark_notebook.py

Add files using upload-large-folder tool

388fd6e verified 15 days ago

52.2 kB

	import json
	from pathlib import Path


	def md(text: str) -> dict:
	    """Build a Jupyter markdown cell dictionary from a block of text.

	    Leading and trailing newline characters are stripped from ``text`` (other
	    whitespace is kept) and the remainder is split into lines that retain
	    their line endings, which is the list form stored under ``"source"``.

	    Args:
	        text: Markdown content, typically a triple-quoted literal.

	    Returns:
	        A dict with ``cell_type``, ``metadata`` and ``source`` keys.
	    """
	    source_lines = text.strip("\n").splitlines(keepends=True)
	    return {
	        "cell_type": "markdown",
	        "metadata": {},
	        "source": source_lines,
	    }


	def code(text: str) -> dict:
	    """Build a Jupyter code cell dictionary with no outputs.

	    Leading and trailing newline characters are stripped from ``text`` and the
	    remainder is split into lines that retain their line endings. The cell is
	    created unexecuted: ``execution_count`` is ``None`` and ``outputs`` is a
	    fresh empty list on every call.

	    Args:
	        text: Source code for the cell, typically a raw triple-quoted literal.

	    Returns:
	        A dict with ``cell_type``, ``execution_count``, ``metadata``,
	        ``outputs`` and ``source`` keys.
	    """
	    source_lines = text.strip("\n").splitlines(keepends=True)
	    return {
	        "cell_type": "code",
	        "execution_count": None,
	        "metadata": {},
	        "outputs": [],
	        "source": source_lines,
	    }


	# Relative .ipynb paths for the quick and the research benchmark notebooks.
	# NOTE(review): presumably the write targets of this script; the code that
	# consumes these constants is outside the visible part of the file.
	QUICK_NOTEBOOK_PATH = Path("output/jupyter-notebook/gamma-s4-sinewave-benchmark.ipynb")
	RESEARCH_NOTEBOOK_PATH = Path("output/jupyter-notebook/gamma-s4-research-benchmark.ipynb")


	def setup_cell():
	    """Return the code cell that prepares the repository for the notebook.

	    The emitted cell:

	    * detects Google Colab via ``"google.colab" in sys.modules``;
	    * on Colab, obtains a GitHub token (environment variables, then Colab
	      ``userdata``, then an interactive ``getpass`` prompt), clones the
	      repository into ``/content`` or hard-resets an existing checkout to
	      ``origin/main``, and changes the working directory to it;
	    * otherwise uses the current working directory as the repository;
	    * puts the repository on ``sys.path`` and evicts already-imported
	      ``gamma_space_model``, ``csrc`` and ``tilelang`` modules so that a
	      re-run picks up fresh code;
	    * prints the short commit hash on a best-effort basis.

	    NOTE(review): the token is embedded in the clone URL passed to
	    ``git clone``; consider a credential helper if that is a concern.

	    Returns:
	        The code-cell dictionary produced by ``code``.
	    """
	    return code(
	        r"""
	import os
	import sys
	import subprocess
	import importlib
	from pathlib import Path

	IN_COLAB = "google.colab" in sys.modules
	REPO_DIR = Path.cwd()

	if IN_COLAB:
	    print("Running in Google Colab")
	    from google.colab import userdata
	    from getpass import getpass

	    REPO_NAME = "gamma_ssm_s4_v2"
	    REPO_DIR = Path("/content") / REPO_NAME
	    GITHUB_REPO = "StarMists/gamma_SSM_S4_enhanced"

	    token = os.environ.get("GITHUB_TOKEN") or os.environ.get("GH_TOKEN")
	    if not token:
	        try:
	            token = userdata.get("GITHUB_TOKEN")
	        except Exception:
	            token = None
	    if not token:
	        token = getpass("GitHub personal access token for private repo access: ").strip()

	    clone_url = f"https://{token}@github.com/{GITHUB_REPO}.git"
	    if REPO_DIR.exists():
	        subprocess.run(["git", "-C", str(REPO_DIR), "fetch", "origin"], check=True)
	        subprocess.run(["git", "-C", str(REPO_DIR), "checkout", "main"], check=True)
	        subprocess.run(["git", "-C", str(REPO_DIR), "reset", "--hard", "origin/main"], check=True)
	    else:
	        subprocess.run(["git", "clone", clone_url, str(REPO_DIR)], check=True)

	    os.chdir(REPO_DIR)
	    sys.path.insert(0, str(REPO_DIR))
	else:
	    print("Running locally from", REPO_DIR)
	    sys.path.insert(0, str(REPO_DIR))

	importlib.invalidate_caches()
	for name in list(sys.modules):
	    if (
	        name == "gamma_space_model"
	        or name.startswith("gamma_space_model.")
	        or name == "csrc"
	        or name.startswith("csrc.")
	        or name == "tilelang"
	        or name.startswith("tilelang.")
	    ):
	        del sys.modules[name]

	try:
	    commit = subprocess.check_output(["git", "-C", str(REPO_DIR), "rev-parse", "--short", "HEAD"], text=True).strip()
	    print("Repo commit:", commit)
	except (subprocess.CalledProcessError, OSError):
	    # Best effort only: git failed here or is not installed.
	    print("Repo commit: unavailable")
	"""
	    )


	def imports_cell():
	    """Return the code cell with shared imports and runtime globals.

	    The emitted cell imports the numeric/plotting stack and the four block
	    classes from ``gamma_space_model``, seeds ``random``, NumPy and PyTorch
	    with ``SEED = 7``, and defines the globals the later cells rely on:
	    ``DEVICE``, ``USE_AMP`` (true only on CUDA), a gradient ``scaler`` (using
	    ``torch.amp.GradScaler`` when available, otherwise
	    ``torch.cuda.amp.GradScaler``) and a ``synchronize()`` helper that calls
	    ``torch.cuda.synchronize()`` only when running on CUDA.

	    Returns:
	        The code-cell dictionary produced by ``code``.
	    """
	    return code(
	        r"""
	import math
	import random
	import time
	import urllib.request

	import matplotlib.pyplot as plt
	import numpy as np
	import pandas as pd
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from torch.utils.data import DataLoader, TensorDataset

	from gamma_space_model import GammaSingleBlock, GammaS4Block, GammaS4MinimalBlock, S4TernaryDPLRBlock

	SEED = 7
	random.seed(SEED)
	np.random.seed(SEED)
	torch.manual_seed(SEED)
	if torch.cuda.is_available():
	    torch.cuda.manual_seed_all(SEED)

	DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	USE_AMP = DEVICE.type == "cuda"

	if hasattr(torch, "amp") and hasattr(torch.amp, "GradScaler"):
	    scaler = torch.amp.GradScaler("cuda", enabled=USE_AMP)
	else:
	    scaler = torch.cuda.amp.GradScaler(enabled=USE_AMP)


	def synchronize():
	    if DEVICE.type == "cuda":
	        torch.cuda.synchronize()


	print("Device:", DEVICE)
	print("Deployment cache available on GammaS4Block:", hasattr(GammaS4Block, "allocate_deployment_cache"))
	if DEVICE.type != "cuda":
	    print("WARNING: running on CPU. Treat speed numbers as smoke-test only, not main benchmark evidence.")
	"""
	    )


	def shared_helpers_cell():
	    """Return the code cell defining helpers shared by both notebooks.

	    The emitted cell defines:

	    * ``make_forecasting_split`` - synthetic multi-channel next-step
	      forecasting data returned as a ``TensorDataset`` of (input, target)
	      sequences shifted by one step;
	    * ``SequenceForecaster`` - input projection, a stack of blocks, output
	      projection;
	    * ``build_forecasting_model`` - maps a model ``kind`` string to the
	      matching block class and moves the model to ``DEVICE``;
	    * ``profile_train_step`` - times forward, backward and optimizer phases
	      of a single training step;
	    * ``run_epoch`` - one training or evaluation pass (autograd is disabled
	      when no optimizer is given);
	    * ``benchmark_inference`` - times the full-sequence forward pass and the
	      step-by-step recurrent paths with the inference, deployment and
	      balanced-deployment caches, all under ``torch.no_grad()``;
	    * ``show_benchmark_tables`` - displays the results split into three
	      narrower tables.

	    The cell relies on names created by the imports cell (``np``, ``torch``,
	    ``F``, ``DEVICE``, ``USE_AMP``, ``scaler``, ``synchronize`` and the block
	    classes) and on the notebook-provided ``display``.

	    Returns:
	        The code-cell dictionary produced by ``code``.
	    """
	    return code(
	        r"""
	def make_forecasting_split(config, split, seed):
	    rng = np.random.default_rng(seed)
	    count = config["train_samples"] if split == "train" else config["val_samples"]
	    seq_len = config["seq_len"]
	    features = config["features"]
	    complexity = config["complexity"]
	    t = np.linspace(0.0, 1.0, seq_len + 1, dtype=np.float32)
	    data = np.zeros((count, seq_len + 1, features), dtype=np.float32)

	    for i in range(count):
	        phase = rng.uniform(0.0, 2.0 * np.pi)
	        chirp = np.sin(2.0 * np.pi * (1.0 + 1.75 * t**2) * rng.uniform(0.9, 1.15) + phase)
	        slow = np.sin(2.0 * np.pi * (0.4 + 0.25 * complexity) * t + 0.7 * phase)
	        medium = np.sin(2.0 * np.pi * (2.5 + complexity) * t + 1.1 * phase)
	        fast = np.cos(2.0 * np.pi * (5.0 + 2.0 * complexity) * t + 1.5 * phase)
	        bursts = (np.sin(2.0 * np.pi * (3.0 + complexity) * t + 0.4 * phase) > 0.8).astype(np.float32)
	        bursts = bursts * np.sin(2.0 * np.pi * (10.0 + complexity) * t + 0.3 * phase)
	        delayed = np.roll(chirp, 4 * complexity)
	        modulated = medium * (1.0 + 0.35 * slow)
	        components = [chirp, slow, medium, fast, bursts, delayed, modulated]

	        for channel in range(features):
	            weights = rng.normal(0.0, 1.0, size=len(components)).astype(np.float32)
	            weights[: 2 + complexity] *= 1.2
	            signal = sum(w * c for w, c in zip(weights, components))
	            if complexity >= 2:
	                signal += 0.20 * np.tanh(np.roll(signal, channel + 1))
	            if complexity >= 3:
	                signal += 0.10 * np.sin(signal * (0.5 + 0.08 * channel))
	            signal += rng.normal(0.0, 0.03 + 0.01 * complexity, size=seq_len + 1).astype(np.float32)
	            data[i, :, channel] = signal

	    # Standardise every sample/channel along the time axis.
	    data -= data.mean(axis=1, keepdims=True)
	    data /= data.std(axis=1, keepdims=True) + 1e-5
	    return TensorDataset(torch.from_numpy(data[:, :-1, :]), torch.from_numpy(data[:, 1:, :]))


	class SequenceForecaster(nn.Module):
	    def __init__(self, input_dim, model_dim, layers, block_factory):
	        super().__init__()
	        self.in_proj = nn.Linear(input_dim, model_dim)
	        self.layers = nn.ModuleList([block_factory(model_dim) for _ in range(layers)])
	        self.out_proj = nn.Linear(model_dim, input_dim)

	    def forward(self, x):
	        x = self.in_proj(x)
	        for layer in self.layers:
	            x, _ = layer(x, state=None, return_state=False)
	        return self.out_proj(x), None


	def build_forecasting_model(kind, config, overrides=None):
	    overrides = overrides or {}
	    d_model = config["d_model"]
	    hidden_dim = config["hidden_dim"]
	    num_layers = config["num_layers"]
	    input_dim = config["features"]

	    if kind == "gamma_baseline":
	        block_factory = lambda width: GammaSingleBlock(
	            d_model=width,
	            hidden_dim=hidden_dim,
	            dropout=0.0,
	        )
	    elif kind == "gamma_s4_enhanced":
	        block_factory = lambda width: GammaS4Block(
	            d_model=width,
	            hidden_dim=hidden_dim,
	            kernel_mode=overrides.get("kernel_mode", "auto"),
	            kernel_threshold=overrides.get("kernel_threshold", 384),
	            discretization=overrides.get("discretization", "bilinear"),
	            gate=overrides.get("gate", True),
	            input_gate=overrides.get("input_gate", True),
	            activation=overrides.get("activation", "gelu"),
	            use_D=overrides.get("use_D", True),
	            layer_scale_init=overrides.get("layer_scale_init", 0.1),
	        )
	    elif kind == "gamma_s4_minimal":
	        block_factory = lambda width: GammaS4MinimalBlock(
	            d_model=width,
	            hidden_dim=hidden_dim,
	            kernel_mode=overrides.get("kernel_mode", "auto"),
	            kernel_threshold=overrides.get("kernel_threshold", 384),
	            discretization=overrides.get("discretization", "bilinear"),
	            use_D=overrides.get("use_D", True),
	        )
	    elif kind == "s4_ternary_dplr_ssm":
	        block_factory = lambda width: S4TernaryDPLRBlock(
	            d_model=width,
	            hidden_dim=hidden_dim,
	            rank=overrides.get("rank", 1),
	            kernel_mode=overrides.get("kernel_mode", "auto"),
	            kernel_threshold=overrides.get("kernel_threshold", 256),
	            gate=overrides.get("gate", True),
	            input_gate=overrides.get("input_gate", True),
	            activation=overrides.get("activation", "gelu"),
	            use_D=overrides.get("use_D", True),
	            layer_scale_init=overrides.get("layer_scale_init", 0.1),
	        )
	    else:
	        raise ValueError(kind)

	    return SequenceForecaster(input_dim, d_model, num_layers, block_factory).to(DEVICE)


	def profile_train_step(model, batch_x, batch_y):
	    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
	    model.train()
	    batch_x = batch_x.to(DEVICE)
	    batch_y = batch_y.to(DEVICE)
	    optimizer.zero_grad(set_to_none=True)

	    synchronize()
	    t0 = time.perf_counter()
	    with torch.autocast(device_type=DEVICE.type, enabled=USE_AMP):
	        pred, _ = model(batch_x)
	        loss = F.mse_loss(pred, batch_y)
	    synchronize()
	    t1 = time.perf_counter()
	    scaler.scale(loss).backward()
	    synchronize()
	    t2 = time.perf_counter()
	    scaler.step(optimizer)
	    scaler.update()
	    synchronize()
	    t3 = time.perf_counter()

	    return {
	        "forward_ms": 1000.0 * (t1 - t0),
	        "backward_ms": 1000.0 * (t2 - t1),
	        "optimizer_ms": 1000.0 * (t3 - t2),
	        "loss": float(loss.detach().cpu()),
	    }


	def run_epoch(model, loader, optimizer=None):
	    training = optimizer is not None
	    model.train(training)
	    losses = []
	    synchronize()
	    start = time.perf_counter()

	    # Evaluation passes do not need autograd bookkeeping.
	    with torch.set_grad_enabled(training):
	        for batch_x, batch_y in loader:
	            batch_x = batch_x.to(DEVICE)
	            batch_y = batch_y.to(DEVICE)
	            if training:
	                optimizer.zero_grad(set_to_none=True)
	            with torch.autocast(device_type=DEVICE.type, enabled=USE_AMP):
	                pred, _ = model(batch_x)
	                loss = F.mse_loss(pred, batch_y)
	            if training:
	                scaler.scale(loss).backward()
	                scaler.step(optimizer)
	                scaler.update()
	            losses.append(loss.detach().item())

	    synchronize()
	    return float(np.mean(losses)), time.perf_counter() - start


	@torch.no_grad()
	def benchmark_inference(model, sample_x):
	    model.eval()
	    sample_x = sample_x.to(DEVICE)
	    batch_size, seq_len = sample_x.size(0), sample_x.size(1)
	    tokens = batch_size * seq_len
	    nan = float("nan")

	    # Warm-up passes so one-off setup cost is not timed.
	    for _ in range(2):
	        _ = model(sample_x)
	    synchronize()

	    t0 = time.perf_counter()
	    pred, _ = model(sample_x)
	    synchronize()
	    full_latency = time.perf_counter() - t0

	    def run_recurrent(allocator_names):
	        # Step through the sequence one token at a time. For each layer that
	        # has an ``ssm`` the first allocator in ``allocator_names`` that the
	        # layer provides builds its cache; otherwise no cache is passed.
	        hidden = model.in_proj(sample_x)
	        states, caches, outputs = [], [], []
	        synchronize()
	        cache_start = time.perf_counter()
	        for layer in model.layers:
	            ssm = getattr(layer, "ssm", None)
	            if ssm is None:
	                states.append(None)
	                caches.append(None)
	                continue
	            states.append(ssm.init_state(batch_size, DEVICE, hidden.dtype))
	            allocator = next(
	                (getattr(layer, name) for name in allocator_names if hasattr(layer, name)),
	                None,
	            )
	            if allocator is None:
	                caches.append(None)
	            else:
	                caches.append(allocator(batch_size, seq_len, DEVICE, hidden.dtype))
	        synchronize()
	        cache_elapsed = time.perf_counter() - cache_start

	        synchronize()
	        start = time.perf_counter()
	        for step in range(seq_len):
	            new_outputs = hidden[:, step, :]
	            for idx, layer in enumerate(model.layers):
	                if caches[idx] is None:
	                    new_outputs, states[idx] = layer.step(new_outputs, states[idx])
	                else:
	                    new_outputs, states[idx] = layer.step(new_outputs, states[idx], cache=caches[idx])
	            outputs.append(new_outputs)
	        recurrent_out = model.out_proj(torch.stack(outputs, dim=1))
	        synchronize()
	        elapsed = time.perf_counter() - start
	        return cache_elapsed, elapsed, recurrent_out

	    def match_mse(candidate):
	        if candidate is None:
	            return nan
	        return float(F.mse_loss(candidate, pred).detach().cpu())

	    cache_setup, recurrent_latency, recurrent = run_recurrent(("allocate_inference_cache",))

	    cache_setup_lightweight = lightweight_latency = lightweight_tokens_per_s = nan
	    recurrent_light = None
	    deploy_supported = any(hasattr(layer, "allocate_deployment_cache") for layer in model.layers)
	    if deploy_supported:
	        cache_setup_lightweight, lightweight_latency, recurrent_light = run_recurrent(
	            ("allocate_deployment_cache", "allocate_inference_cache")
	        )
	        lightweight_tokens_per_s = tokens / max(lightweight_latency, 1e-9)

	    cache_setup_balanced = balanced_latency = balanced_tokens_per_s = nan
	    recurrent_balanced = None
	    balanced_supported = any(hasattr(layer, "allocate_balanced_deployment_cache") for layer in model.layers)
	    if balanced_supported:
	        cache_setup_balanced, balanced_latency, recurrent_balanced = run_recurrent(
	            ("allocate_balanced_deployment_cache", "allocate_deployment_cache", "allocate_inference_cache")
	        )
	        balanced_tokens_per_s = tokens / max(balanced_latency, 1e-9)

	    return {
	        "full_latency_ms": 1000.0 * full_latency,
	        "full_tokens_per_s": tokens / max(full_latency, 1e-9),
	        "cache_setup_ms": 1000.0 * cache_setup,
	        "recurrent_latency_ms": 1000.0 * recurrent_latency,
	        "recurrent_tokens_per_s": tokens / max(recurrent_latency, 1e-9),
	        "recurrent_match_mse": match_mse(recurrent),
	        "deploy_supported": deploy_supported,
	        "deploy_cache_setup_ms": 1000.0 * cache_setup_lightweight,
	        "deploy_recurrent_latency_ms": 1000.0 * lightweight_latency,
	        "deploy_recurrent_tokens_per_s": lightweight_tokens_per_s,
	        "deploy_match_mse": match_mse(recurrent_light),
	        "balanced_deploy_supported": balanced_supported,
	        "balanced_deploy_cache_setup_ms": 1000.0 * cache_setup_balanced,
	        "balanced_deploy_recurrent_latency_ms": 1000.0 * balanced_latency,
	        "balanced_deploy_recurrent_tokens_per_s": balanced_tokens_per_s,
	        "balanced_deploy_match_mse": match_mse(recurrent_balanced),
	        "prediction": pred.detach().cpu(),
	        "recurrent_prediction": recurrent.detach().cpu(),
	    }


	def show_benchmark_tables(df, title="Benchmark"):
	    if df.empty:
	        display(df)
	        return

	    def available(columns):
	        return [col for col in columns if col in df.columns]

	    ordered = df.sort_values(["task", "kind"]).reset_index(drop=True)
	    normal_cols = available([
	        "task",
	        "kind",
	        "params",
	        "train_loss",
	        "val_loss",
	        "mean_epoch_time_s",
	        "expected_full_mode",
	        "forward_ms",
	        "backward_ms",
	        "optimizer_ms",
	        "full_latency_ms",
	        "full_tokens_per_s",
	        "cache_setup_ms",
	        "recurrent_latency_ms",
	        "recurrent_tokens_per_s",
	        "recurrent_match_mse",
	    ])
	    deploy_cols = available([
	        "task",
	        "kind",
	        "deploy_supported",
	        "deploy_cache_setup_ms",
	        "deploy_recurrent_latency_ms",
	        "deploy_recurrent_tokens_per_s",
	        "deploy_match_mse",
	    ])
	    balanced_cols = available([
	        "task",
	        "kind",
	        "balanced_deploy_supported",
	        "balanced_deploy_cache_setup_ms",
	        "balanced_deploy_recurrent_latency_ms",
	        "balanced_deploy_recurrent_tokens_per_s",
	        "balanced_deploy_match_mse",
	    ])

	    print(f"{title}: normal/full-sequence and full recurrent")
	    display(ordered[normal_cols])
	    print(f"{title}: deployment-lite recurrent")
	    display(ordered[deploy_cols])
	    print(f"{title}: balanced deployment recurrent")
	    display(ordered[balanced_cols])
	"""
	    )


	def quick_notebook():
	    """Return the ordered list of cells for the quick benchmark notebook.

	    The notebook consists of an introduction, the setup and imports cells,
	    the task/model configuration (``QUICK_TASKS``, ``MODEL_OVERRIDES``,
	    ``ACTIVE_TASKS``, ``MODELS``), the shared helpers, a
	    ``train_and_benchmark`` function, the experiment loop that fills
	    ``summary_df``, a 2x2 metric plot, per-model prediction plots against the
	    baseline for the last active task, and closing reading notes.

	    Returns:
	        A list of cell dictionaries built with ``md``, ``code`` and the
	        shared cell builders.
	    """
	    cells = [
	        md(
	            """
	# Gamma Baseline vs Gamma S4 Enhanced Quick Benchmark

	This is the fast notebook for day-to-day iteration.

	It is intentionally narrow:
	- compare `gamma_baseline`, `gamma_s4_enhanced`, and `s4_ternary_dplr_ssm`
	- use practical sequence lengths
	- keep `kernel_mode` conservative
	- report training speed, inference speed, and one-step profiling
	"""
	        ),
	        setup_cell(),
	        imports_cell(),
	        code(
	            r"""
	QUICK_TASKS = {
	    "simple": dict(seq_len=192, features=4, train_samples=256, val_samples=64, epochs=4, batch_size=32, d_model=48, hidden_dim=64, num_layers=2, complexity=1),
	    "moderate": dict(seq_len=320, features=6, train_samples=320, val_samples=80, epochs=5, batch_size=24, d_model=64, hidden_dim=96, num_layers=2, complexity=2),
	}

	MODEL_OVERRIDES = {
	    "gamma_baseline": {},
	    "gamma_s4_enhanced": {
	        "kernel_mode": "auto",
	        "kernel_threshold": 384,
	        "discretization": "bilinear",
	        "gate": True,
	        "input_gate": True,
	        "activation": "gelu",
	        "use_D": True,
	        "layer_scale_init": 0.1,
	    },
	    "s4_ternary_dplr_ssm": {
	        "kernel_mode": "auto",
	        "kernel_threshold": 256,
	        "rank": 1,
	        "gate": True,
	        "input_gate": True,
	        "activation": "gelu",
	        "use_D": True,
	        "layer_scale_init": 0.1,
	    },
	}

	ACTIVE_TASKS = ["simple", "moderate"]
	MODELS = ["gamma_baseline", "gamma_s4_enhanced", "s4_ternary_dplr_ssm"]
	"""
	        ),
	        shared_helpers_cell(),
	        code(
	            r"""
	def train_and_benchmark(task_name, kind):
	    cfg = QUICK_TASKS[task_name]
	    train_ds = make_forecasting_split(cfg, "train", seed=SEED + 11)
	    val_ds = make_forecasting_split(cfg, "val", seed=SEED + 29)
	    train_loader = DataLoader(train_ds, batch_size=cfg["batch_size"], shuffle=True)
	    val_loader = DataLoader(val_ds, batch_size=cfg["batch_size"], shuffle=False)

	    model = build_forecasting_model(kind, cfg, overrides=MODEL_OVERRIDES.get(kind))
	    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-3, weight_decay=1e-4)
	    history = {"train_loss": [], "val_loss": [], "epoch_time_s": []}

	    # Profile on a separate, freshly built model so the trained one is untouched.
	    first_batch_x, first_batch_y = next(iter(train_loader))
	    profile = profile_train_step(build_forecasting_model(kind, cfg, overrides=MODEL_OVERRIDES.get(kind)), first_batch_x, first_batch_y)

	    for epoch in range(cfg["epochs"]):
	        train_loss, epoch_time = run_epoch(model, train_loader, optimizer=optimizer)
	        val_loss, _ = run_epoch(model, val_loader)
	        history["train_loss"].append(train_loss)
	        history["val_loss"].append(val_loss)
	        history["epoch_time_s"].append(epoch_time)
	        print(f"{task_name} | {kind} | epoch={epoch+1:02d} train={train_loss:.6f} val={val_loss:.6f}")

	    sample_x, sample_y = next(iter(val_loader))
	    sample_x = sample_x[:2]
	    sample_y = sample_y[:2]
	    inf = benchmark_inference(model, sample_x)

	    metrics = {
	        "task": task_name,
	        "kind": kind,
	        "params": sum(p.numel() for p in model.parameters()),
	        "train_loss": history["train_loss"][-1],
	        "val_loss": history["val_loss"][-1],
	        "mean_epoch_time_s": float(np.mean(history["epoch_time_s"])),
	        **profile,
	        "sample_target": sample_y.cpu(),
	        **inf,
	    }
	    return metrics, history, model
	"""
	        ),
	        md(
	            """
	## Run the Quick Experiment

	This is the cell you will usually run in Colab first.
	"""
	        ),
	        code(
	            r"""
	all_metrics = []
	histories = {}
	trained_models = {}

	for task_name in ACTIVE_TASKS:
	    for kind in MODELS:
	        metrics, history, model = train_and_benchmark(task_name, kind)
	        all_metrics.append({k: v for k, v in metrics.items() if k not in {"sample_target", "prediction", "recurrent_prediction"}})
	        histories[(task_name, kind)] = history
	        trained_models[(task_name, kind)] = metrics

	summary_df = pd.DataFrame(all_metrics).sort_values(["task", "val_loss"]).reset_index(drop=True)
	show_benchmark_tables(summary_df, title="Quick benchmark")
	"""
	        ),
	        code(
	            r"""
	fig, axes = plt.subplots(2, 2, figsize=(14, 8))
	metrics_to_plot = ["val_loss", "mean_epoch_time_s", "full_tokens_per_s", "recurrent_tokens_per_s"]
	for ax, metric in zip(axes.flatten(), metrics_to_plot):
	    pivot = summary_df.pivot(index="task", columns="kind", values=metric).loc[ACTIVE_TASKS]
	    pivot.plot(ax=ax, marker="o")
	    ax.set_title(metric.replace("_", " ").title())
	    ax.grid(alpha=0.2)
	plt.tight_layout()
	plt.show()
	"""
	        ),
	        md(
	            """
	The benchmark tables above are intentionally split into normal, deployment-lite, and balanced deployment views so the important columns remain visible in Colab.
	"""
	        ),
	        code(
	            r"""
	PLOT_TASK = ACTIVE_TASKS[-1]
	baseline = trained_models[(PLOT_TASK, "gamma_baseline")]
	target = baseline["sample_target"][0].numpy()
	baseline_pred = baseline["prediction"][0].numpy()
	comparison_kinds = [kind for kind in MODELS if kind != "gamma_baseline"]

	for compare_kind in comparison_kinds:
	    candidate = trained_models[(PLOT_TASK, compare_kind)]
	    candidate_pred = candidate["prediction"][0].numpy()
	    channels = range(min(3, target.shape[-1]))
	    time_axis = np.arange(target.shape[0])
	    fig, axes = plt.subplots(len(list(channels)), 1, figsize=(12, 3.5 * len(list(channels))), sharex=True)
	    if target.shape[-1] == 1:
	        axes = [axes]
	    for row, channel in enumerate(channels):
	        ax = axes[row]
	        ax.plot(time_axis, target[:, channel], label="ground truth", linewidth=2)
	        ax.plot(time_axis, baseline_pred[:, channel], label="baseline", alpha=0.9)
	        ax.plot(time_axis, candidate_pred[:, channel], label=compare_kind, alpha=0.9)
	        ax.set_title(f"{PLOT_TASK} task, channel {channel}")
	        ax.grid(alpha=0.2)
	        if row == 0:
	            ax.legend()
	    plt.tight_layout()
	    plt.show()
	"""
	        ),
	        md(
	            """
	## Reading This Notebook

	Use this notebook for fast feedback:
	- if either S4-style model loses badly here, do not trust it on bigger tasks
	- if one of them is competitive here, then move to the research notebook
	"""
	        ),
	    ]
	    return cells


	def research_notebook():
	cells = [
	md(
	"""
	# Gamma S4 Practical Benchmark

	This notebook is the second benchmark track after the quick notebook.

	It is meant to be closer to practical sequence modeling while staying reasonable on Colab:
	- one harder long-context synthetic benchmark
	- enhanced-model ablations on that harder task
	- an optional lightweight token benchmark
	"""
	),
	setup_cell(),
	imports_cell(),
	code(
	r"""
	PRACTICAL_CONFIGS = {
	"current_reference": dict(seq_len=320, features=6, train_samples=320, val_samples=80, epochs=5, batch_size=24, d_model=64, hidden_dim=96, num_layers=2, complexity=2),
	"long_context": dict(seq_len=768, features=8, train_samples=256, val_samples=64, epochs=4, batch_size=12, d_model=80, hidden_dim=128, num_layers=3, complexity=3),
	}

	RUN_PRACTICAL_SWEEP = True
	RUN_ABLATIONS = False
	RUN_TOKEN_TASK = True

	PRACTICAL_MODELS = ["gamma_baseline", "gamma_s4_enhanced", "s4_ternary_dplr_ssm"]

	MODEL_OVERRIDES = {
	"gamma_baseline": {},
	"gamma_s4_enhanced": {
	"kernel_mode": "auto",
	"kernel_threshold": 384,
	"discretization": "bilinear",
	"gate": True,
	"input_gate": True,
	"activation": "gelu",
	"use_D": True,
	"layer_scale_init": 0.1,
	},
	"gamma_s4_minimal": {
	"kernel_mode": "auto",
	"kernel_threshold": 512,
	"discretization": "bilinear",
	"use_D": True,
	},
	"s4_ternary_dplr_ssm": {
	"kernel_mode": "auto",
	"kernel_threshold": 256,
	"rank": 1,
	"gate": True,
	"input_gate": True,
	"activation": "gelu",
	"use_D": True,
	"layer_scale_init": 0.1,
	},
	}

	ABLATIONS = [
	("default", {}),
	("no_input_gate", {"input_gate": False}),
	("no_gate", {"gate": False}),
	("no_skip_D", {"use_D": False}),
	("euler", {"discretization": "euler"}),
	]
	"""
	),
	shared_helpers_cell(),
	code(
	r"""
	def train_practical_model(task_name, kind, overrides=None):
	cfg = PRACTICAL_CONFIGS[task_name]
	train_ds = make_forecasting_split(cfg, "train", seed=SEED + 101)
	val_ds = make_forecasting_split(cfg, "val", seed=SEED + 151)
	train_loader = DataLoader(train_ds, batch_size=cfg["batch_size"], shuffle=True)
	val_loader = DataLoader(val_ds, batch_size=cfg["batch_size"], shuffle=False)
	model = build_forecasting_model(kind, cfg, overrides=overrides or MODEL_OVERRIDES.get(kind))
	optimizer = torch.optim.AdamW(model.parameters(), lr=3e-3, weight_decay=1e-4)
	history = {"train_loss": [], "val_loss": [], "epoch_time_s": []}

	for epoch in range(cfg["epochs"]):
	train_loss, epoch_time = run_epoch(model, train_loader, optimizer=optimizer)
	val_loss, _ = run_epoch(model, val_loader)
	history["train_loss"].append(train_loss)
	history["val_loss"].append(val_loss)
	history["epoch_time_s"].append(epoch_time)
	print(f"{task_name} \| {kind} \| epoch={epoch+1:02d} train={train_loss:.6f} val={val_loss:.6f}")

	sample_x, sample_y = next(iter(val_loader))
	sample_x = sample_x[:2]
	sample_y = sample_y[:2]
	inf = benchmark_inference(model, sample_x)
	result = {
	"task": task_name,
	"kind": kind,
	"params": sum(p.numel() for p in model.parameters()),
	"train_loss": history["train_loss"][-1],
	"val_loss": history["val_loss"][-1],
	"mean_epoch_time_s": float(np.mean(history["epoch_time_s"])),
	"expected_full_mode": "conv" if (kind != "gamma_baseline" and cfg["seq_len"] >= MODEL_OVERRIDES.get(kind, {}).get("kernel_threshold", 10**9)) else "recurrent_like",
	"sample_target": sample_y.cpu(),
	**inf,
	}
	return result, history, model
	"""
	),
	md(
	"""
	## Practical Sweep
	"""
	),
	code(
	r"""
	practical_rows = []
	practical_artifacts = {}
	practical_models = {}
	if RUN_PRACTICAL_SWEEP:
	for task_name in PRACTICAL_CONFIGS:
	for kind in PRACTICAL_MODELS:
	result, _, model = train_practical_model(task_name, kind)
	practical_artifacts[(task_name, kind)] = result
	practical_models[(task_name, kind)] = model
	practical_rows.append({k: v for k, v in result.items() if k not in {"sample_target", "prediction", "recurrent_prediction"}})

	practical_df = pd.DataFrame(practical_rows)
	show_benchmark_tables(practical_df, title="Practical sweep")
	"""
	),
	code(
	r"""
	if not practical_df.empty:
	fig, axes = plt.subplots(2, 2, figsize=(14, 8))
	for ax, metric in zip(axes.flatten(), ["val_loss", "mean_epoch_time_s", "full_tokens_per_s", "recurrent_tokens_per_s"]):
	pivot = practical_df.pivot(index="task", columns="kind", values=metric)
	pivot.plot(ax=ax, marker="o")
	ax.set_title(metric.replace("_", " ").title())
	ax.grid(alpha=0.2)
	plt.tight_layout()
	plt.show()
	"""
	),
	md(
	"""
	## Practical Inference Scaling

	This section benchmarks trained practical models across sequence lengths and batch sizes. It separates full-sequence prefill-like throughput from recurrent/deployment decode-like throughput.
	"""
	),
	code(
	r"""
	inference_scaling_rows = []
	if RUN_PRACTICAL_SWEEP and practical_models:
	INFERENCE_SEQ_LENS = [64, 192, 384, 768]
	INFERENCE_BATCH_SIZES = [1, 2]
	for (task_name, kind), model in practical_models.items():
	feature_dim = model.out_proj.out_features
	for seq_len in INFERENCE_SEQ_LENS:
	for batch_size in INFERENCE_BATCH_SIZES:
	sample_x = torch.randn(batch_size, seq_len, feature_dim)
	metrics = benchmark_inference(model, sample_x)
	inference_scaling_rows.append({
	"task": task_name,
	"kind": kind,
	"batch_size": batch_size,
	"seq_len": seq_len,
	"full_latency_ms": metrics["full_latency_ms"],
	"full_tokens_per_s": metrics["full_tokens_per_s"],
	"recurrent_latency_ms": metrics["recurrent_latency_ms"],
	"recurrent_tokens_per_s": metrics["recurrent_tokens_per_s"],
	"deploy_recurrent_latency_ms": metrics["deploy_recurrent_latency_ms"],
	"deploy_recurrent_tokens_per_s": metrics["deploy_recurrent_tokens_per_s"],
	"balanced_deploy_recurrent_latency_ms": metrics["balanced_deploy_recurrent_latency_ms"],
	"balanced_deploy_recurrent_tokens_per_s": metrics["balanced_deploy_recurrent_tokens_per_s"],
	"recurrent_match_mse": metrics["recurrent_match_mse"],
	"deploy_match_mse": metrics["deploy_match_mse"],
	"balanced_deploy_match_mse": metrics["balanced_deploy_match_mse"],
	})

	inference_scaling_df = pd.DataFrame(inference_scaling_rows)
	display(inference_scaling_df.sort_values(["task", "kind", "seq_len", "batch_size"]))
	"""
	),
	code(
	r"""
	if "inference_scaling_df" in globals() and not inference_scaling_df.empty:
	for task_name in inference_scaling_df["task"].unique():
	subset = inference_scaling_df[(inference_scaling_df["task"] == task_name) & (inference_scaling_df["batch_size"] == 1)]
	fig, axes = plt.subplots(1, 2, figsize=(14, 4))
	for kind in subset["kind"].unique():
	model_subset = subset[subset["kind"] == kind].sort_values("seq_len")
	axes[0].plot(model_subset["seq_len"], model_subset["full_tokens_per_s"], marker="o", label=kind)
	axes[1].plot(model_subset["seq_len"], model_subset["recurrent_tokens_per_s"], marker="o", label=kind)
	axes[0].set_title(f"{task_name}: full-sequence throughput, batch=1")
	axes[1].set_title(f"{task_name}: recurrent throughput, batch=1")
	for ax in axes:
	ax.set_xlabel("seq_len")
	ax.set_ylabel("tokens/s")
	ax.grid(alpha=0.25)
	ax.legend()
	plt.tight_layout()
	plt.show()
	"""
	),
	md(
	"""
	## Task Visual Preview

	These plots show what the synthetic tasks actually look like before we discuss scores.
	"""
	),
	code(
	r"""
	if RUN_PRACTICAL_SWEEP:
	fig, axes = plt.subplots(len(PRACTICAL_CONFIGS), 1, figsize=(14, 3.5 * len(PRACTICAL_CONFIGS)), sharex=False)
	if len(PRACTICAL_CONFIGS) == 1:
	axes = [axes]
	for ax, task_name in zip(axes, PRACTICAL_CONFIGS):
	cfg = PRACTICAL_CONFIGS[task_name]
	preview_ds = make_forecasting_split(cfg, "val", seed=SEED + 151)
	preview_x, preview_y = preview_ds[0]
	channels = range(min(3, preview_y.shape[-1]))
	time_axis = np.arange(preview_y.shape[0])
	for channel in channels:
	ax.plot(time_axis, preview_y[:, channel].numpy(), label=f"channel {channel}", linewidth=1.5)
	ax.set_title(f"{task_name} target preview")
	ax.grid(alpha=0.2)
	if task_name == list(PRACTICAL_CONFIGS.keys())[0]:
	ax.legend(ncol=min(3, preview_y.shape[-1]))
	plt.tight_layout()
	plt.show()
	"""
	),
	md(
	"""
	## Prediction Comparison Plots

	These are the most presentation-friendly plots in the notebook:
	ground truth vs baseline vs each S4-style model on the same held-out sample.
	"""
	),
	code(
	r"""
	if practical_artifacts:
	for task_name in PRACTICAL_CONFIGS:
	baseline = practical_artifacts[(task_name, "gamma_baseline")]
	target = baseline["sample_target"][0].numpy()
	baseline_pred = baseline["prediction"][0].numpy()
	for compare_kind in [kind for kind in PRACTICAL_MODELS if kind != "gamma_baseline"]:
	candidate = practical_artifacts[(task_name, compare_kind)]
	candidate_pred = candidate["prediction"][0].numpy()

	channels = range(min(3, target.shape[-1]))
	time_axis = np.arange(target.shape[0])
	fig, axes = plt.subplots(len(list(channels)), 1, figsize=(14, 3.5 * len(list(channels))), sharex=True)
	if target.shape[-1] == 1:
	axes = [axes]
	for row, channel in enumerate(channels):
	ax = axes[row]
	ax.plot(time_axis, target[:, channel], label="ground truth", linewidth=2)
	ax.plot(time_axis, baseline_pred[:, channel], label="baseline", alpha=0.9)
	ax.plot(time_axis, candidate_pred[:, channel], label=compare_kind, alpha=0.9)
	ax.set_title(f"{task_name} prediction comparison, channel {channel}")
	ax.grid(alpha=0.2)
	if row == 0:
	ax.legend()
	plt.tight_layout()
	plt.show()
	"""
	),
	md(
	"""
	## Error Comparison Plots

	These show where each model is missing the target signal. Lower absolute error should visually hug zero.
	"""
	),
	code(
	r"""
	if practical_artifacts:
	for task_name in PRACTICAL_CONFIGS:
	baseline = practical_artifacts[(task_name, "gamma_baseline")]
	target = baseline["sample_target"][0].numpy()
	baseline_err = baseline["prediction"][0].numpy() - target
	for compare_kind in [kind for kind in PRACTICAL_MODELS if kind != "gamma_baseline"]:
	candidate = practical_artifacts[(task_name, compare_kind)]
	candidate_err = candidate["prediction"][0].numpy() - target

	channels = range(min(2, target.shape[-1]))
	time_axis = np.arange(target.shape[0])
	fig, axes = plt.subplots(len(list(channels)), 1, figsize=(14, 3.0 * len(list(channels))), sharex=True)
	if len(list(channels)) == 1:
	axes = [axes]
	for row, channel in enumerate(channels):
	ax = axes[row]
	ax.plot(time_axis, baseline_err[:, channel], label="baseline error", alpha=0.9)
	ax.plot(time_axis, candidate_err[:, channel], label=f"{compare_kind} error", alpha=0.9)
	ax.axhline(0.0, color="black", linewidth=1, alpha=0.5)
	ax.set_title(f"{task_name} error comparison, channel {channel}")
	ax.grid(alpha=0.2)
	if row == 0:
	ax.legend()
	plt.tight_layout()
	plt.show()
	"""
	),
	md(
	"""
	## Enhanced Ablations On Long Context
	"""
	),
	code(
	r"""
	ablation_rows = []
	if RUN_ABLATIONS:
	for name, override in ABLATIONS:
	merged = {MODEL_OVERRIDES["gamma_s4_enhanced"], override}
	result, _, _ = train_practical_model("long_context", "gamma_s4_enhanced", overrides=merged)
	row = {k: v for k, v in result.items() if k not in {"sample_target", "prediction", "recurrent_prediction"}}
	row["ablation"] = name
	ablation_rows.append(row)

	ablation_df = pd.DataFrame(ablation_rows)
	ablation_df
	"""
	),
	code(
	r"""
	if not ablation_df.empty:
	fig, axes = plt.subplots(1, 3, figsize=(16, 4))
	ablation_df.sort_values("val_loss").plot.bar(x="ablation", y="val_loss", ax=axes[0], legend=False)
	ablation_df.sort_values("mean_epoch_time_s").plot.bar(x="ablation", y="mean_epoch_time_s", ax=axes[1], legend=False)
	ablation_df.sort_values("recurrent_tokens_per_s").plot.bar(x="ablation", y="recurrent_tokens_per_s", ax=axes[2], legend=False)
	axes[0].set_title("Validation Loss")
	axes[1].set_title("Mean Epoch Time")
	axes[2].set_title("Recurrent Tokens / s")
	for ax in axes:
	ax.tick_params(axis="x", rotation=25)
	ax.grid(alpha=0.2)
	plt.tight_layout()
	plt.show()
	"""
	),
	md(
	"""
	## Optional Token-Lite Task
	"""
	),
	code(
	r"""
	@torch.no_grad()
	def benchmark_token_inference(model, token_batch, target_batch, vocab_size, repeats=6):
	model.eval()
	x = token_batch.to(DEVICE)
	y = target_batch.to(DEVICE)
	batch, seq_len = x.shape
	token_count = batch * seq_len
	rows = []

	def reset_memory():
	if DEVICE.type == "cuda":
	torch.cuda.reset_peak_memory_stats()

	def max_memory_mb():
	if DEVICE.type != "cuda":
	return float("nan")
	return torch.cuda.max_memory_allocated() / (1024 ** 2)

	reset_memory()
	with torch.autocast(device_type=DEVICE.type, enabled=USE_AMP):
	logits = model(x)
	loss = F.cross_entropy(logits.reshape(-1, vocab_size), y.reshape(-1))
	synchronize()
	start = time.perf_counter()
	for _ in range(repeats):
	with torch.autocast(device_type=DEVICE.type, enabled=USE_AMP):
	logits = model(x)
	synchronize()
	elapsed = time.perf_counter() - start
	full_logits = logits.detach()
	rows.append({
	"mode": "prefill_full_sequence",
	"latency_ms": 1000.0 * elapsed / repeats,
	"tokens_per_s": token_count * repeats / max(elapsed, 1e-9),
	"ce": float(loss.detach().cpu()),
	"match_mse": 0.0,
	"max_memory_mb": max_memory_mb(),
	})

	hidden = model.embed(x)
	states = []
	caches = []
	cache_start = time.perf_counter()
	for layer in model.layers:
	state = layer.ssm.init_state(hidden.size(0), DEVICE, hidden.dtype)
	states.append(state)
	if hasattr(layer, "allocate_inference_cache"):
	caches.append(layer.allocate_inference_cache(hidden.size(0), seq_len, DEVICE, hidden.dtype))
	else:
	caches.append(None)
	synchronize()
	cache_setup_ms = 1000.0 * (time.perf_counter() - cache_start)

	def recurrent_pass(cache_list):
	local_states = [state.clone() for state in states]
	outputs = []
	for t in range(seq_len):
	step_x = hidden[:, t, :]
	for layer_idx, layer in enumerate(model.layers):
	cache = None if cache_list is None else cache_list[layer_idx]
	try:
	step_x, local_states[layer_idx] = layer.step(step_x, local_states[layer_idx], cache=cache)
	except TypeError:
	step_x, local_states[layer_idx] = layer.step(step_x, local_states[layer_idx])
	outputs.append(model.head(step_x))
	return torch.stack(outputs, dim=1)

	reset_memory()
	recurrent_logits = recurrent_pass(caches)
	synchronize()
	start = time.perf_counter()
	for _ in range(repeats):
	recurrent_logits = recurrent_pass(caches)
	synchronize()
	elapsed = time.perf_counter() - start
	recurrent_loss = F.cross_entropy(recurrent_logits.reshape(-1, vocab_size), y.reshape(-1))
	rows.append({
	"mode": "decode_recurrent_exact",
	"cache_setup_ms": cache_setup_ms,
	"latency_ms": 1000.0 * elapsed / repeats,
	"tokens_per_s": token_count * repeats / max(elapsed, 1e-9),
	"ce": float(recurrent_loss.detach().cpu()),
	"match_mse": float(F.mse_loss(recurrent_logits, full_logits).detach().cpu()),
	"max_memory_mb": max_memory_mb(),
	})

	deploy_supported = all(hasattr(layer, "allocate_deployment_cache") for layer in model.layers)
	if deploy_supported:
	cache_start = time.perf_counter()
	deploy_caches = [
	layer.allocate_deployment_cache(hidden.size(0), seq_len, DEVICE, hidden.dtype)
	for layer in model.layers
	]
	synchronize()
	deploy_cache_setup_ms = 1000.0 * (time.perf_counter() - cache_start)
	reset_memory()
	deploy_logits = recurrent_pass(deploy_caches)
	synchronize()
	start = time.perf_counter()
	for _ in range(repeats):
	deploy_logits = recurrent_pass(deploy_caches)
	synchronize()
	elapsed = time.perf_counter() - start
	deploy_loss = F.cross_entropy(deploy_logits.reshape(-1, vocab_size), y.reshape(-1))
	rows.append({
	"mode": "decode_deploy_lite",
	"cache_setup_ms": deploy_cache_setup_ms,
	"latency_ms": 1000.0 * elapsed / repeats,
	"tokens_per_s": token_count * repeats / max(elapsed, 1e-9),
	"ce": float(deploy_loss.detach().cpu()),
	"match_mse": float(F.mse_loss(deploy_logits, full_logits).detach().cpu()),
	"max_memory_mb": max_memory_mb(),
	})

	balanced_supported = all(hasattr(layer, "allocate_balanced_deployment_cache") for layer in model.layers)
	if balanced_supported:
	cache_start = time.perf_counter()
	balanced_caches = [
	layer.allocate_balanced_deployment_cache(hidden.size(0), seq_len, DEVICE, hidden.dtype)
	for layer in model.layers
	]
	synchronize()
	balanced_cache_setup_ms = 1000.0 * (time.perf_counter() - cache_start)
	reset_memory()
	balanced_logits = recurrent_pass(balanced_caches)
	synchronize()
	start = time.perf_counter()
	for _ in range(repeats):
	balanced_logits = recurrent_pass(balanced_caches)
	synchronize()
	elapsed = time.perf_counter() - start
	balanced_loss = F.cross_entropy(balanced_logits.reshape(-1, vocab_size), y.reshape(-1))
	rows.append({
	"mode": "decode_balanced",
	"cache_setup_ms": balanced_cache_setup_ms,
	"latency_ms": 1000.0 * elapsed / repeats,
	"tokens_per_s": token_count * repeats / max(elapsed, 1e-9),
	"ce": float(balanced_loss.detach().cpu()),
	"match_mse": float(F.mse_loss(balanced_logits, full_logits).detach().cpu()),
	"max_memory_mb": max_memory_mb(),
	})

	return rows
	"""
	),
	code(
	r"""
	if RUN_TOKEN_TASK:
	TOKEN_DATA_URL = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
	TOKEN_DATA_PATH = Path("tmp/jupyter-notebook/tinyshakespeare.txt")
	TOKEN_DATA_PATH.parent.mkdir(parents=True, exist_ok=True)
	if not TOKEN_DATA_PATH.exists():
	urllib.request.urlretrieve(TOKEN_DATA_URL, TOKEN_DATA_PATH)

	text = TOKEN_DATA_PATH.read_text(encoding="utf-8")
	vocab = sorted(set(text))
	stoi = {ch: i for i, ch in enumerate(vocab)}
	tokens = torch.tensor([stoi[ch] for ch in text], dtype=torch.long)

	TOKEN_CFG = {
	"seq_len": 192,
	"train_samples": 1200,
	"val_samples": 240,
	"epochs": 2,
	"batch_size": 12 if DEVICE.type == "cuda" else 6,
	}

	def make_token_split(seq_len, train_samples, val_samples):
	max_start = len(tokens) - seq_len - 1
	starts = torch.linspace(0, max_start - 1, steps=train_samples + val_samples).long()
	x = torch.stack([tokens[s : s + seq_len] for s in starts])
	y = torch.stack([tokens[s + 1 : s + seq_len + 1] for s in starts])
	return TensorDataset(x[:train_samples], y[:train_samples]), TensorDataset(x[train_samples:], y[train_samples:])

	class TokenForecaster(nn.Module):
	def __init__(self, vocab_size, kind):
	super().__init__()
	self.embed = nn.Embedding(vocab_size, 64)
	if kind == "gamma_baseline":
	factory = lambda: GammaSingleBlock(d_model=64, hidden_dim=96, dropout=0.0)
	elif kind == "gamma_s4_enhanced":
	factory = lambda: GammaS4Block(d_model=64, hidden_dim=96, kernel_mode="auto", kernel_threshold=160)
	elif kind == "s4_ternary_dplr_ssm":
	factory = lambda: S4TernaryDPLRBlock(d_model=64, hidden_dim=96, kernel_mode="auto", kernel_threshold=160)
	else:
	factory = lambda: GammaS4MinimalBlock(d_model=64, hidden_dim=96, kernel_mode="auto", kernel_threshold=160)
	self.layers = nn.ModuleList([factory(), factory()])
	self.head = nn.Linear(64, vocab_size)

	def forward(self, x):
	x = self.embed(x)
	for layer in self.layers:
	x, _ = layer(x, state=None, return_state=False)
	return self.head(x)

	token_train, token_val = make_token_split(
	seq_len=TOKEN_CFG["seq_len"],
	train_samples=TOKEN_CFG["train_samples"],
	val_samples=TOKEN_CFG["val_samples"],
	)
	token_rows = []
	token_inference_rows = []
	for kind in PRACTICAL_MODELS:
	model = TokenForecaster(len(vocab), kind).to(DEVICE)
	optimizer = torch.optim.AdamW(model.parameters(), lr=3e-3, weight_decay=1e-4)
	train_loader = DataLoader(token_train, batch_size=TOKEN_CFG["batch_size"], shuffle=True)
	val_loader = DataLoader(token_val, batch_size=TOKEN_CFG["batch_size"], shuffle=False)
	history = []
	for epoch in range(TOKEN_CFG["epochs"]):
	model.train()
	train_losses = []
	for batch_x, batch_y in train_loader:
	batch_x = batch_x.to(DEVICE)
	batch_y = batch_y.to(DEVICE)
	optimizer.zero_grad(set_to_none=True)
	with torch.autocast(device_type=DEVICE.type, enabled=USE_AMP):
	logits = model(batch_x)
	loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), batch_y.reshape(-1))
	scaler.scale(loss).backward()
	scaler.step(optimizer)
	scaler.update()
	train_losses.append(loss.detach().item())
	model.eval()
	val_losses = []
	with torch.no_grad():
	for batch_x, batch_y in val_loader:
	batch_x = batch_x.to(DEVICE)
	batch_y = batch_y.to(DEVICE)
	logits = model(batch_x)
	loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), batch_y.reshape(-1))
	val_losses.append(loss.detach().item())
	history.append((float(np.mean(train_losses)), float(np.mean(val_losses))))
	print(kind, epoch + 1, history[-1])
	token_rows.append({
	"kind": kind,
	"train_ce": history[-1][0],
	"val_ce": history[-1][1],
	"val_ppl": math.exp(history[-1][1]),
	"seq_len": TOKEN_CFG["seq_len"],
	"train_samples": TOKEN_CFG["train_samples"],
	})
	sample_x, sample_y = next(iter(val_loader))
	for row in benchmark_token_inference(model, sample_x[:2], sample_y[:2], len(vocab)):
	row["kind"] = kind
	row["seq_len"] = TOKEN_CFG["seq_len"]
	row["batch_size"] = min(2, sample_x.size(0))
	token_inference_rows.append(row)
	token_df = pd.DataFrame(token_rows).sort_values("val_ce")
	display(token_df)
	token_inference_df = pd.DataFrame(token_inference_rows)
	display(token_inference_df.sort_values(["kind", "mode"]))
	"""
	),
	md(
	"""
	Use this notebook after the quick benchmark. The `long_context` task is the more practical synthetic benchmark, and the optional token-lite section gives a small language-like check without making Colab costs too high.

	`RUN_ABLATIONS` is off by default because the long-context ablation sweep is still materially more expensive than the main comparison.
	"""
	),
	]
	return cells


	def write_notebook(path, cells):
	    """Serialize ``cells`` as an nbformat 4.5 notebook and write it to ``path``.

	    The emitted document declares ``nbformat_minor`` 5. That schema revision
	    requires every cell to carry a unique ``id``, so any cell that does not
	    already have one is given a deterministic id derived from its position
	    (``cell-0000``, ``cell-0001``, ...). Deterministic ids keep regenerated
	    notebooks diff-friendly. Ids that are already present are preserved, and
	    the caller's cell dicts are never mutated.

	    Args:
	        path: Destination file (``str`` or ``pathlib.Path``). Missing parent
	            directories are created.
	        cells: Iterable of notebook cell dicts.
	    """
	    path = Path(path)
	    cells = list(cells)

	    # Ids supplied by the caller win; generated ids must not collide with them.
	    taken_ids = {cell["id"] for cell in cells if "id" in cell}
	    identified_cells = []
	    for index, cell in enumerate(cells):
	        if "id" not in cell:
	            base_id = f"cell-{index:04d}"
	            cell_id = base_id
	            bump = 0
	            while cell_id in taken_ids:
	                bump += 1
	                cell_id = f"{base_id}-{bump}"
	            taken_ids.add(cell_id)
	            # Shallow copy so the input dict stays untouched.
	            cell = {"id": cell_id, **cell}
	        identified_cells.append(cell)

	    notebook = {
	        "cells": identified_cells,
	        "metadata": {
	            "kernelspec": {"display_name": "Python 3", "language": "python", "name": "python3"},
	            "language_info": {"name": "python", "version": "3.11"},
	        },
	        "nbformat": 4,
	        "nbformat_minor": 5,
	    }
	    path.parent.mkdir(parents=True, exist_ok=True)
	    path.write_text(json.dumps(notebook, indent=2), encoding="utf-8")
	    print(f"Wrote {path}")


	# Script entry: build each notebook's cells, then write it to its target path.
	# Order matters only for the printed log: quick benchmark first, research second.
	for notebook_path, build_cells in (
	    (QUICK_NOTEBOOK_PATH, quick_notebook),
	    (RESEARCH_NOTEBOOK_PATH, research_notebook),
	):
	    write_notebook(notebook_path, build_cells())