TaoNet-mini-T2 / code /Taotern_SSM /scripts /generate_gamma_challenge_benchmark_notebook.py

Add files using upload-large-folder tool

388fd6e verified 16 days ago

39.1 kB

	import json
	from pathlib import Path


	def md(text):
	return {
	"cell_type": "markdown",
	"metadata": {},
	"source": text.strip("\n").splitlines(keepends=True),
	}


	def code(text):
	return {
	"cell_type": "code",
	"execution_count": None,
	"metadata": {},
	"outputs": [],
	"source": text.strip("\n").splitlines(keepends=True),
	}


	NOTEBOOK_PATH = Path("output/jupyter-notebook/gamma-s4-challenge-benchmark.ipynb")


	def setup_cell():
	return code(
	r"""
	import os
	import sys
	import subprocess
	import importlib
	from pathlib import Path

	IN_COLAB = "google.colab" in sys.modules
	REPO_DIR = Path.cwd()

	if IN_COLAB:
	print("Running in Google Colab")
	from google.colab import userdata
	from getpass import getpass

	REPO_NAME = "gamma_ssm_s4_v2"
	REPO_DIR = Path("/content") / REPO_NAME
	GITHUB_REPO = "StarMists/gamma_SSM_S4_enhanced"

	token = os.environ.get("GITHUB_TOKEN") or os.environ.get("GH_TOKEN")
	if not token:
	try:
	token = userdata.get("GITHUB_TOKEN")
	except Exception:
	token = None
	if not token:
	token = getpass("GitHub personal access token for private repo access: ").strip()

	clone_url = f"https://{token}@github.com/{GITHUB_REPO}.git"
	if REPO_DIR.exists():
	subprocess.run(["git", "-C", str(REPO_DIR), "fetch", "origin"], check=True)
	subprocess.run(["git", "-C", str(REPO_DIR), "checkout", "main"], check=True)
	subprocess.run(["git", "-C", str(REPO_DIR), "reset", "--hard", "origin/main"], check=True)
	else:
	subprocess.run(["git", "clone", clone_url, str(REPO_DIR)], check=True)

	os.chdir(REPO_DIR)
	sys.path.insert(0, str(REPO_DIR))
	else:
	print("Running locally from", REPO_DIR)
	sys.path.insert(0, str(REPO_DIR))

	importlib.invalidate_caches()
	for name in list(sys.modules):
	if (
	name == "gamma_space_model"
	or name.startswith("gamma_space_model.")
	or name == "csrc"
	or name.startswith("csrc.")
	or name == "tilelang"
	or name.startswith("tilelang.")
	):
	del sys.modules[name]

	try:
	commit = subprocess.check_output(["git", "-C", str(REPO_DIR), "rev-parse", "--short", "HEAD"], text=True).strip()
	print("Repo commit:", commit)
	except Exception:
	pass
	"""
	)


	def imports_cell():
	return code(
	r"""
	import math
	import random
	import time
	from dataclasses import dataclass

	import matplotlib.pyplot as plt
	import numpy as np
	import pandas as pd
	import torch
	import torch.nn as nn
	import torch.nn.functional as F
	from torch.utils.data import DataLoader, Dataset, Subset, TensorDataset

	try:
	from torchvision import datasets, transforms
	except Exception:
	if IN_COLAB:
	subprocess.run([sys.executable, "-m", "pip", "install", "-q", "torchvision"], check=True)
	from torchvision import datasets, transforms
	else:
	raise

	from gamma_space_model import GammaSingleBlock, GammaS4Block, S4TernaryDPLRBlock

	SEED = 7
	random.seed(SEED)
	np.random.seed(SEED)
	torch.manual_seed(SEED)
	if torch.cuda.is_available():
	torch.cuda.manual_seed_all(SEED)

	DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	USE_AMP = DEVICE.type == "cuda"

	if hasattr(torch, "amp") and hasattr(torch.amp, "GradScaler"):
	scaler = torch.amp.GradScaler("cuda", enabled=USE_AMP)
	else:
	scaler = torch.cuda.amp.GradScaler(enabled=USE_AMP)


	def synchronize():
	if DEVICE.type == "cuda":
	torch.cuda.synchronize()


	print("Device:", DEVICE)
	if DEVICE.type != "cuda":
	print("WARNING: CPU run detected. Treat speed numbers as smoke-test only.")
	"""
	)


	def config_cell():
	return code(
	r"""
	# Set any RUN_* flag to False if you want to skip a benchmark during a quick smoke run.
	RUN_PERMUTED_MNIST = True
	RUN_SELECTIVE_COPYING = True
	RUN_INDUCTION_RECALL = True
	RUN_TOKEN_CURRICULUM = True

	CHALLENGE_CONFIG = {
	"mnist": {
	"epochs": 4,
	"batch_size": 64,
	"train_samples": 6000,
	"val_samples": 1000,
	"d_model": 48,
	"hidden_dim": 96,
	"num_layers": 2,
	"lr": 2e-3,
	"weight_decay": 1e-4,
	},
	"selective_copying": {
	"epochs": 12,
	"batch_size": 128,
	"train_samples": 4096,
	"val_samples": 1024,
	"seq_len": 128,
	"vocab_size": 32,
	"num_mem_tokens": 6,
	"d_model": 64,
	"hidden_dim": 128,
	"num_layers": 2,
	"lr": 2e-3,
	"weight_decay": 1e-4,
	},
	"induction_recall": {
	"epochs": 12,
	"batch_size": 128,
	"train_samples": 4096,
	"val_samples": 1024,
	"seq_len": 128,
	"num_keys": 32,
	"num_values": 32,
	"d_model": 64,
	"hidden_dim": 128,
	"num_layers": 2,
	"lr": 2e-3,
	"weight_decay": 1e-4,
	},
	}

	MODEL_KINDS = ["gamma_baseline", "gamma_s4_enhanced", "s4_ternary_dplr_ssm"]

	TOKEN_CURRICULUM = [
	{
	"difficulty": "easy",
	"seq_len": 64,
	"classes": 8,
	"num_mem_tokens": 3,
	"train_samples": 2048,
	"val_samples": 512,
	"epochs": 10,
	},
	{
	"difficulty": "moderate",
	"seq_len": 96,
	"classes": 16,
	"num_mem_tokens": 4,
	"train_samples": 4096,
	"val_samples": 1024,
	"epochs": 10,
	},
	{
	"difficulty": "hard",
	"seq_len": 128,
	"classes": 32,
	"num_mem_tokens": 6,
	"train_samples": 4096,
	"val_samples": 1024,
	"epochs": 12,
	},
	]
	"""
	)


	def model_cell():
	return code(
	r"""
	def make_block(kind, width, hidden_dim):
	if kind == "gamma_baseline":
	return GammaSingleBlock(
	d_model=width,
	hidden_dim=hidden_dim,
	dropout=0.0,
	)
	if kind == "gamma_s4_enhanced":
	return GammaS4Block(
	d_model=width,
	hidden_dim=hidden_dim,
	kernel_mode="auto",
	kernel_threshold=96,
	discretization="bilinear",
	gate=True,
	input_gate=True,
	activation="gelu",
	use_D=True,
	layer_scale_init=0.1,
	)
	if kind == "s4_ternary_dplr_ssm":
	return S4TernaryDPLRBlock(
	d_model=width,
	hidden_dim=hidden_dim,
	rank=1,
	kernel_mode="auto",
	kernel_threshold=96,
	gate=True,
	input_gate=True,
	activation="gelu",
	use_D=True,
	layer_scale_init=0.1,
	)
	raise ValueError(f"Unknown model kind: {kind}")


	class BlockStack(nn.Module):
	def __init__(self, kind, d_model, hidden_dim, num_layers):
	super().__init__()
	self.layers = nn.ModuleList(
	[make_block(kind, d_model, hidden_dim) for _ in range(num_layers)]
	)

	def forward(self, x):
	for layer in self.layers:
	x, _ = layer(x, state=None, return_state=False)
	return x


	class PermutedMNISTModel(nn.Module):
	def __init__(self, kind, d_model, hidden_dim, num_layers):
	super().__init__()
	self.input_proj = nn.Linear(1, d_model)
	self.stack = BlockStack(kind, d_model, hidden_dim, num_layers)
	self.classifier = nn.Linear(d_model, 10)

	def forward(self, x):
	x = self.input_proj(x)
	x = self.stack(x)
	return self.classifier(x[:, -1])


	class TokenSequenceModel(nn.Module):
	def __init__(self, kind, vocab_size, d_model, hidden_dim, num_layers):
	super().__init__()
	self.embedding = nn.Embedding(vocab_size, d_model)
	self.stack = BlockStack(kind, d_model, hidden_dim, num_layers)
	self.head = nn.Linear(d_model, vocab_size)

	def forward(self, tokens):
	x = self.embedding(tokens)
	x = self.stack(x)
	return self.head(x)


	class TokenClassifierModel(nn.Module):
	def __init__(self, kind, vocab_size, num_classes, d_model, hidden_dim, num_layers):
	super().__init__()
	self.embedding = nn.Embedding(vocab_size, d_model)
	self.stack = BlockStack(kind, d_model, hidden_dim, num_layers)
	self.head = nn.Linear(d_model, num_classes)

	def forward(self, tokens):
	x = self.embedding(tokens)
	x = self.stack(x)
	return self.head(x[:, -1])
	"""
	)


	def data_cell():
	return code(
	r"""
	class PermutedMNISTDataset(Dataset):
	def __init__(self, root, train, permutation, limit):
	base = datasets.MNIST(
	root=root,
	train=train,
	download=True,
	transform=transforms.ToTensor(),
	)
	indices = list(range(min(limit, len(base)))) if limit is not None else list(range(len(base)))
	self.base = Subset(base, indices)
	self.permutation = permutation

	def __len__(self):
	return len(self.base)

	def __getitem__(self, idx):
	image, label = self.base[idx]
	seq = image.view(-1)[self.permutation].unsqueeze(-1)
	return seq, torch.tensor(label, dtype=torch.long)


	def make_permuted_mnist_loaders(config):
	generator = torch.Generator().manual_seed(SEED)
	permutation = torch.randperm(28 * 28, generator=generator)
	train_ds = PermutedMNISTDataset(
	root=str(REPO_DIR / "data"),
	train=True,
	permutation=permutation,
	limit=config["train_samples"],
	)
	val_ds = PermutedMNISTDataset(
	root=str(REPO_DIR / "data"),
	train=False,
	permutation=permutation,
	limit=config["val_samples"],
	)
	train_loader = DataLoader(
	train_ds,
	batch_size=config["batch_size"],
	shuffle=True,
	num_workers=2,
	pin_memory=DEVICE.type == "cuda",
	)
	val_loader = DataLoader(
	val_ds,
	batch_size=config["batch_size"],
	shuffle=False,
	num_workers=2,
	pin_memory=DEVICE.type == "cuda",
	)
	return train_loader, val_loader


	def make_selective_copying_dataset(samples, config, seed):
	rng = np.random.default_rng(seed)
	seq_len = config["seq_len"]
	vocab_size = config["vocab_size"]
	num_mem = config["num_mem_tokens"]
	blank = vocab_size
	marker = vocab_size + 1
	query = vocab_size + 2
	model_vocab = vocab_size + 3

	x = np.full((samples, seq_len), blank, dtype=np.int64)
	y = np.full((samples, seq_len), -100, dtype=np.int64)
	memory_window = max(8, seq_len // 2)
	query_start = seq_len - num_mem

	for i in range(samples):
	positions = np.sort(rng.choice(np.arange(1, memory_window - 1), size=num_mem, replace=False))
	values = rng.integers(0, vocab_size, size=num_mem, dtype=np.int64)
	x[i, positions - 1] = marker
	x[i, positions] = values
	x[i, query_start:] = query
	y[i, query_start:] = values

	return TensorDataset(torch.from_numpy(x), torch.from_numpy(y)), model_vocab


	def make_selective_copying_loaders(config):
	train_ds, model_vocab = make_selective_copying_dataset(config["train_samples"], config, SEED + 11)
	val_ds, _ = make_selective_copying_dataset(config["val_samples"], config, SEED + 17)
	train_loader = DataLoader(train_ds, batch_size=config["batch_size"], shuffle=True)
	val_loader = DataLoader(val_ds, batch_size=config["batch_size"], shuffle=False)
	return train_loader, val_loader, model_vocab


	def make_induction_recall_dataset(samples, config, seed):
	rng = np.random.default_rng(seed)
	seq_len = config["seq_len"]
	num_keys = config["num_keys"]
	num_values = config["num_values"]
	key_offset = 0
	value_offset = num_keys
	filler_offset = num_keys + num_values
	query_marker = filler_offset + num_keys
	model_vocab = query_marker + 1

	x = rng.integers(filler_offset, filler_offset + num_keys, size=(samples, seq_len), dtype=np.int64)
	y = np.zeros(samples, dtype=np.int64)

	for i in range(samples):
	key = rng.integers(0, num_keys)
	value = rng.integers(0, num_values)
	early = rng.integers(1, seq_len // 3)
	distractor_count = max(3, seq_len // 24)
	x[i, early] = key_offset + key
	x[i, early + 1] = value_offset + value
	for _ in range(distractor_count):
	pos = rng.integers(seq_len // 3, seq_len - 4)
	d_key = rng.integers(0, num_keys)
	d_value = rng.integers(0, num_values)
	if d_key == key:
	d_key = (d_key + 1) % num_keys
	x[i, pos] = key_offset + d_key
	x[i, pos + 1] = value_offset + d_value
	x[i, -2] = query_marker
	x[i, -1] = key_offset + key
	y[i] = value

	return TensorDataset(torch.from_numpy(x), torch.from_numpy(y)), model_vocab, num_values


	def make_induction_recall_loaders(config):
	train_ds, model_vocab, num_classes = make_induction_recall_dataset(config["train_samples"], config, SEED + 23)
	val_ds, _, _ = make_induction_recall_dataset(config["val_samples"], config, SEED + 29)
	train_loader = DataLoader(train_ds, batch_size=config["batch_size"], shuffle=True)
	val_loader = DataLoader(val_ds, batch_size=config["batch_size"], shuffle=False)
	return train_loader, val_loader, model_vocab, num_classes
	"""
	)


	def training_cell():
	return code(
	r"""
	def count_parameters(model):
	return sum(p.numel() for p in model.parameters() if p.requires_grad)


	def run_classifier_epoch(model, loader, optimizer=None):
	training = optimizer is not None
	model.train(training)
	total_loss = 0.0
	total_correct = 0
	total_count = 0
	total_tokens = 0
	start = time.perf_counter()

	for x, y in loader:
	x = x.to(DEVICE, non_blocking=True)
	y = y.to(DEVICE, non_blocking=True)
	if training:
	optimizer.zero_grad(set_to_none=True)
	with torch.autocast(device_type=DEVICE.type, dtype=torch.float16, enabled=USE_AMP):
	logits = model(x)
	loss = F.cross_entropy(logits, y)
	if training:
	scaler.scale(loss).backward()
	scaler.step(optimizer)
	scaler.update()
	total_loss += loss.detach().item() * y.numel()
	total_correct += (logits.argmax(dim=-1) == y).sum().item()
	total_count += y.numel()
	total_tokens += x.shape[0] * x.shape[1]

	synchronize()
	elapsed = time.perf_counter() - start
	return {
	"loss": total_loss / max(total_count, 1),
	"accuracy": total_correct / max(total_count, 1),
	"tokens_per_s": total_tokens / max(elapsed, 1e-9),
	"elapsed_s": elapsed,
	}


	def run_sequence_epoch(model, loader, optimizer=None):
	training = optimizer is not None
	model.train(training)
	total_loss = 0.0
	total_correct = 0
	total_count = 0
	total_tokens = 0
	start = time.perf_counter()

	for x, y in loader:
	x = x.to(DEVICE, non_blocking=True)
	y = y.to(DEVICE, non_blocking=True)
	if training:
	optimizer.zero_grad(set_to_none=True)
	with torch.autocast(device_type=DEVICE.type, dtype=torch.float16, enabled=USE_AMP):
	logits = model(x)
	loss = F.cross_entropy(
	logits.reshape(-1, logits.size(-1)),
	y.reshape(-1),
	ignore_index=-100,
	)
	if training:
	scaler.scale(loss).backward()
	scaler.step(optimizer)
	scaler.update()
	valid = y != -100
	pred = logits.argmax(dim=-1)
	total_loss += loss.detach().item() * valid.sum().item()
	total_correct += ((pred == y) & valid).sum().item()
	total_count += valid.sum().item()
	total_tokens += x.numel()

	synchronize()
	elapsed = time.perf_counter() - start
	return {
	"loss": total_loss / max(total_count, 1),
	"accuracy": total_correct / max(total_count, 1),
	"tokens_per_s": total_tokens / max(elapsed, 1e-9),
	"elapsed_s": elapsed,
	}


	@torch.no_grad()
	def benchmark_forward(model, loader, repeats=8):
	model.eval()
	x, _ = next(iter(loader))
	x = x.to(DEVICE)
	for module in model.modules():
	if hasattr(module, "clear_kernel_cache"):
	module.clear_kernel_cache()
	with torch.autocast(device_type=DEVICE.type, dtype=torch.float16, enabled=USE_AMP):
	_ = model(x)
	synchronize()
	start = time.perf_counter()
	for _ in range(repeats):
	with torch.autocast(device_type=DEVICE.type, dtype=torch.float16, enabled=USE_AMP):
	_ = model(x)
	synchronize()
	elapsed = time.perf_counter() - start
	return {
	"forward_ms": elapsed * 1000.0 / repeats,
	"forward_tokens_per_s": (x.shape[0] * x.shape[1] * repeats) / max(elapsed, 1e-9),
	}


	def train_challenge_model(name, model, train_loader, val_loader, config, epoch_fn):
	model = model.to(DEVICE)
	optimizer = torch.optim.AdamW(
	model.parameters(),
	lr=config["lr"],
	weight_decay=config["weight_decay"],
	)
	rows = []
	for epoch in range(config["epochs"]):
	train_metrics = epoch_fn(model, train_loader, optimizer)
	val_metrics = epoch_fn(model, val_loader)
	row = {
	"epoch": epoch + 1,
	"train_loss": train_metrics["loss"],
	"train_accuracy": train_metrics["accuracy"],
	"val_loss": val_metrics["loss"],
	"val_accuracy": val_metrics["accuracy"],
	"train_tokens_per_s": train_metrics["tokens_per_s"],
	"val_tokens_per_s": val_metrics["tokens_per_s"],
	"epoch_s": train_metrics["elapsed_s"] + val_metrics["elapsed_s"],
	}
	rows.append(row)
	print(
	f"{name} \| epoch={epoch + 1:02d} "
	f"train_loss={row['train_loss']:.4f} train_acc={row['train_accuracy']:.3f} "
	f"val_loss={row['val_loss']:.4f} val_acc={row['val_accuracy']:.3f}"
	)

	history = pd.DataFrame(rows)
	final = rows[-1].copy()
	final["best_val_loss"] = float(history["val_loss"].min())
	final["best_val_accuracy"] = float(history["val_accuracy"].max())
	final.update(benchmark_forward(model, val_loader))
	final["params"] = count_parameters(model)
	return final, history, model


	def model_hidden_and_layers(model, x):
	if isinstance(model, PermutedMNISTModel):
	return model.input_proj(x), model.stack.layers, model.classifier, "classifier"
	if isinstance(model, TokenSequenceModel):
	return model.embedding(x), model.stack.layers, model.head, "sequence"
	if isinstance(model, TokenClassifierModel):
	return model.embedding(x), model.stack.layers, model.head, "classifier"
	raise TypeError(f"Unsupported model type: {type(model).__name__}")


	@torch.no_grad()
	def benchmark_model_inference_modes(model, loader, task_type, repeats=6):
	model.eval()
	x, y = next(iter(loader))
	x = x[: min(2, x.size(0))].to(DEVICE)
	y = y[: min(2, y.size(0))].to(DEVICE)
	token_count = x.shape[0] * x.shape[1]
	rows = []

	def reset_memory():
	if DEVICE.type == "cuda":
	torch.cuda.reset_peak_memory_stats()

	def max_memory_mb():
	if DEVICE.type != "cuda":
	return float("nan")
	return torch.cuda.max_memory_allocated() / (1024 ** 2)

	def score_logits(logits):
	if task_type == "sequence":
	valid = y != -100
	loss = F.cross_entropy(
	logits.reshape(-1, logits.size(-1)),
	y.reshape(-1),
	ignore_index=-100,
	)
	pred = logits.argmax(dim=-1)
	accuracy = ((pred == y) & valid).sum().float() / valid.sum().clamp_min(1)
	else:
	loss = F.cross_entropy(logits, y)
	accuracy = (logits.argmax(dim=-1) == y).float().mean()
	return float(loss.detach().cpu()), float(accuracy.detach().cpu())

	reset_memory()
	with torch.autocast(device_type=DEVICE.type, dtype=torch.float16, enabled=USE_AMP):
	full_logits = model(x)
	full_ce, full_acc = score_logits(full_logits)
	synchronize()
	start = time.perf_counter()
	for _ in range(repeats):
	with torch.autocast(device_type=DEVICE.type, dtype=torch.float16, enabled=USE_AMP):
	full_logits = model(x)
	synchronize()
	elapsed = time.perf_counter() - start
	rows.append({
	"mode": "prefill_full_sequence",
	"latency_ms": 1000.0 * elapsed / repeats,
	"tokens_per_s": token_count * repeats / max(elapsed, 1e-9),
	"ce": full_ce,
	"accuracy": full_acc,
	"match_mse": 0.0,
	"max_memory_mb": max_memory_mb(),
	})

	hidden, layers, head, output_kind = model_hidden_and_layers(model, x)
	states = []
	exact_caches = []
	cache_start = time.perf_counter()
	for layer in layers:
	state = layer.ssm.init_state(hidden.size(0), DEVICE, hidden.dtype)
	states.append(state)
	if hasattr(layer, "allocate_inference_cache"):
	exact_caches.append(layer.allocate_inference_cache(hidden.size(0), hidden.size(1), DEVICE, hidden.dtype))
	else:
	exact_caches.append(None)
	synchronize()
	cache_setup_ms = 1000.0 * (time.perf_counter() - cache_start)

	def recurrent_logits(cache_list):
	local_states = [state.clone() for state in states]
	outputs = []
	for t in range(hidden.size(1)):
	step_x = hidden[:, t, :]
	for layer_idx, layer in enumerate(layers):
	cache = None if cache_list is None else cache_list[layer_idx]
	try:
	step_x, local_states[layer_idx] = layer.step(step_x, local_states[layer_idx], cache=cache)
	except TypeError:
	step_x, local_states[layer_idx] = layer.step(step_x, local_states[layer_idx])
	outputs.append(step_x)
	hidden_seq = torch.stack(outputs, dim=1)
	if output_kind == "sequence":
	return head(hidden_seq)
	return head(hidden_seq[:, -1])

	def add_recurrent_row(mode, cache_list, setup_ms=None):
	reset_memory()
	logits = recurrent_logits(cache_list)
	ce, acc = score_logits(logits)
	synchronize()
	start = time.perf_counter()
	for _ in range(repeats):
	logits = recurrent_logits(cache_list)
	synchronize()
	elapsed = time.perf_counter() - start
	rows.append({
	"mode": mode,
	"cache_setup_ms": setup_ms,
	"latency_ms": 1000.0 * elapsed / repeats,
	"tokens_per_s": token_count * repeats / max(elapsed, 1e-9),
	"ce": ce,
	"accuracy": acc,
	"match_mse": float(F.mse_loss(logits, full_logits).detach().cpu()),
	"max_memory_mb": max_memory_mb(),
	})

	add_recurrent_row("decode_recurrent_exact", exact_caches, setup_ms=cache_setup_ms)

	if all(hasattr(layer, "allocate_deployment_cache") for layer in layers):
	cache_start = time.perf_counter()
	deploy_caches = [
	layer.allocate_deployment_cache(hidden.size(0), hidden.size(1), DEVICE, hidden.dtype)
	for layer in layers
	]
	synchronize()
	deploy_cache_setup_ms = 1000.0 * (time.perf_counter() - cache_start)
	add_recurrent_row("decode_deploy_lite", deploy_caches, setup_ms=deploy_cache_setup_ms)

	if all(hasattr(layer, "allocate_balanced_deployment_cache") for layer in layers):
	cache_start = time.perf_counter()
	balanced_caches = [
	layer.allocate_balanced_deployment_cache(hidden.size(0), hidden.size(1), DEVICE, hidden.dtype)
	for layer in layers
	]
	synchronize()
	balanced_cache_setup_ms = 1000.0 * (time.perf_counter() - cache_start)
	add_recurrent_row("decode_balanced", balanced_caches, setup_ms=balanced_cache_setup_ms)

	return rows
	"""
	)


	def run_cell():
	return code(
	r"""
	all_results = []
	all_histories = {}
	trained_models = {}
	challenge_inference_rows = []

	if RUN_PERMUTED_MNIST:
	config = CHALLENGE_CONFIG["mnist"]
	train_loader, val_loader = make_permuted_mnist_loaders(config)
	for kind in MODEL_KINDS:
	print(f"\n=== Permuted MNIST \| {kind} ===")
	model = PermutedMNISTModel(
	kind=kind,
	d_model=config["d_model"],
	hidden_dim=config["hidden_dim"],
	num_layers=config["num_layers"],
	)
	metrics, history, model = train_challenge_model(
	f"permuted_mnist/{kind}",
	model,
	train_loader,
	val_loader,
	config,
	run_classifier_epoch,
	)
	metrics.update({"task": "permuted_mnist", "model": kind, "chance_accuracy": 0.1})
	all_results.append(metrics)
	all_histories[(metrics["task"], kind)] = history
	trained_models[(metrics["task"], kind)] = model
	for row in benchmark_model_inference_modes(model, val_loader, task_type="classifier"):
	row.update({"task": "permuted_mnist", "model": kind})
	challenge_inference_rows.append(row)

	if RUN_SELECTIVE_COPYING:
	config = CHALLENGE_CONFIG["selective_copying"]
	train_loader, val_loader, vocab_size = make_selective_copying_loaders(config)
	for kind in MODEL_KINDS:
	print(f"\n=== Selective Copying \| {kind} ===")
	model = TokenSequenceModel(
	kind=kind,
	vocab_size=vocab_size,
	d_model=config["d_model"],
	hidden_dim=config["hidden_dim"],
	num_layers=config["num_layers"],
	)
	metrics, history, model = train_challenge_model(
	f"selective_copying/{kind}",
	model,
	train_loader,
	val_loader,
	config,
	run_sequence_epoch,
	)
	metrics.update({"task": "selective_copying", "model": kind, "chance_accuracy": 1.0 / config["vocab_size"]})
	all_results.append(metrics)
	all_histories[(metrics["task"], kind)] = history
	trained_models[(metrics["task"], kind)] = model
	for row in benchmark_model_inference_modes(model, val_loader, task_type="sequence"):
	row.update({"task": "selective_copying", "model": kind})
	challenge_inference_rows.append(row)

	if RUN_INDUCTION_RECALL:
	config = CHALLENGE_CONFIG["induction_recall"]
	train_loader, val_loader, vocab_size, num_classes = make_induction_recall_loaders(config)
	for kind in MODEL_KINDS:
	print(f"\n=== Induction Recall \| {kind} ===")
	model = TokenClassifierModel(
	kind=kind,
	vocab_size=vocab_size,
	num_classes=num_classes,
	d_model=config["d_model"],
	hidden_dim=config["hidden_dim"],
	num_layers=config["num_layers"],
	)
	metrics, history, model = train_challenge_model(
	f"induction_recall/{kind}",
	model,
	train_loader,
	val_loader,
	config,
	run_classifier_epoch,
	)
	metrics.update({"task": "induction_recall", "model": kind, "chance_accuracy": 1.0 / config["num_values"]})
	all_results.append(metrics)
	all_histories[(metrics["task"], kind)] = history
	trained_models[(metrics["task"], kind)] = model
	for row in benchmark_model_inference_modes(model, val_loader, task_type="classifier"):
	row.update({"task": "induction_recall", "model": kind})
	challenge_inference_rows.append(row)

	results_df = pd.DataFrame(all_results)
	display(results_df[
	[
	"task",
	"model",
	"val_loss",
	"val_accuracy",
	"best_val_loss",
	"best_val_accuracy",
	"chance_accuracy",
	"epoch_s",
	"forward_ms",
	"forward_tokens_per_s",
	"params",
	]
	].sort_values(["task", "model"]))
	challenge_inference_df = pd.DataFrame(challenge_inference_rows)
	display(challenge_inference_df.sort_values(["task", "model", "mode"]))
	"""
	)


	def plotting_cell():
	return code(
	r"""
	if len(all_histories) > 0:
	tasks = sorted({task for task, _ in all_histories})
	fig, axes = plt.subplots(len(tasks), 2, figsize=(12, 3.5 * len(tasks)), squeeze=False)
	for row, task in enumerate(tasks):
	for kind in MODEL_KINDS:
	hist = all_histories.get((task, kind))
	if hist is None:
	continue
	axes[row, 0].plot(hist["epoch"], hist["val_loss"], marker="o", label=kind)
	axes[row, 1].plot(hist["epoch"], hist["val_accuracy"], marker="o", label=kind)
	axes[row, 0].set_title(f"{task}: validation loss")
	axes[row, 1].set_title(f"{task}: validation accuracy")
	axes[row, 0].set_xlabel("epoch")
	axes[row, 1].set_xlabel("epoch")
	axes[row, 0].grid(alpha=0.25)
	axes[row, 1].grid(alpha=0.25)
	axes[row, 0].legend()
	axes[row, 1].legend()
	plt.tight_layout()
	plt.show()
	"""
	)


	def curriculum_cell():
	return code(
	r"""
	curriculum_rows = []
	curriculum_histories = {}

	if RUN_TOKEN_CURRICULUM:
	for spec in TOKEN_CURRICULUM:
	shared = {
	"epochs": spec["epochs"],
	"batch_size": 128,
	"train_samples": spec["train_samples"],
	"val_samples": spec["val_samples"],
	"seq_len": spec["seq_len"],
	"d_model": 64,
	"hidden_dim": 128,
	"num_layers": 2,
	"lr": 2e-3,
	"weight_decay": 1e-4,
	}

	copy_cfg = {
	**shared,
	"vocab_size": spec["classes"],
	"num_mem_tokens": spec["num_mem_tokens"],
	}
	train_loader, val_loader, vocab_size = make_selective_copying_loaders(copy_cfg)
	for kind in MODEL_KINDS:
	print(f"\n=== Curriculum selective_copying/{spec['difficulty']} \| {kind} ===")
	model = TokenSequenceModel(
	kind=kind,
	vocab_size=vocab_size,
	d_model=copy_cfg["d_model"],
	hidden_dim=copy_cfg["hidden_dim"],
	num_layers=copy_cfg["num_layers"],
	)
	metrics, history, model = train_challenge_model(
	f"curriculum_selective_copying_{spec['difficulty']}/{kind}",
	model,
	train_loader,
	val_loader,
	copy_cfg,
	run_sequence_epoch,
	)
	metrics.update({
	"task": "selective_copying",
	"difficulty": spec["difficulty"],
	"model": kind,
	"classes": spec["classes"],
	"seq_len": spec["seq_len"],
	"chance_accuracy": 1.0 / spec["classes"],
	})
	curriculum_rows.append(metrics)
	curriculum_histories[(metrics["task"], spec["difficulty"], kind)] = history
	if spec["difficulty"] in {"easy", "hard"}:
	for row in benchmark_model_inference_modes(model, val_loader, task_type="sequence"):
	row.update({
	"task": "selective_copying",
	"difficulty": spec["difficulty"],
	"model": kind,
	})
	challenge_inference_rows.append(row)

	induction_cfg = {
	**shared,
	"num_keys": spec["classes"],
	"num_values": spec["classes"],
	}
	train_loader, val_loader, vocab_size, num_classes = make_induction_recall_loaders(induction_cfg)
	for kind in MODEL_KINDS:
	print(f"\n=== Curriculum induction_recall/{spec['difficulty']} \| {kind} ===")
	model = TokenClassifierModel(
	kind=kind,
	vocab_size=vocab_size,
	num_classes=num_classes,
	d_model=induction_cfg["d_model"],
	hidden_dim=induction_cfg["hidden_dim"],
	num_layers=induction_cfg["num_layers"],
	)
	metrics, history, model = train_challenge_model(
	f"curriculum_induction_recall_{spec['difficulty']}/{kind}",
	model,
	train_loader,
	val_loader,
	induction_cfg,
	run_classifier_epoch,
	)
	metrics.update({
	"task": "induction_recall",
	"difficulty": spec["difficulty"],
	"model": kind,
	"classes": spec["classes"],
	"seq_len": spec["seq_len"],
	"chance_accuracy": 1.0 / spec["classes"],
	})
	curriculum_rows.append(metrics)
	curriculum_histories[(metrics["task"], spec["difficulty"], kind)] = history
	if spec["difficulty"] in {"easy", "hard"}:
	for row in benchmark_model_inference_modes(model, val_loader, task_type="classifier"):
	row.update({
	"task": "induction_recall",
	"difficulty": spec["difficulty"],
	"model": kind,
	})
	challenge_inference_rows.append(row)

	curriculum_df = pd.DataFrame(curriculum_rows)
	if not curriculum_df.empty:
	display(curriculum_df[
	[
	"task",
	"difficulty",
	"model",
	"classes",
	"seq_len",
	"val_loss",
	"val_accuracy",
	"best_val_accuracy",
	"chance_accuracy",
	"epoch_s",
	"forward_tokens_per_s",
	]
	].sort_values(["task", "difficulty", "model"]))
	else:
	display(curriculum_df)

	if challenge_inference_rows:
	challenge_inference_df = pd.DataFrame(challenge_inference_rows)
	sort_cols = ["task"]
	if "difficulty" in challenge_inference_df.columns:
	sort_cols.append("difficulty")
	sort_cols.extend(["model", "mode"])
	display(challenge_inference_df.sort_values(sort_cols))
	"""
	)


	def curriculum_plot_cell():
	return code(
	r"""
	if "curriculum_df" in globals() and not curriculum_df.empty:
	for task in sorted(curriculum_df["task"].unique()):
	subset = curriculum_df[curriculum_df["task"] == task]
	fig, ax = plt.subplots(figsize=(10, 4))
	for kind in MODEL_KINDS:
	model_subset = subset[subset["model"] == kind].set_index("difficulty").loc[["easy", "moderate", "hard"]]
	ax.plot(model_subset.index, model_subset["best_val_accuracy"], marker="o", label=kind)
	chance = subset.drop_duplicates("difficulty").set_index("difficulty").loc[["easy", "moderate", "hard"]]
	ax.plot(chance.index, chance["chance_accuracy"], linestyle="--", color="black", label="chance")
	ax.set_title(f"{task}: best validation accuracy by difficulty")
	ax.set_ylabel("accuracy")
	ax.grid(alpha=0.25)
	ax.legend()
	plt.show()
	"""
	)


	def notes_cell():
	return md(
	"""
	## How To Read This Notebook

	- Permuted MNIST stresses long-range sequence classification because each image is flattened to 784 tokens and then shuffled by a fixed random permutation.
	- Selective copying stresses sparse memory recall: the model sees a few marked values early and must emit them at query positions near the end.
	- Induction recall is an induction-head-style associative recall task: the model observes a key-value pair, later receives the same key as a query, and must predict the associated value.

	The synthetic tasks are intentionally small enough for Colab iteration. The curriculum section is especially important: if a model cannot beat chance on the easy tier, the issue is probably model mechanism or task formulation rather than too few epochs. If it wins easy but fails hard, the next knobs are capacity, sample count, and sequence length.
	"""
	)


	def build_notebook():
	cells = [
	md(
	"""
	# Gamma S4 Challenge Benchmarks

	This notebook adds three harder sequence benchmarks requested by the technical lead:

	1. Permuted MNIST image classification, following the long-memory benchmark tradition used in the HiPPO paper.
	2. Selective copying, a synthetic content-based recall benchmark discussed in the Mamba line of work.
	3. Induction-style associative recall, a compact proxy for induction-head behavior.

	The goal is not to claim state of the art from this first version. The goal is to create a repeatable stress test that tells us whether Gamma S4 is moving toward practical long-context and token-memory behavior.
	"""
	),
	setup_cell(),
	imports_cell(),
	config_cell(),
	model_cell(),
	data_cell(),
	training_cell(),
	run_cell(),
	plotting_cell(),
	md(
	"""
	## Token Memory Curriculum

	The main challenge tasks are deliberately hard. This curriculum repeats selective copying and induction recall at easy, moderate, and hard settings so we can tell whether failures are caused by too few epochs, insufficient capacity, or a missing content-selective memory mechanism.
	"""
	),
	curriculum_cell(),
	curriculum_plot_cell(),
	notes_cell(),
	]
	return {
	"cells": cells,
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3",
	},
	"language_info": {
	"name": "python",
	"pygments_lexer": "ipython3",
	},
	"colab": {
	"name": "gamma-s4-challenge-benchmark.ipynb",
	},
	},
	"nbformat": 4,
	"nbformat_minor": 5,
	}


	def main():
	NOTEBOOK_PATH.parent.mkdir(parents=True, exist_ok=True)
	NOTEBOOK_PATH.write_text(json.dumps(build_notebook(), indent=2), encoding="utf-8")
	print(f"Wrote {NOTEBOOK_PATH}")


	if __name__ == "__main__":
	main()