# llm_lab/training/debugger.py  (project: LLM-1B-Lab)
# Commit 2a50172 (Vjeong): Remove redundant detect_scenario from LossDebugger
"""LLM Loss Debugging & Optimization Framework.
A systematic 5-level debugging framework for diagnosing training issues.
Always start from Level 1 β€” fixing lower-level bugs before tuning
hyperparameters saves time.
Levels:
0. Status Diagnosis β€” classify current training health
1. Data/Implementation β€” most common cause (70% of issues)
2. Numerical Stability β€” dtype, normalization, gradient health
3. Hyperparameters β€” LR, batch size, warmup
4. Fitting Diagnosis β€” overfitting vs underfitting
5. Architecture β€” initialization, component checks
"""
import copy
import math
from typing import Any, Dict, List, Optional, Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from llm_lab.config import TrainConfig
# ═══════════════════════════════════════════════════════════════════
# Constants
# ═══════════════════════════════════════════════════════════════════
# Approximate convergence ranges for a 1B model trained on ~10B tokens.
# Estimated from GPT-2 scaling benchmarks (Radford et al. 2019) and
# Chinchilla scaling laws (Hoffmann et al. 2022). Not dataset-specific.
# Each tuple is an inclusive (low, high) band.
_EXPECTED_TRAIN_LOSS = (2.5, 3.3)
_EXPECTED_VAL_LOSS = (2.7, 3.6)
_EXPECTED_VAL_PPL = (15, 37)
# Status labels — possible values of the 'status' key returned by
# LossDebugger.diagnose_status (Level 0 classification).
STATUS_NORMAL = "NORMAL"
STATUS_NO_DECREASE = "NO_DECREASE"
STATUS_DIVERGING = "DIVERGING"
STATUS_PLATEAU = "PLATEAU"
STATUS_OVERFITTING = "OVERFITTING"
STATUS_UNSTABLE = "UNSTABLE"
STATUS_NAN_DETECTED = "NAN_DETECTED"
STATUS_LOSS_BOUNCE = "LOSS_BOUNCE"
# GPT-3 LR reference by model size (Brown et al. 2020, Table 2.1)
# (param_count, recommended_lr, batch_tokens_str)
_GPT3_LR_REFERENCE = [
    (125e6, 6e-4, "0.5M"),
    (350e6, 3e-4, "0.5M"),
    (760e6, 2.5e-4, "0.5M"),
    (1.3e9, 2e-4, "1M"),
    (2.7e9, 1.6e-4, "1M"),
    (6.7e9, 1.2e-4, "2M"),
    (13e9, 1e-4, "2M"),
    (175e9, 6e-5, "3.2M"),
]
# Known LLM training references (optimizer hyperparameters as published;
# 'warmup' is in optimizer steps where the project reported one).
_LLM_TRAINING_REFS = {
    "TinyLlama-1.1B": {"lr": 4e-4, "beta2": 0.95, "wd": 0.1, "warmup": 2000},
    "LLaMA-7B": {"lr": 3e-4, "beta2": 0.95, "wd": 0.1, "warmup": 2000},
    "Pythia-1B": {"lr": 3e-4, "beta2": 0.95, "wd": 0.01},
    "OLMo-1B": {"lr": 4e-4, "beta2": 0.95, "wd": 0.1},
}
# Recommended β₂ for LLM training vs. the PyTorch AdamW default.
_RECOMMENDED_BETA2 = 0.95
_DEFAULT_PYTORCH_BETA2 = 0.999
def _header(title: str) -> str:
return f"\n{'=' * 60}\n{title}\n{'=' * 60}"
def _check_result(name: str, passed: bool, detail: str = "") -> Dict[str, Any]:
return {"name": name, "passed": passed, "detail": detail}
# ═══════════════════════════════════════════════════════════════════
# LossDebugger
# ═══════════════════════════════════════════════════════════════════
class LossDebugger:
"""5-level loss debugging framework for LLM training.
Usage::
from llm_lab.training.debugger import LossDebugger
# Quick status check
status = LossDebugger.diagnose_status(vocab_size=32000,
metrics_history=trainer.metrics.history)
# Full diagnostics
report = LossDebugger.run_diagnostics(
model=model, dataloader=train_dl, tokenizer=tok,
train_config=train_cfg, metrics_history=trainer.metrics.history,
device=device, dtype=torch.bfloat16,
)
"""
# ───────────────────────────────────────────────────────────────
# Level 0: Status Diagnosis
# ───────────────────────────────────────────────────────────────
@staticmethod
def diagnose_status(
    vocab_size: int,
    metrics_history: Dict[str, list],
) -> Dict[str, Any]:
    """Classify current training health from metrics history (Level 0).

    Read-only pass over logged metrics: filters NaN train losses,
    computes trend statistics (half-vs-half averages, recent-window
    variance, moving-average minimum), then walks an ordered if/elif
    chain of checks from most to least severe. The first matching
    check determines the result.

    Args:
        vocab_size: model vocabulary size (e.g. 32000); used only to
            derive the expected random-init loss ln(vocab_size).
        metrics_history: dict with keys 'train_loss', 'val_loss', etc.
            'train_loss' entries are floats and may contain NaN;
            'val_loss' may contain None placeholders.
            # NOTE(review): assumes one entry per logged step — confirm
            # against the trainer's metrics logger.
    Returns:
        dict with 'status' (a STATUS_* label or 'INSUFFICIENT_DATA'),
        'severity' ('red'/'yellow'/'green'/'unknown'), 'details', and
        'recommended_levels' (framework levels to run next).
    """
    print(_header("Level 0: Training Status Diagnosis"))
    # Cross-entropy of a uniform predictor over V classes is ln(V).
    expected_initial = math.log(vocab_size)
    print(f" Expected initial loss (random weights): ln({vocab_size}) = {expected_initial:.2f}")
    print(f" Normal convergence range (1B, 10B tokens):")
    print(f" Train Loss: {_EXPECTED_TRAIN_LOSS[0]} ~ {_EXPECTED_TRAIN_LOSS[1]}")
    print(f" Val Loss: {_EXPECTED_VAL_LOSS[0]} ~ {_EXPECTED_VAL_LOSS[1]}")
    print(f" Val PPL: {_EXPECTED_VAL_PPL[0]} ~ {_EXPECTED_VAL_PPL[1]}")
    raw_train_losses = metrics_history.get("train_loss", [])
    # Drop NaNs for trend math; their presence is remembered via
    # has_nan below and surfaced as STATUS_NAN_DETECTED (Check 3).
    train_losses = [l for l in raw_train_losses if not math.isnan(l)]
    # None means "not evaluated at this step". NaN val losses are NOT
    # filtered here — presumably val loss never goes NaN without the
    # train loss doing so first (TODO confirm).
    val_losses = [v for v in metrics_history.get("val_loss", []) if v is not None]
    if len(train_losses) < 2:
        print("\n [!] Not enough training data to diagnose. Run more steps first.")
        return {
            "status": "INSUFFICIENT_DATA",
            "severity": "unknown",
            "details": "Need at least 2 logged train loss values.",
            "recommended_levels": [1],
        }
    # Detect NaN presence by comparing pre-/post-filter lengths.
    has_nan = len(train_losses) < len(raw_train_losses)
    if has_nan:
        nan_count = len(raw_train_losses) - len(train_losses)
        print(f"\n ⚠ {nan_count} NaN values detected in train_loss — filtered for analysis")
    first_loss = train_losses[0]
    last_loss = train_losses[-1]
    loss_change = first_loss - last_loss  # positive == loss decreased
    # Split into halves for trend analysis.
    # mid >= 1 here because len(train_losses) >= 2, so no zero division.
    mid = len(train_losses) // 2
    first_half_avg = sum(train_losses[:mid]) / mid
    second_half_avg = sum(train_losses[mid:]) / (len(train_losses) - mid)
    # Recent window (last <=50 points) for spike/variance detection.
    recent_n = min(50, len(train_losses))
    recent = train_losses[-recent_n:]
    recent_mean = sum(recent) / len(recent)
    recent_var = sum((x - recent_mean) ** 2 for x in recent) / len(recent)
    recent_std = recent_var ** 0.5
    # Val trend: compare first-half vs second-half averages with an
    # asymmetric dead zone (-0.05 / +0.1) to bias toward "flat".
    val_trend = "unknown"
    if len(val_losses) >= 2:
        val_mid = len(val_losses) // 2
        val_first_avg = sum(val_losses[:max(val_mid, 1)]) / max(val_mid, 1)
        val_second_avg = sum(val_losses[val_mid:]) / max(len(val_losses) - val_mid, 1)
        if val_second_avg < val_first_avg - 0.05:
            val_trend = "decreasing"
        elif val_second_avg > val_first_avg + 0.1:
            val_trend = "increasing"
        else:
            val_trend = "flat"
    # Pre-compute bounce detection using moving-average minimum
    # to avoid false positives from single noisy data points
    _ma_window = max(1, len(train_losses) // 20)  # 5% window
    _ma_losses = [
        sum(train_losses[max(0, i - _ma_window + 1):i + 1])
        / (i - max(0, i - _ma_window + 1) + 1)
        for i in range(len(train_losses))
    ]
    _min_ma_loss = min(_ma_losses)
    _min_ma_idx = _ma_losses.index(_min_ma_loss)
    _last_ma_loss = _ma_losses[-1]
    _bounce_amount = _last_ma_loss - _min_ma_loss
    # Bounce = net loss decrease overall, minimum reached before the
    # last 15% of training, and a >=5% relative rise off that minimum.
    _has_bounce = (
        loss_change > 0.1
        and _min_ma_idx < len(train_losses) * 0.85
        and _bounce_amount > _min_ma_loss * 0.05
    )
    # Downgrade bounce severity when val loss is still improving
    # (either trending down, or the latest val loss beats the entire
    # first half's minimum).
    _val_improving = (
        val_trend == "decreasing"
        or (len(val_losses) >= 4
            and val_losses[-1] <= min(val_losses[:len(val_losses) // 2]))
    )
    # ── Classify ──
    # Checks are ordered most-severe-first; the first match wins.
    status = STATUS_NORMAL
    severity = "green"
    details = ""
    recommended_levels: List[int] = []
    # Check 1: No decrease at all (and still near the random-init loss)
    if loss_change < 0.1 and first_loss > expected_initial - 2.0:
        status = STATUS_NO_DECREASE
        severity = "red"
        details = (
            f"Loss barely changed: {first_loss:.4f} -> {last_loss:.4f} "
            f"(delta={loss_change:.4f}). Likely a data or implementation bug."
        )
        recommended_levels = [1, 2]
    # Check 2: Diverging — current loss worse than a random model
    elif last_loss > expected_initial + 1.0:
        status = STATUS_DIVERGING
        severity = "red"
        details = (
            f"Loss ({last_loss:.4f}) exceeds initial value ({expected_initial:.2f}). "
            f"Training is diverging — check LR, data, or numerical issues."
        )
        recommended_levels = [1, 2, 3]
    # Check 3: NaN detected in training loss
    elif has_nan:
        nan_count = len(raw_train_losses) - len(train_losses)
        # Safe: has_nan guarantees at least one NaN exists.
        nan_idx = next(i for i, l in enumerate(raw_train_losses) if math.isnan(l))
        status = STATUS_NAN_DETECTED
        severity = "red"
        details = (
            f"NaN detected in train_loss: {nan_count} NaN values "
            f"(first at step ~{nan_idx}). "
            f"Before NaN: {first_loss:.4f} -> {last_loss:.4f}. "
            f"Check gradient norms, LR schedule, and numerical precision."
        )
        recommended_levels = [2, 3]
    # Check 4: Unstable — recent std more than half the recent mean
    elif recent_std > 0.5 * recent_mean:
        status = STATUS_UNSTABLE
        severity = "yellow"
        details = (
            f"High loss variance: std={recent_std:.4f}, mean={recent_mean:.4f}. "
            f"Training is unstable — likely LR too high or batch too small."
        )
        recommended_levels = [3, 2]
    # Check 5: Loss bounce (decreased then increased again)
    elif _has_bounce:
        status = STATUS_LOSS_BOUNCE
        if _val_improving:
            # Benign variant: val loss still improving, so the train
            # bounce is likely just data distribution variation.
            severity = "green"
            details = (
                f"Train loss bounced (moving-avg): "
                f"{first_loss:.4f} -> min {_min_ma_loss:.4f} -> {_last_ma_loss:.4f} "
                f"(bounce={_bounce_amount:.4f}), but val loss is still improving "
                f"({val_losses[0]:.4f} -> {val_losses[-1]:.4f}). "
                f"Likely data distribution variation, not a real issue."
            )
            recommended_levels = []
        else:
            severity = "yellow"
            details = (
                f"Train loss bounced (moving-avg): "
                f"{first_loss:.4f} -> min {_min_ma_loss:.4f} -> {_last_ma_loss:.4f} "
                f"(bounce={_bounce_amount:.4f}). "
                f"Possible LR too high, data issue, or overfitting."
            )
            recommended_levels = [3, 4]
    # Check 6: Overfitting — train improving while val worsens
    elif val_trend == "increasing" and second_half_avg < first_half_avg:
        status = STATUS_OVERFITTING
        severity = "yellow"
        details = (
            f"Train loss decreasing but val loss increasing. "
            f"Train trend: {first_half_avg:.4f} -> {second_half_avg:.4f}, "
            f"Val trend: {val_trend}."
        )
        recommended_levels = [4]
    # Check 7: Plateau — flat halves while above the expected band
    elif abs(second_half_avg - first_half_avg) < 0.05 and last_loss > _EXPECTED_TRAIN_LOSS[1]:
        status = STATUS_PLATEAU
        severity = "yellow"
        details = (
            f"Loss has plateaued: first half avg={first_half_avg:.4f}, "
            f"second half avg={second_half_avg:.4f}. "
            f"Current loss ({last_loss:.4f}) is above expected range."
        )
        recommended_levels = [3, 4, 5]
    # Normal — nothing above fired
    else:
        status = STATUS_NORMAL
        severity = "green"
        details = (
            f"Training looks healthy: {first_loss:.4f} -> {last_loss:.4f} "
            f"(delta={loss_change:.4f}). Val trend: {val_trend}."
        )
        recommended_levels = []
    # ── Print ──
    icons = {"red": "🔴", "yellow": "🟡", "green": "🟢"}
    icon = icons.get(severity, "⚪")
    print(f"\n {icon} Status: {status}")
    print(f" {details}")
    if recommended_levels:
        print(f" Recommended: check Level(s) {recommended_levels}")
    return {
        "status": status,
        "severity": severity,
        "details": details,
        "recommended_levels": recommended_levels,
    }
# ───────────────────────────────────────────────────────────────
# Level 1: Data / Implementation Bug Checks
# ───────────────────────────────────────────────────────────────
@staticmethod
def check_data_pipeline(
    model: nn.Module,
    dataloader: DataLoader,
    tokenizer: Any,
    vocab_size: int,
    device: torch.device,
    dtype: torch.dtype = torch.bfloat16,
) -> Dict[str, Any]:
    """Run 6 data/implementation checks (Level 1).

    This is the most important level — 70% of loss issues are data bugs.

    Checks:
        1. Shift relationship (targets[t] == input_ids[t+1])
        2. Token range (0 <= ids < vocab_size)
        3. Initial loss (≈ ln(vocab_size) for random weights)
        4. Single-batch overfit (loss → ~0 in 200-400 steps)
        5. Tokenizer roundtrip (encode→decode preserves text)
        6. Data quality sampling (visual inspection)

    Args:
        model: LM whose forward is ``model(input_ids, targets) ->
            (logits, loss)``.
            # NOTE(review): Check 3 also calls ``model._init_weights()``
            # on a deep copy — confirm all model classes expose it.
        dataloader: yields dicts with 'input_ids' and 'targets' tensors.
        tokenizer: object providing ``encode(str)`` / ``decode(ids)``.
        vocab_size: vocabulary size for the range and initial-loss checks.
        device: device the forward/backward passes run on.
        dtype: autocast dtype; float32 disables autocast.

    Returns:
        dict with 'level', 'checks', plus 'passed'/'failed' partitions.
    """
    print(_header("Level 1: Data / Implementation Bug Checks"))
    print(" (70% of loss issues come from data pipeline bugs)\n")
    results: List[Dict[str, Any]] = []
    batch = next(iter(dataloader))
    input_ids = batch["input_ids"]
    targets = batch["targets"]
    # Fix: autocast previously hardcoded device_type="cuda", which
    # breaks when `device` is CPU (or MPS). Honor the caller's device.
    amp_device = device.type
    # ── Check 1: Shift relationship ──
    # For next-token prediction, targets must be input_ids shifted by 1.
    shift_match = (input_ids[:, 1:] == targets[:, :-1]).float().mean().item()
    passed = shift_match > 0.99
    detail = f"Shift consistency: {shift_match * 100:.1f}% (should be ~100%)"
    results.append(_check_result("Shift relationship", passed, detail))
    icon = "✅" if passed else "❌"
    print(f" {icon} Check 1: {detail}")
    # ── Check 2: Token range ──
    min_id = input_ids.min().item()
    max_id = input_ids.max().item()
    range_ok = min_id >= 0 and max_id < vocab_size
    detail = f"Token range: [{min_id}, {max_id}], vocab_size={vocab_size}"
    results.append(_check_result("Token range", range_ok, detail))
    icon = "✅" if range_ok else "❌"
    print(f" {icon} Check 2: {detail}")
    # ── Check 3: Initial loss ──
    # A freshly initialized model should score ~ln(V); large deviations
    # point at label mismatch, loss bugs, or data leakage.
    expected_loss = math.log(vocab_size)
    model_copy = copy.deepcopy(model)
    model_copy._init_weights()  # re-initialize to random
    model_copy.to(device)
    model_copy.eval()
    with torch.no_grad():
        with torch.amp.autocast(device_type=amp_device, dtype=dtype, enabled=(dtype != torch.float32)):
            _, initial_loss = model_copy(
                input_ids.to(device),
                targets.to(device),
            )
    initial_loss_val = initial_loss.item()
    loss_diff = abs(initial_loss_val - expected_loss)
    loss_ok = loss_diff < 1.0
    detail = (
        f"Initial loss: {initial_loss_val:.4f} vs expected {expected_loss:.2f} "
        f"(diff={loss_diff:.4f})"
    )
    results.append(_check_result("Initial loss", loss_ok, detail))
    icon = "✅" if loss_ok else "❌"
    print(f" {icon} Check 3: {detail}")
    if initial_loss_val > expected_loss + 1.0:
        print(f" Hint: loss >> ln(V) suggests label mismatch or loss function bug")
    elif initial_loss_val < expected_loss - 2.0:
        print(f" Hint: loss << ln(V) suggests data leakage")
    del model_copy
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    # ── Check 4: Single-batch overfit test ──
    # A healthy model must be able to memorize one sample; failure here
    # means a model/loss bug, not a tuning problem.
    # Scale LR and steps based on model size to avoid instability.
    num_params = sum(p.numel() for p in model.parameters())
    if num_params > 500e6:
        overfit_lr, overfit_steps = 1e-4, 400
    elif num_params > 50e6:
        overfit_lr, overfit_steps = 3e-4, 300
    else:
        overfit_lr, overfit_steps = 1e-3, 200
    print(f"\n ⏳ Check 4: Single-batch overfit test ({overfit_steps} steps, lr={overfit_lr:.0e})...")
    overfit_model = copy.deepcopy(model)
    overfit_model.to(device)
    overfit_model.train()
    overfit_optimizer = torch.optim.AdamW(overfit_model.parameters(), lr=overfit_lr)
    single_input = input_ids[:1].to(device)  # single sample
    single_target = targets[:1].to(device)
    log_interval = max(overfit_steps // 4, 1)
    overfit_losses = []
    for step in range(overfit_steps):
        overfit_optimizer.zero_grad()
        with torch.amp.autocast(device_type=amp_device, dtype=dtype, enabled=(dtype != torch.float32)):
            _, loss = overfit_model(single_input, single_target)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(overfit_model.parameters(), 1.0)
        overfit_optimizer.step()
        overfit_losses.append(loss.item())
        if (step + 1) % log_interval == 0:
            print(f" Step {step + 1}: Loss = {loss.item():.4f}")
    final_overfit_loss = overfit_losses[-1]
    # Use the minimum, not the final value: the loss can bounce at the
    # end once the single batch is essentially memorized.
    min_overfit_loss = min(overfit_losses)
    overfit_ok = min_overfit_loss < 0.5
    detail = (
        f"Single-batch overfit: {overfit_losses[0]:.4f} -> {final_overfit_loss:.4f} "
        f"(min={min_overfit_loss:.4f}, target < 0.5)"
    )
    results.append(_check_result("Single-batch overfit", overfit_ok, detail))
    icon = "✅" if overfit_ok else "❌"
    print(f" {icon} Check 4: {detail}")
    if not overfit_ok:
        print(f" CRITICAL: Model cannot memorize a single batch!")
        print(f" This means the model or loss function has a bug.")
    del overfit_model, overfit_optimizer
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    # ── Check 5: Tokenizer roundtrip ──
    # Substring containment (not equality) tolerates added special tokens.
    test_text = "The quick brown fox jumps over the lazy dog."
    encoded = tokenizer.encode(test_text)
    decoded = tokenizer.decode(encoded)
    roundtrip_ok = test_text.strip() in decoded.strip()
    detail = f"Roundtrip: '{test_text}' -> '{decoded.strip()}'"
    results.append(_check_result("Tokenizer roundtrip", roundtrip_ok, detail))
    icon = "✅" if roundtrip_ok else "❌"
    print(f" {icon} Check 5: {detail}")
    # ── Check 6: Data quality sampling ──
    # No pass/fail — prints decoded samples for human inspection.
    print(f"\n 📋 Check 6: Data quality sampling (visual inspection)")
    for i in range(min(3, input_ids.shape[0])):
        sample_tokens = input_ids[i][:100].tolist()
        decoded_text = tokenizer.decode(sample_tokens)
        preview = decoded_text[:200].replace("\n", "\\n")
        print(f" Sample {i}: {preview}...")
    passed_count = sum(1 for r in results if r["passed"])
    total_count = len(results)
    print(f"\n Result: {passed_count}/{total_count} checks passed")
    return {
        "level": 1,
        "checks": results,
        "passed": [r for r in results if r["passed"]],
        "failed": [r for r in results if not r["passed"]],
    }
# ───────────────────────────────────────────────────────────────
# Level 2: Numerical Stability
# ───────────────────────────────────────────────────────────────
@staticmethod
def check_numerical_stability(
    model: nn.Module,
    dataloader: DataLoader,
    device: torch.device,
    dtype: torch.dtype = torch.bfloat16,
) -> Dict[str, Any]:
    """Check for NaN/Inf in gradients, activations, and logits (Level 2).

    Runs one forward+backward pass on a single batch with forward hooks
    on each transformer layer, then inspects gradients, per-layer
    activation statistics, activation growth, and logit scale.

    Checks:
        - Mixed precision config (Norm fp32 upcast, loss dtype advice)
        - NaN/Inf gradients → softmax overflow, bad data
        - Inf gradients → log(0) in loss, missing ignore_index
        - Large activations growing per layer → init or norm bug
        - Logit scale → should be < 1000

    Args:
        model: LM whose forward is ``model(input_ids, targets) ->
            (logits, loss)``.
            # NOTE(review): assumes ``model.layers`` is an iterable of
            # transformer blocks — confirm against the model class.
        dataloader: yields dicts with 'input_ids' and 'targets' tensors.
        device: device to run the pass on.
        dtype: autocast dtype; float16 additionally exercises GradScaler.

    Returns:
        dict with 'level', 'checks', 'activation_stats', 'grad_issues'.
    """
    import inspect  # hoisted here from the norm-inspection loop below
    print(_header("Level 2: Numerical Stability Checks"))
    batch = next(iter(dataloader))
    input_ids = batch["input_ids"].to(device)
    targets = batch["targets"].to(device)
    results: List[Dict[str, Any]] = []
    activation_stats: List[Dict[str, Any]] = []
    # Fix: autocast previously hardcoded device_type="cuda", breaking
    # CPU runs even though a `device` argument is taken.
    amp_device = device.type
    # ── Mixed Precision Configuration Check ──
    print("\n Mixed Precision Config:")
    print(f" Training dtype: {dtype}")
    # Check RMSNorm fp32 upcast by scanning the forward() source of each
    # distinct *Norm class for a .float()/float32 upcast.
    norm_fp32_ok = True
    checked_norm_classes: set = set()
    for name, module in model.named_modules():
        cls_name = module.__class__.__name__
        if "Norm" in cls_name and cls_name not in checked_norm_classes:
            checked_norm_classes.add(cls_name)
            try:
                src = inspect.getsource(type(module).forward)
                has_upcast = ".float()" in src or "float32" in src
            except (TypeError, OSError):
                has_upcast = True  # assume ok if can't inspect
            if not has_upcast:
                norm_fp32_ok = False
                print(f" 🔴 {cls_name}: no fp32 upcast detected!")
    if norm_fp32_ok:
        print(f" ✅ Norm layers use fp32 upcast (safe)")
    results.append(_check_result(
        "Norm fp32 upcast", norm_fp32_ok,
        "Norm computes in fp32" if norm_fp32_ok else "Norm may lose precision in half dtype",
    ))
    # Check loss computation dtype (advice only, not a pass/fail check)
    if dtype in (torch.bfloat16, torch.float16):
        print(f" ℹ️ Best practice: compute loss in fp32 when using {dtype}")
        print(f" logits_fp32 = logits.float()")
        print(f" loss = F.cross_entropy(logits_fp32.view(-1, V), targets.view(-1))")
    # Common numerical issues reference
    print("\n Common Numerical Issues Reference:")
    print(" ┌────────────────────────┬──────────────────────────┬─────────────────────────┐")
    print(" │ Symptom │ Likely Cause │ Solution │")
    print(" ├────────────────────────┼──────────────────────────┼─────────────────────────┤")
    print(" │ Loss → NaN │ Large logits → softmax │ Check init, logit scale │")
    print(" │ Loss → Inf │ log(0) in CE loss │ Add eps, ignore_index │")
    print(" │ Loss oscillation │ fp16 gradient underflow │ Switch to bf16 / scaler │")
    print(" │ Late-training NaN │ Activation growth │ Check RMSNorm, wd │")
    print(" └────────────────────────┴──────────────────────────┴─────────────────────────┘")
    # ── Activation monitoring via hooks ──
    hooks = []
    def make_hook(name: str):
        # Closure per layer: records mean/std/max and NaN/Inf flags.
        def hook_fn(module, input, output):
            if isinstance(output, torch.Tensor):
                out_f = output.float()
                stats = {
                    "name": name,
                    "mean": out_f.mean().item(),
                    "std": out_f.std().item(),
                    "max": out_f.abs().max().item(),
                    "has_nan": bool(torch.isnan(output).any()),
                    "has_inf": bool(torch.isinf(output).any()),
                }
                activation_stats.append(stats)
        return hook_fn
    # Register hooks on transformer layers
    for i, layer in enumerate(model.layers):
        h = layer.register_forward_hook(make_hook(f"layer_{i}"))
        hooks.append(h)
    # ── Forward + Backward ──
    model.train()
    model.zero_grad(set_to_none=True)
    use_scaler = dtype == torch.float16 and torch.cuda.is_available()
    scaler = torch.amp.GradScaler() if use_scaler else None
    with torch.amp.autocast(device_type=amp_device, dtype=dtype, enabled=(dtype != torch.float32)):
        logits, loss = model(input_ids, targets)
    loss_val = loss.item()
    loss_ok = not (math.isnan(loss_val) or math.isinf(loss_val))
    results.append(_check_result(
        "Loss value",
        loss_ok,
        f"Loss = {loss_val:.4f}" if loss_ok else f"Loss = {loss_val} (NaN/Inf!)"
    ))
    if scaler is not None:
        scaler.scale(loss).backward()
        # Throwaway lr=0 optimizer: unscale_ needs an optimizer handle
        # to divide the gradients in place so they can be inspected.
        _temp_opt = torch.optim.SGD(model.parameters(), lr=0)
        scaler.unscale_(_temp_opt)
    else:
        loss.backward()
    # Remove hooks so later passes are unaffected
    for h in hooks:
        h.remove()
    # ── Gradient checks ──
    print("\n Gradient Health:")
    grad_issues = []
    for name, param in model.named_parameters():
        if param.grad is None:
            continue
        grad = param.grad
        if torch.isnan(grad).any():
            grad_issues.append(f"🔴 NaN gradient: {name}")
        if torch.isinf(grad).any():
            grad_issues.append(f"🔴 Inf gradient: {name}")
        if grad.abs().max().item() > 100:
            grad_issues.append(
                f"🟡 Large gradient: {name} max={grad.abs().max().item():.1f}"
            )
    grad_ok = len(grad_issues) == 0
    if grad_ok:
        print(" ✅ All gradients are healthy (no NaN/Inf/large values)")
    else:
        for issue in grad_issues[:10]:  # limit output
            print(f" {issue}")
        if len(grad_issues) > 10:
            print(f" ... and {len(grad_issues) - 10} more issues")
    results.append(_check_result(
        "Gradient health",
        grad_ok,
        f"{len(grad_issues)} issues found" if not grad_ok else "All healthy",
    ))
    # ── Activation checks ──
    print("\n Activation Stats (per transformer layer):")
    act_nan_count = 0
    for stats in activation_stats:
        icon = "🔴" if stats["has_nan"] or stats["has_inf"] else " "
        if stats["has_nan"] or stats["has_inf"]:
            act_nan_count += 1
        print(
            f" {icon} {stats['name']}: "
            f"mean={stats['mean']:.4f}, "
            f"std={stats['std']:.4f}, "
            f"max={stats['max']:.4f}"
            + (" [NaN!]" if stats["has_nan"] else "")
            + (" [Inf!]" if stats["has_inf"] else "")
        )
    act_ok = act_nan_count == 0
    results.append(_check_result(
        "Activation health",
        act_ok,
        f"{act_nan_count} layers with NaN/Inf" if not act_ok else "All layers healthy",
    ))
    # ── Activation growth trend ──
    # Growing std across depth hints at init/normalization problems.
    if len(activation_stats) >= 2:
        stds = [s["std"] for s in activation_stats]
        if stds[0] > 1e-8:
            growth_ratio = stds[-1] / stds[0]
            growth_ok = growth_ratio < 10
            detail = (
                f"Activation std ratio (last/first): {growth_ratio:.1f}x "
                f"(layer_0={stds[0]:.4f}, last={stds[-1]:.4f})"
            )
            results.append(_check_result("Activation growth", growth_ok, detail))
            icon = "✅" if growth_ok else "🟡"
            print(f" {icon} {detail}")
            if not growth_ok:
                print(f" Possible initialization or normalization issue")
    # ── Logit scale check ──
    logit_max = logits.float().abs().max().item()
    logit_ok = logit_max < 1000
    detail = f"Logit max abs value: {logit_max:.1f} (should be < 1000)"
    results.append(_check_result("Logit scale", logit_ok, detail))
    icon = "✅" if logit_ok else "🔴"
    print(f"\n {icon} Logit scale: {detail}")
    # Leave the model with clean gradients for the real training loop.
    model.zero_grad(set_to_none=True)
    passed_count = sum(1 for r in results if r["passed"])
    print(f"\n Result: {passed_count}/{len(results)} checks passed")
    return {
        "level": 2,
        "checks": results,
        "activation_stats": activation_stats,
        "grad_issues": grad_issues,
    }
# ───────────────────────────────────────────────────────────────
# Level 3: Hyperparameter Diagnosis
# ───────────────────────────────────────────────────────────────
@staticmethod
def diagnose_hyperparameters(
    metrics_history: Dict[str, list],
    config: TrainConfig,
) -> Dict[str, Any]:
    """Analyze hyperparameter health from training metrics (Level 3).

    Purely offline: reads logged metrics and the config, prints an
    analysis, and returns structured findings. No model access needed.

    Checks:
        - LR: too high (grad_norm hitting clip limit) or too low
          (grad_norm tiny, suggesting vanishing gradients)
        - Batch size: loss variance indicates batch too small
        - β₂ and weight decay vs. published LLM-training values
        - Warmup: spikes in early steps indicate warmup too short

    Args:
        metrics_history: dict with 'grad_norm' and 'train_loss' lists
            (either may be missing or contain NaN entries).
        config: training config providing learning_rate,
            min_learning_rate, grad_clip, effective_batch_size, beta2,
            weight_decay, warmup_steps, total_steps.

    Returns:
        dict with 'level', 'findings' (issue/evidence/action records),
        and 'config_summary'.
    """
    print(_header("Level 3: Hyperparameter Diagnosis"))
    findings: List[Dict[str, str]] = []
    # Fix: filter NaN entries (consistent with diagnose_status). A
    # single NaN previously poisoned every average, making all the
    # threshold comparisons False and silently reporting "appropriate"
    # during a divergence.
    grad_norms = [g for g in metrics_history.get("grad_norm", []) if not math.isnan(g)]
    train_losses = [l for l in metrics_history.get("train_loss", []) if not math.isnan(l)]
    # ── LR diagnosis ──
    print("\n Learning Rate Analysis:")
    print(f" Peak LR: {config.learning_rate:.2e}")
    print(f" Min LR: {config.min_learning_rate:.2e}")
    if grad_norms:
        avg_grad = sum(grad_norms) / len(grad_norms)
        # Ref: PyTorch clip_grad_norm_ clips when total_norm > max_norm
        clip_count = sum(1 for g in grad_norms if g >= config.grad_clip)
        clip_rate = clip_count / len(grad_norms)
        # Relative threshold: < 1% of clip limit (model-size independent)
        tiny_threshold = config.grad_clip * 0.01
        tiny_count = sum(1 for g in grad_norms if g < tiny_threshold)
        tiny_rate = tiny_count / len(grad_norms)
        print(f" Avg grad norm: {avg_grad:.4f}")
        print(f" Clip rate: {clip_rate * 100:.1f}% (hitting max_norm={config.grad_clip})")
        print(f" Tiny grad rate: {tiny_rate * 100:.1f}% (< {tiny_threshold:.4f})")
        # Heuristic: >50% clipping means most steps are capped, so the
        # effective LR is lower than configured. Practitioners generally
        # treat this as a sign that peak LR is too high.
        if clip_rate > 0.5:
            findings.append({
                "issue": "LR may be too high",
                "evidence": f"Grad norm hits clip limit {clip_rate * 100:.0f}% of the time",
                "action": f"Try LR = {config.learning_rate / 2:.2e} (÷2)",
            })
            print(f" 🟡 Grad clipping frequent ({clip_rate * 100:.0f}%) → LR may be too high")
        elif tiny_rate > 0.5:
            findings.append({
                "issue": "Possible vanishing gradients",
                "evidence": f"Grad norm < {tiny_threshold:.4f} in {tiny_rate * 100:.0f}% of steps",
                "action": "Check weight initialization, layer norms, and model depth",
            })
            print(f" 🟡 Grad norm too small ({tiny_rate * 100:.0f}% < {tiny_threshold:.4f}) → possible vanishing gradients")
        else:
            print(f" ✅ LR looks appropriate")
    # ── Batch size diagnosis ──
    print("\n Batch Size Analysis:")
    print(f" Effective batch: {config.effective_batch_size}")
    if len(train_losses) >= 50:
        recent_losses = train_losses[-50:]
        loss_mean = sum(recent_losses) / len(recent_losses)
        loss_var = sum((x - loss_mean) ** 2 for x in recent_losses) / len(recent_losses)
        # Coefficient of variation: scale-free noise measure.
        loss_cv = (loss_var ** 0.5) / max(loss_mean, 1e-8)
        print(f" Recent loss CV: {loss_cv:.4f} (coefficient of variation, last 50 steps)")
        if loss_cv > 0.1:
            findings.append({
                "issue": "Training loss has high variance",
                "evidence": f"Loss CV = {loss_cv:.4f} over last 50 steps",
                "action": "Check: (1) LR may be too high, (2) increase gradient_accumulation_steps, (3) inspect data quality",
            })
            print(f" 🟡 High loss variance → check LR, batch size, or data quality")
        else:
            print(f" ✅ Loss variance is acceptable")
    # ── β₂ diagnosis ──
    print("\n β₂ (Adam second momentum) Analysis:")
    print(f" Current β₂: {config.beta2}")
    if config.beta2 >= _DEFAULT_PYTORCH_BETA2:
        findings.append({
            "issue": "β₂ may be too high for LLM training",
            "evidence": (
                f"β₂={config.beta2} (PyTorch default). "
                f"LLM standard is {_RECOMMENDED_BETA2}"
            ),
            "action": f"Set beta2={_RECOMMENDED_BETA2} (used by LLaMA, TinyLlama, OLMo)",
        })
        print(f" 🟡 β₂={config.beta2} is PyTorch default → "
              f"LLM training standard is {_RECOMMENDED_BETA2}")
        print(f" Why: β₂=0.999 averages ~1000 steps of gradient stats,")
        print(f" β₂=0.95 averages ~20 steps → faster adaptation to changing data")
        print(f" (Cattaneo & Shigida 2025, 'Tuning Adam(W)')")
    else:
        print(f" ✅ β₂={config.beta2} is within LLM standard range")
    # ── Weight Decay diagnosis ──
    print("\n Weight Decay Analysis:")
    print(f" Current weight_decay: {config.weight_decay}")
    if config.weight_decay == 0:
        findings.append({
            "issue": "Weight decay is disabled",
            "evidence": "weight_decay=0 increases overfitting risk",
            "action": "Set weight_decay=0.1 (standard for LLaMA, TinyLlama, GPT-3, OLMo)",
        })
        print(f" 🟡 weight_decay=0 → overfitting risk. Standard is 0.1")
    elif config.weight_decay > 0.3:
        findings.append({
            "issue": "Weight decay may be too high",
            "evidence": f"weight_decay={config.weight_decay} (unusually high)",
            "action": "Try weight_decay=0.1 (standard value)",
        })
        print(f" 🟡 weight_decay={config.weight_decay} is unusually high (standard: 0.1)")
    else:
        print(f" ✅ weight_decay={config.weight_decay} is within normal range")
    # ── Model-size LR reference ──
    print("\n GPT-3 LR Reference (Brown et al. 2020):")
    print(" ┌──────────┬───────────┬──────────────┐")
    print(" │ Model │ Peak LR │ Batch Tokens │")
    print(" ├──────────┼───────────┼──────────────┤")
    for params, lr, batch_tok in _GPT3_LR_REFERENCE:
        label = f"{params / 1e9:.1f}B" if params >= 1e9 else f"{params / 1e6:.0f}M"
        # Arrow marks rows near our ~1.1B model size.
        marker = " ←" if abs(params - 1.1e9) < 0.5e9 else ""
        print(f" │ {label:<8} │ {lr:.1e} │ {batch_tok:<12} │{marker}")
    print(" └──────────┴───────────┴──────────────┘")
    print(" → Larger models need lower LR and larger batch")
    # ── Batch-LR scaling guidance ──
    print("\n Batch-LR Scaling Rules:")
    print(" • Batch ×2 → LR ×√2 (square root scaling, recommended for Adam)")
    print(" (Malladi et al. NeurIPS 2022, 'On the SDEs and Scaling Rules for Adaptive Gradient Algorithms')")
    print(" • Batch ×2 → LR ×2 (linear scaling, Goyal et al. 2017, mainly SGD)")
    print(" • 1B model: ~1K-2K sequences (~2-4M tokens) is typical")
    print(" (Pythia-1B: ~2M tokens, TinyLlama: ~2M, OLMo-1B: ~4M)")
    # ── Warmup diagnosis ──
    print("\n Warmup Analysis:")
    # Fix: guard against total_steps == 0 (ZeroDivisionError).
    print(f" Warmup steps: {config.warmup_steps} "
          f"({config.warmup_steps / max(config.total_steps, 1) * 100:.1f}% of total)")
    if len(train_losses) >= 10:
        early_losses = train_losses[:min(50, len(train_losses))]
        # Detect spikes in early training (>50% jump step-to-step)
        spike_count = 0
        for i in range(1, len(early_losses)):
            if early_losses[i] > early_losses[i - 1] * 1.5:
                spike_count += 1
        if spike_count > 3:
            findings.append({
                "issue": "Warmup may be too short",
                "evidence": f"{spike_count} loss spikes in first {len(early_losses)} steps",
                "action": f"Try warmup_steps = {config.warmup_steps * 2}",
            })
            print(f" 🟡 {spike_count} spikes in early training → warmup may be too short")
        else:
            print(f" ✅ Early training is stable")
    # ── Summary ──
    if not findings:
        print("\n ✅ No hyperparameter issues detected")
    else:
        print(f"\n Found {len(findings)} potential issue(s):")
        for f in findings:
            print(f" • {f['issue']}: {f['action']}")
    # ── Warmup reference from real projects ──
    print("\n Warmup Reference (real projects):")
    print(" • TinyLlama 1.1B (3T tokens): 2,000 steps ≈ 0.1% of total")
    print(" • GPT-3 175B: 375M warmup tokens ≈ 117 steps")
    print(" • General range: 0.1% ~ 5% of total steps")
    print(" • Smaller experiments: 5~10% is also reasonable")
    print("\n Tuning priority (high → low):")
    print(" 1. Learning Rate ← tune first (10x impact)")
    print(" 2. Batch Size ← adjust with LR")
    print(" 3. Warmup Steps ← early stability")
    print(" 4. Weight Decay ← if overfitting (typically 0.1)")
    print(" 5. β₁, β₂ (Adam) ← see β₂ analysis above")
    print(" 6. Gradient Clip ← usually keep at 1.0")
    return {
        "level": 3,
        "findings": findings,
        "config_summary": {
            "learning_rate": config.learning_rate,
            "effective_batch": config.effective_batch_size,
            "warmup_steps": config.warmup_steps,
            "total_steps": config.total_steps,
            "grad_clip": config.grad_clip,
        },
    }
@staticmethod
def lr_range_test(
model: nn.Module,
dataloader: DataLoader,
device: torch.device,
dtype: torch.dtype = torch.bfloat16,
lr_start: float = 1e-7,
lr_end: float = 1e-1,
steps: int = 300,
) -> Dict[str, Any]:
"""Run an LR range test to find the optimal learning rate (Level 3 bonus).
Sweeps LR from lr_start to lr_end exponentially, recording loss.
The optimal LR is where loss decreases fastest (steepest slope),
divided by 3~10 for stability.
WARNING: This modifies a copy of the model. The original is untouched.
"""
print(_header("Level 3 Bonus: LR Range Test"))
print(f" Sweeping LR from {lr_start:.1e} to {lr_end:.1e} over {steps} steps...\n")
test_model = copy.deepcopy(model)
test_model.to(device)
test_model.train()
optimizer = torch.optim.AdamW(test_model.parameters(), lr=lr_start)
lr_mult = (lr_end / lr_start) ** (1 / steps)
lr = lr_start
lrs: List[float] = []
losses: List[float] = []
data_iter = iter(dataloader)
for step in range(steps):
for pg in optimizer.param_groups:
pg["lr"] = lr
try:
batch = next(data_iter)
except StopIteration:
data_iter = iter(dataloader)
batch = next(data_iter)
input_ids = batch["input_ids"].to(device)
targets_t = batch["targets"].to(device)
optimizer.zero_grad()
with torch.amp.autocast(device_type="cuda", dtype=dtype, enabled=(dtype != torch.float32)):
_, loss = test_model(input_ids, targets_t)
loss.backward()
optimizer.step()
loss_val = loss.item()
lrs.append(lr)
losses.append(loss_val)
if (step + 1) % 50 == 0:
print(f" Step {step + 1}: LR = {lr:.2e}, Loss = {loss_val:.4f}")
# Stop if loss explodes
if len(losses) > 1 and loss_val > losses[0] * 4:
print(f" Loss exploded at LR = {lr:.2e}, stopping.")
break
lr *= lr_mult
del test_model, optimizer
if torch.cuda.is_available():
torch.cuda.empty_cache()
# Find steepest descent
best_lr = lr_start
if len(losses) > 10:
# Smooth losses and find steepest negative slope
window = 5
smoothed = []
for i in range(len(losses) - window):
smoothed.append(sum(losses[i:i + window]) / window)
min_slope = 0
min_idx = 0
for i in range(1, len(smoothed)):
slope = smoothed[i] - smoothed[i - 1]
if slope < min_slope:
min_slope = slope
min_idx = i
best_lr = lrs[min_idx]
suggested_lr = best_lr / 3 # conservative choice
print(f"\n Steepest descent at LR = {best_lr:.2e}")
print(f" Suggested peak LR: {suggested_lr:.2e} (Γ·3 for stability)")
print(f" Conservative range: [{best_lr / 10:.2e}, {best_lr / 3:.2e}]")
else:
suggested_lr = 3e-4
print(f"\n Not enough data points. Using default LR = {suggested_lr:.2e}")
return {
"lrs": lrs,
"losses": losses,
"best_lr": best_lr,
"suggested_lr": suggested_lr,
}
# ───────────────────────────────────────────────────────────────
# Level 4: Overfitting vs Underfitting Diagnosis
# ───────────────────────────────────────────────────────────────
@staticmethod
def diagnose_fitting(
metrics_history: Dict[str, list],
model_params: Optional[int] = None,
total_tokens: Optional[int] = None,
) -> Dict[str, Any]:
"""Diagnose overfitting vs underfitting from metrics (Level 4).
Cases:
1. Both high, decreasing β†’ Normal (still training)
2. Both high, plateau β†’ Underfitting
3. Train↓ Valβ†’ or Val↑ β†’ Overfitting
4. Both low, plateau β†’ Converged (or at limit)
"""
print(_header("Level 4: Overfitting vs Underfitting Diagnosis"))
train_losses = metrics_history.get("train_loss", [])
val_losses = [v for v in metrics_history.get("val_loss", []) if v is not None]
if len(train_losses) < 10 or len(val_losses) < 2:
print(" [!] Not enough data. Need more training steps with eval.")
return {"level": 4, "case": "insufficient_data", "recommendations": []}
# Recent train trend
recent_n = min(50, len(train_losses))
train_recent = train_losses[-recent_n:]
train_mid = len(train_recent) // 2
train_first = sum(train_recent[:train_mid]) / max(train_mid, 1)
train_second = sum(train_recent[train_mid:]) / max(len(train_recent) - train_mid, 1)
train_decreasing = train_second < train_first - 0.02
# Val trend
val_mid = len(val_losses) // 2
val_first = sum(val_losses[:max(val_mid, 1)]) / max(val_mid, 1)
val_second = sum(val_losses[val_mid:]) / max(len(val_losses) - val_mid, 1)
val_decreasing = val_second < val_first - 0.02
val_increasing = val_second > val_first + 0.05
# Train-Val gap
last_train = train_losses[-1]
last_val = val_losses[-1]
gap = last_train - last_val # negative means val > train (typical)
print(f" Train loss (recent): {train_first:.4f} β†’ {train_second:.4f} "
f"({'↓' if train_decreasing else 'β†’'})")
print(f" Val loss: {val_first:.4f} β†’ {val_second:.4f} "
f"({'↓' if val_decreasing else '↑' if val_increasing else 'β†’'})")
print(f" Train-Val gap: {abs(gap):.4f}")
# ── Classify ──
case = ""
recommendations: List[str] = []
if train_decreasing and val_decreasing:
case = "Case 1: Normal β€” both decreasing"
recommendations.append("Training is progressing normally. Continue.")
if model_params and total_tokens:
ratio = total_tokens / model_params
chinchilla = 20 # Chinchilla optimal: 20 tokens per param
if ratio < chinchilla:
recommendations.append(
f"Token/param ratio = {ratio:.1f}x "
f"(Chinchilla optimal β‰ˆ {chinchilla}x). "
f"Model may benefit from more data."
)
print(f"\n 🟒 {case}")
elif not train_decreasing and not val_decreasing and last_train > _EXPECTED_TRAIN_LOSS[1]:
case = "Case 2: Underfitting β€” both plateaued at high loss"
recommendations = [
"Diagnosis priority (check in order):",
"1) Training insufficient? β†’ check if loss curve still has downward slope",
" - Chinchilla: 1B model needs ~20B tokens minimum",
" - TinyLlama trains 1.1B on 3T tokens (inference-optimal)",
"2) LR too low? β†’ try LR Γ—2, see if loss drops faster",
"3) Model capacity too small? β†’ train 2x larger model on same data",
" - If larger model gets lower loss β†’ capacity was the limit",
"4) Data quality? β†’ sample and read training data manually",
" - Noisy/low-quality data raises the achievable loss floor",
]
if model_params and total_tokens:
ratio = total_tokens / model_params
if ratio < 10:
recommendations.insert(0,
f"⚠ Token/param ratio = {ratio:.1f}x β€” "
f"very likely undertrained. Chinchilla recommends β‰₯20x."
)
elif ratio < 20:
recommendations.insert(0,
f"β„Ή Token/param ratio = {ratio:.1f}x β€” "
f"below Chinchilla optimal (20x). More tokens may help."
)
print(f"\n 🟑 {case}")
elif train_decreasing and (val_increasing or not val_decreasing):
case = "Case 3: Overfitting β€” train↓ but valβ†’/↑"
recommendations = [
"Diagnosis priority (check in order):",
"1) Data repetition? (most common cause in pretraining)",
" - Check: total tokens vs unique tokens",
" - Epoch > 1 dramatically increases overfitting risk",
" - Solution: add more data, stay within 1 epoch",
"2) Weight decay too low?",
" - Check: weight_decay value (standard: 0.1)",
" - LLaMA, TinyLlama, OLMo, GPT-3 all use 0.1",
" - Experiment: 0.01 / 0.05 / 0.1 / 0.3",
"3) Data diversity?",
" - Single-domain data overfits faster",
" - Mix: web, books, code, wiki, etc.",
"",
"Note on Dropout in LLM pretraining:",
" - Modern LLMs do NOT use dropout in pretraining",
" (Pythia, TinyLlama, OLMo, LLaMA all use dropout=0)",
" - Sufficient data is the best regularization",
" - Dropout is useful for fine-tuning on small datasets",
]
print(f"\n 🟑 {case}")
else:
case = "Case 4: Converged β€” loss is low and stable"
recommendations = [
"Training has converged (or reached the data/model limit).",
"To push further: add more data or increase model size.",
]
print(f"\n 🟒 {case}")
for rec in recommendations:
print(f" {rec}")
return {
"level": 4,
"case": case,
"train_trend": "decreasing" if train_decreasing else "flat",
"val_trend": "decreasing" if val_decreasing else ("increasing" if val_increasing else "flat"),
"gap": abs(gap),
"recommendations": recommendations,
}
# ───────────────────────────────────────────────────────────────
# Level 5: Architecture Checks
# ───────────────────────────────────────────────────────────────
@staticmethod
def check_architecture(
model: nn.Module,
dataloader: DataLoader,
device: torch.device,
) -> Dict[str, Any]:
"""Check weight initialization and per-layer activation health (Level 5).
Healthy initialization:
- All layers: std β‰ˆ 1.0, mean β‰ˆ 0.0
Problems:
- std increasing per layer β†’ activation explosion (init scale too large)
- std decreasing per layer β†’ activation vanishing (init scale too small)
- Sudden change at specific layer β†’ implementation bug in that layer
"""
print(_header("Level 5: Architecture / Initialization Check"))
batch = next(iter(dataloader))
sample_input = batch["input_ids"][:1].to(device)
model.eval()
layer_stats: List[Dict[str, Any]] = []
with torch.no_grad():
h = model.token_embedding(sample_input)
emb_std = h.float().std().item()
print(f"\n Embedding: std={emb_std:.4f}")
for i, layer in enumerate(model.layers):
h = layer(h, mask=None, position_offset=0)
h_f = h.float()
stats = {
"layer": i,
"mean": h_f.mean().item(),
"std": h_f.std().item(),
"max": h_f.abs().max().item(),
}
layer_stats.append(stats)
# Print stats
print(f"\n Layer-by-layer activation statistics:")
print(f" {'Layer':<8} {'Mean':>10} {'Std':>10} {'Max':>10}")
print(f" {'-' * 38}")
for s in layer_stats:
print(f" {s['layer']:<8} {s['mean']:>10.4f} {s['std']:>10.4f} {s['max']:>10.4f}")
# ── Weight initialization distribution check ──
print(f"\n Weight Initialization Distribution:")
print(f" {'Parameter':<40} {'Mean':>10} {'Std':>10} {'Shape'}")
print(f" {'-' * 75}")
weight_issues = []
for name, param in model.named_parameters():
if param.ndim < 2:
continue # skip biases, norm weights
p_f = param.float()
p_mean = p_f.mean().item()
p_std = p_f.std().item()
# Expected: std β‰ˆ 0.02 for most layers, smaller for residual projections
shape_str = str(list(param.shape))
is_residual = "o_proj" in name or "down_proj" in name
expected_std = 0.02 # GPT-2 style
if p_std > expected_std * 5:
weight_issues.append(f"Large std: {name} (std={p_std:.4f})")
print(f" 🟑 {name:<38} {p_mean:>10.4f} {p_std:>10.4f} {shape_str}")
elif p_std < expected_std * 0.1:
weight_issues.append(f"Tiny std: {name} (std={p_std:.6f})")
print(f" 🟑 {name:<38} {p_mean:>10.4f} {p_std:>10.6f} {shape_str}")
else:
print(f" {name:<38} {p_mean:>10.4f} {p_std:>10.4f} {shape_str}")
if weight_issues:
print(f"\n ⚠ {len(weight_issues)} weight distribution issue(s) found")
for issue in weight_issues[:5]:
print(f" β€’ {issue}")
else:
print(f"\n βœ… All weight distributions look normal (std β‰ˆ 0.02)")
print(f"\n Expected init pattern:")
print(f" β€’ General Linear: N(0, 0.02)")
print(f" β€’ Residual proj (o_proj, down_proj): N(0, 0.02/√(2Γ—layers))")
print(f" β€’ Embedding: N(0, 0.02)")
# ── Ablation study guidance ──
print(f"\n Component Ablation Reference:")
print(" β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”")
print(" β”‚ Experiment β”‚ Expected Outcome β”‚")
print(" β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€")
print(" β”‚ RMSNorm β†’ LayerNorm β”‚ Minimal loss diff β†’ OK β”‚")
print(" β”‚ RoPE β†’ Absolute PE β”‚ Similar on short seq (<512) β”‚")
print(" β”‚ SwiGLU β†’ ReLU FFN β”‚ Loss +0.05~0.15 β†’ SwiGLU working β”‚")
print(" β”‚ GQA β†’ MHA β”‚ Same loss, less memory β†’ OK β”‚")
print(" β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜")
print(" If any replacement shows unexpected results, check that component.")
# Analyze trends
stds = [s["std"] for s in layer_stats]
diagnosis = "healthy"
detail = ""
if len(stds) >= 3:
# Check for monotonic increase/decrease
first_third = sum(stds[:len(stds) // 3]) / (len(stds) // 3)
last_third = sum(stds[-(len(stds) // 3):]) / (len(stds) // 3)
ratio = last_third / max(first_third, 1e-8)
if ratio > 5:
diagnosis = "exploding"
detail = (
f"Activation std grows {ratio:.1f}x from early to late layers. "
f"Init scale may be too large."
)
elif ratio < 0.2:
diagnosis = "vanishing"
detail = (
f"Activation std shrinks to {ratio:.1f}x from early to late layers. "
f"Init scale may be too small."
)
else:
detail = f"Std ratio (last/first third) = {ratio:.2f} β€” within normal range."
# Check for sudden jumps
for i in range(1, len(stds)):
jump = stds[i] / max(stds[i - 1], 1e-8)
if jump > 10 or jump < 0.1:
diagnosis = "anomaly"
detail = (
f"Sudden activation change at layer {i}: "
f"std {stds[i - 1]:.4f} β†’ {stds[i]:.4f}. "
f"Possible implementation bug in that layer."
)
break
icon = {"healthy": "βœ…", "exploding": "πŸ”΄", "vanishing": "🟑", "anomaly": "πŸ”΄"}
print(f"\n {icon.get(diagnosis, 'βšͺ')} Diagnosis: {diagnosis}")
print(f" {detail}")
return {
"level": 5,
"diagnosis": diagnosis,
"detail": detail,
"layer_stats": layer_stats,
"weight_issues": weight_issues,
}
# ───────────────────────────────────────────────────────────────
# Main Entry Point
# ───────────────────────────────────────────────────────────────
@staticmethod
def run_diagnostics(
model: nn.Module,
dataloader: DataLoader,
tokenizer: Any,
train_config: TrainConfig,
metrics_history: Dict[str, list],
device: torch.device,
dtype: torch.dtype = torch.bfloat16,
vocab_size: int = 32000,
levels: Optional[List[int]] = None,
) -> Dict[str, Any]:
"""Run the full 5-level debugging framework.
Args:
model: the LLM model
dataloader: training dataloader
tokenizer: tokenizer with encode/decode methods
train_config: TrainConfig instance
metrics_history: dict from MetricsTracker.history
device: torch device
dtype: mixed precision dtype
vocab_size: model vocabulary size
levels: which levels to run (default: all [0,1,2,3,4,5])
Returns:
Full diagnostic report dict.
"""
if levels is None:
levels = [0, 1, 2, 3, 4, 5]
print("\n" + "═" * 60)
print(" LLM Loss Debugging Framework")
print(" Levels to run: " + ", ".join(str(l) for l in levels))
print("═" * 60)
report: Dict[str, Any] = {}
if 0 in levels:
report["level_0"] = LossDebugger.diagnose_status(vocab_size, metrics_history)
# If status is normal and only level 0 was explicitly requested, skip rest
if (
report["level_0"]["status"] == STATUS_NORMAL
and levels == [0]
):
print("\n Training is healthy β€” no further debugging needed.")
return report
if 1 in levels:
report["level_1"] = LossDebugger.check_data_pipeline(
model, dataloader, tokenizer, vocab_size, device, dtype,
)
if 2 in levels:
report["level_2"] = LossDebugger.check_numerical_stability(
model, dataloader, device, dtype,
)
if 3 in levels:
report["level_3"] = LossDebugger.diagnose_hyperparameters(
metrics_history, train_config,
)
if 4 in levels:
model_params = sum(p.numel() for p in model.parameters())
total_tokens = len(metrics_history.get("train_loss", [])) * train_config.tokens_per_step
report["level_4"] = LossDebugger.diagnose_fitting(
metrics_history, model_params, total_tokens,
)
if 5 in levels:
report["level_5"] = LossDebugger.check_architecture(
model, dataloader, device,
)
# Final summary
print("\n" + "═" * 60)
print(" Diagnostics Complete")
print("═" * 60)
return report
# ───────────────────────────────────────────────────────────────
# Study Roadmap
# ───────────────────────────────────────────────────────────────
    @staticmethod
    def print_study_roadmap() -> None:
        """Print the recommended study roadmap for LLM training optimization.

        Pure console output: one static, curated reading/experiment list.
        Takes no arguments, returns None, and has no other side effects.
        """
        print(_header("Study Roadmap — LLM Training Optimization"))
        # The roadmap text below is emitted verbatim; edit with care.
        print("""
 ⭐⭐⭐ Top Priority: Optimization Fundamentals
 ─────────────────────────────────────────────
 1. SGD → Momentum → Adam → AdamW progression
    - Why Adam > SGD? Why decouple weight decay in AdamW?
    - β₁, β₂ intuition (1st / 2nd momentum)
    - Ref: Loshchilov & Hutter 2019 (AdamW)
    - Ref: Karpathy "A Recipe for Training Neural Networks"
 2. Loss Landscape
    - Why large LR diverges, small LR stalls
    - Batch size effect on landscape exploration
    - Ref: Li et al. 2018 "Visualizing the Loss Landscape"
    - Ref: McCandlish et al. 2018 "Large-Batch Training"
 3. Chinchilla Scaling Law
    - Loss = f(N, D) relationship
    - Compute-optimal model size vs data allocation
    - Ref: Hoffmann et al. 2022 (original)
    - Ref: Kaplan et al. 2020 (predecessor)
    - Ref: Besiroglu et al. 2024 (replication/verification)
 ⭐⭐ Important: Training Stability
 ──────────────────────────────────
 4. Gradient Flow: vanishing/exploding, residual as gradient highway
 5. Weight Init: Xavier / Kaiming / GPT-2 style
 6. Normalization: BatchNorm → LayerNorm → RMSNorm
 7. Weight Decay: L2 vs decoupled, why exclude embed/norm
 ⭐ Advanced: Optimization Techniques
 ─────────────────────────────────────
 8. LR Schedules: cosine vs linear vs step, warmup/cooldown
 9. Gradient Accumulation & Large Batch Training
 10. μP (Maximal Update Parameterization): transfer HP across scales
 Recommended Experiments (in order):
 ───────────────────────────────────
 1. Single-batch overfit (30 min) → basic sanity
 2. LR Range Test (1 hour) → optimal LR range
 3. 10M model quick train (2-3 hrs) → pipeline validation
 4. Ablation (remove components) (1 day) → component contribution
 5. 100M model + 5B tokens (1-2 days)→ mid-scale dynamics
 6. 1B model full training (2-3 days)→ scaling law verification
 7. LR / batch size comparison (1 day) → HP sensitivity
 Key References:
 ───────────────
 ⭐⭐⭐ Karpathy "Recipe for Training NNs" — debugging mindset
 ⭐⭐⭐ Hoffmann et al. 2022 (Chinchilla) — scaling law
 ⭐⭐ Touvron et al. 2023 (LLaMA) — 1B+ training details
 ⭐⭐ Biderman et al. 2023 (Pythia) — open training logs
 ⭐⭐ Zhang et al. 2024 (TinyLlama) — 1.1B on 3T tokens
 ⭐⭐ Groeneveld et al. 2024 (OLMo) — fully open LLM
 ⭐⭐ Li et al. 2018 (Loss Landscape) — loss terrain intuition
 ⭐⭐ Loshchilov & Hutter 2019 (AdamW) — optimizer basics
 ⭐ Yang et al. 2022 (μP) — HP transfer
 ⭐ McCandlish et al. 2018 (Batch size) — critical batch size
 """)