Spaces:

ujjwalpardeshi
/

pytorch-training-debugger

Sleeping

pytorch-training-debugger / ml_training_debugger /simulation.py

omkarrr88

Real training curves added

aa0bed2 2 months ago

11.3 kB

	"""Training curve generation — real PyTorch mini-training with parametric fallback.

	Primary: run_real_training() from pytorch_engine (20 real epochs, cached per task/seed).
	Fallback: parametric torch.Tensor formulas for edge cases.
	Zero numpy. Spec reference: Section 6.
	"""

	from __future__ import annotations

	import torch

	from ml_training_debugger.scenarios import ScenarioParams

	# Number of epochs in every parametric history curve generated by this module
	# (used as the tensor length and loop bound in the fallback formulas).
	EPOCHS = 20


	def _get_real_curves(scenario: ScenarioParams) -> dict[str, list[float]] | None:
	    """Try to get real training curves for *scenario*.

	    The engine is imported lazily inside the ``try`` so that a failing
	    import is handled exactly like a failing training run.

	    Args:
	        scenario: Passed through unchanged to ``run_real_training``.

	    Returns:
	        Whatever ``run_real_training(scenario)`` returns, or ``None`` when
	        the import or the call raises any ``Exception``.
	    """
	    try:
	        from ml_training_debugger.pytorch_engine import run_real_training

	        return run_real_training(scenario)
	    except Exception:
	        # Deliberate best-effort: callers fall back to parametric curves.
	        # Record the reason at DEBUG level so a broken engine is still
	        # discoverable instead of vanishing without a trace.
	        import logging

	        logging.getLogger(__name__).debug(
	            "Real training curves unavailable; using parametric fallback",
	            exc_info=True,
	        )
	        return None


	def gen_loss_history(scenario: ScenarioParams) -> list[float]:
	    """Generate the training loss history (EPOCHS values).

	    Returns the ``"loss_history"`` curve from real mini-training when
	    ``_get_real_curves`` yields a result. Otherwise builds a parametric
	    curve selected by ``scenario.root_cause.value``; that path reseeds
	    torch's global RNG with ``scenario.seed``.

	    Args:
	        scenario: Scenario whose ``seed`` and ``root_cause`` are read, plus
	            ``learning_rate`` (for ``"lr_too_high"``) or
	            ``scheduler_step_size`` / ``scheduler_gamma`` (for
	            ``"scheduler_misconfigured"``).

	    Returns:
	        One loss value per epoch. For ``"lr_too_high"`` the entries from
	        index 12 onward are ``float("inf")``.
	    """
	    real = _get_real_curves(scenario)
	    if real is not None:
	        return real["loss_history"]

	    # Parametric fallback
	    torch.manual_seed(scenario.seed)
	    t = torch.arange(EPOCHS, dtype=torch.float32)

	    root = scenario.root_cause.value

	    if root == "lr_too_high":
	        # Exponentially growing loss
	        lr_tensor = torch.tensor(scenario.learning_rate, dtype=torch.float32)
	        diverging = (2.3 * torch.exp(lr_tensor * t * 0.5)).tolist()
	        # Mark the blow-up: entries from index 12 onward become +inf
	        # (infinity, not NaN).
	        for i in range(12, EPOCHS):
	            diverging[i] = float("inf")
	        return diverging

	    if root == "vanishing_gradients":
	        # Flat loss — barely decreases
	        noise = torch.randn(EPOCHS) * 0.02
	        loss = 2.3 - t * 0.002 + noise
	        return loss.clamp(min=0.01).tolist()

	    if root == "data_leakage":
	        # Normal-looking training loss
	        loss = 2.3 * torch.exp(-0.15 * t) + 0.05
	        noise = torch.randn(EPOCHS) * 0.02
	        return (loss + noise).clamp(min=0.01).tolist()

	    if root == "overfitting":
	        # Steadily decreasing to near-zero
	        loss = 2.3 * torch.exp(-0.25 * t) + 0.01
	        noise = torch.randn(EPOCHS) * 0.01
	        return (loss + noise).clamp(min=0.001).tolist()

	    if root == "batchnorm_eval_mode":
	        # Roughly normal with higher variance
	        base = 2.3 * torch.exp(-0.1 * t) + 0.3
	        noise = torch.randn(EPOCHS) * 0.15
	        return (base + noise).clamp(min=0.1).tolist()

	    if root == "code_bug":
	        loss = 2.3 * torch.exp(-0.05 * t) + 0.5
	        noise = torch.randn(EPOCHS) * 0.1
	        return (loss + noise).clamp(min=0.1).tolist()

	    if root == "scheduler_misconfigured":
	        # Training starts well, then LR drops too aggressively causing stagnation
	        step_size = scenario.scheduler_step_size
	        gamma = scenario.scheduler_gamma
	        # A step size of 0 would raise ZeroDivisionError in the floor
	        # division below, so clamp the divisor to at least 1. Results for
	        # step_size >= 1 are unchanged.
	        decay_period = max(step_size, 1)
	        scheduled: list[float] = []
	        for i in range(EPOCHS):
	            if i < step_size:
	                val = 2.3 * (1.0 - 0.15 * i)  # normal decrease
	            else:
	                # Number of LR decays applied so far (at least one).
	                steps_decayed = (i - step_size) // decay_period + 1
	                effective_lr_ratio = gamma ** steps_decayed
	                val = 2.3 * (1.0 - 0.15 * step_size) + 0.05 * (i - step_size) * (
	                    1 - effective_lr_ratio
	                )
	            # One scalar noise draw per epoch; loss is floored at 0.3.
	            scheduled.append(max(0.3, val + torch.randn(1).item() * 0.05))
	        return scheduled

	    # Fallback
	    return (2.3 * torch.exp(-0.1 * t)).tolist()


	def gen_val_accuracy_history(scenario: ScenarioParams) -> list[float]:
	    """Build the per-epoch validation accuracy curve for a scenario.

	    The ``"val_acc_history"`` entry of the real mini-training result is
	    returned when one is available. Otherwise EPOCHS values are synthesised
	    from a formula picked by ``scenario.root_cause.value`` after seeding
	    torch's global RNG with ``scenario.seed + 1``.
	    """
	    measured = _get_real_curves(scenario)
	    if measured is not None:
	        return measured["val_acc_history"]

	    # Synthetic path.
	    torch.manual_seed(scenario.seed + 1)
	    epoch_idx = torch.arange(EPOCHS, dtype=torch.float32)
	    fault = scenario.root_cause.value

	    curve: list[float]
	    if fault == "lr_too_high":
	        # Accuracy sinks while the loss diverges.
	        sinking = torch.sigmoid(torch.linspace(0, -3, EPOCHS)) * 0.5
	        curve = sinking.clamp(0.0, 1.0).tolist()

	    elif fault == "vanishing_gradients":
	        # Hovers around 0.10 with a tiny upward drift.
	        jitter = torch.randn(EPOCHS) * 0.02
	        curve = (0.10 + epoch_idx * 0.001 + jitter).clamp(0.0, 1.0).tolist()

	    elif fault == "data_leakage":
	        # High accuracy right from the first epoch.
	        leak_frac = torch.tensor(scenario.leakage_pct, dtype=torch.float32)
	        ramp = torch.sigmoid(torch.linspace(-3, 3, EPOCHS))
	        blended = (ramp * (1.0 - leak_frac) + leak_frac * 0.95).clamp(0.0, 1.0)
	        # Lift every epoch to at least this floor, then cap at 0.99.
	        floor = 0.82 * (1.0 + scenario.leakage_pct)
	        curve = [min(max(value, floor), 0.99) for value in blended.tolist()]

	    elif fault == "overfitting":
	        # Climbs toward 0.75 until the divergence epoch, then slides
	        # 0.02 per epoch.
	        turn = scenario.divergence_epoch
	        curve = []
	        for epoch in range(EPOCHS):
	            if epoch >= turn:
	                raw = 0.75 - (epoch - turn) * 0.02
	            else:
	                raw = 0.10 + (0.75 - 0.10) * (epoch / max(turn, 1))
	            curve.append(max(0.0, min(1.0, raw)))

	    elif fault == "batchnorm_eval_mode":
	        # Starts at 0.76 and drops 0.015 per epoch, plus small noise.
	        jitter = torch.randn(EPOCHS) * 0.01
	        baseline = torch.tensor(
	            [0.76 - 0.015 * epoch for epoch in range(EPOCHS)], dtype=torch.float32
	        )
	        curve = (baseline + jitter).clamp(0.0, 1.0).tolist()

	    elif fault == "code_bug":
	        jitter = torch.randn(EPOCHS) * 0.03
	        curve = (0.10 + epoch_idx * 0.005 + jitter).clamp(0.0, 1.0).tolist()

	    elif fault == "scheduler_misconfigured":
	        # Improves for the first step_size epochs, then slowly degrades.
	        period = scenario.scheduler_step_size
	        curve = []
	        for epoch in range(EPOCHS):
	            if epoch < period:
	                raw = 0.10 + 0.08 * epoch
	            else:
	                raw = 0.10 + 0.08 * period - 0.01 * (epoch - period)
	            # One scalar noise draw per epoch, result kept in [0.05, 0.95].
	            noisy = raw + torch.randn(1).item() * 0.02
	            curve.append(max(0.05, min(0.95, noisy)))

	    else:
	        # Unrecognised root cause: plain sigmoid ramp scaled by 0.9.
	        curve = (torch.sigmoid(torch.linspace(-3, 3, EPOCHS)) * 0.9).tolist()

	    return curve


	def gen_val_loss_history(scenario: ScenarioParams) -> list[float]:
	    """Build the per-epoch validation loss curve for a scenario.

	    Hands back the ``"val_loss_history"`` entry of the real mini-training
	    result when there is one. Otherwise EPOCHS values are computed from a
	    formula selected by ``scenario.root_cause.value`` after seeding torch's
	    global RNG with ``scenario.seed + 2``.
	    """
	    measured = _get_real_curves(scenario)
	    if measured is not None:
	        return measured["val_loss_history"]

	    # Synthetic path.
	    torch.manual_seed(scenario.seed + 2)
	    epoch_idx = torch.arange(EPOCHS, dtype=torch.float32)
	    fault = scenario.root_cause.value

	    curve: list[float]
	    if fault == "lr_too_high":
	        # Same exponential growth as the training loss; entries from
	        # index 12 onward are replaced with +inf.
	        rate = torch.tensor(scenario.learning_rate, dtype=torch.float32)
	        growth = (2.3 * torch.exp(rate * epoch_idx * 0.5)).tolist()
	        curve = [
	            value if epoch < 12 else float("inf")
	            for epoch, value in enumerate(growth)
	        ]

	    elif fault == "vanishing_gradients":
	        jitter = torch.randn(EPOCHS) * 0.02
	        curve = (2.3 - epoch_idx * 0.001 + jitter).clamp(min=0.01).tolist()

	    elif fault == "data_leakage":
	        # Validation loss decays quickly to a low level.
	        decay = 2.3 * torch.exp(-0.2 * epoch_idx) + 0.03
	        jitter = torch.randn(EPOCHS) * 0.02
	        curve = (decay + jitter).clamp(min=0.01).tolist()

	    elif fault == "overfitting":
	        # Falls linearly until the divergence epoch, then rises
	        # 0.1 per epoch from 0.46.
	        turn = scenario.divergence_epoch
	        curve = []
	        for epoch in range(EPOCHS):
	            if epoch >= turn:
	                raw = 0.46 + 0.1 * (epoch - turn)
	            else:
	                raw = 2.3 * (1.0 - 0.8 * epoch / max(turn, 1))
	            curve.append(max(0.01, raw))

	    elif fault == "batchnorm_eval_mode":
	        # Gentle upward trend with noise.
	        trend = 1.5 + epoch_idx * 0.03
	        jitter = torch.randn(EPOCHS) * 0.1
	        curve = (trend + jitter).clamp(min=0.1).tolist()

	    elif fault == "code_bug":
	        decay = 2.3 * torch.exp(-0.03 * epoch_idx) + 0.8
	        jitter = torch.randn(EPOCHS) * 0.1
	        curve = (decay + jitter).clamp(min=0.1).tolist()

	    elif fault == "scheduler_misconfigured":
	        # Decreases for the first step_size epochs, then creeps up
	        # 0.03 per epoch.
	        period = scenario.scheduler_step_size
	        curve = []
	        for epoch in range(EPOCHS):
	            if epoch < period:
	                raw = 2.3 * (1.0 - 0.12 * epoch)
	            else:
	                raw = 2.3 * (1.0 - 0.12 * period) + 0.03 * (epoch - period)
	            # One scalar noise draw per epoch; loss is floored at 0.1.
	            curve.append(max(0.1, raw + torch.randn(1).item() * 0.05))

	    else:
	        # Unrecognised root cause: smooth exponential decay.
	        curve = (2.3 * torch.exp(-0.1 * epoch_idx) + 0.1).tolist()

	    return curve


	def _gen_confusion_matrix(scenario: ScenarioParams) -> list[list[float]]:
	    """Build a row-normalised 10x10 confusion matrix for the scenario's fault.

	    Seeds torch's global RNG with ``scenario.seed + 10``, mixes a scaled
	    identity matrix with uniform noise (the weights depend on
	    ``scenario.root_cause.value``), and divides every row by its sum.
	    """
	    torch.manual_seed(scenario.seed + 10)
	    fault = scenario.root_cause.value
	    size = 10
	    # Every fault type consumes exactly one size x size uniform sample.
	    jitter = torch.rand(size, size)

	    if fault == "data_leakage":
	        # Strong diagonal; off-diagonal noise scales with the leakage fraction.
	        matrix = torch.eye(size) * 0.8 + jitter * scenario.leakage_pct * 0.3
	    elif fault == "overfitting":
	        # Almost pure diagonal.
	        matrix = torch.eye(size) * 0.95 + jitter * 0.02
	    else:
	        # Moderate diagonal with noticeable confusion.
	        matrix = torch.eye(size) * 0.6 + jitter * 0.08

	    # Scale each row so its entries sum to roughly 1.0.
	    return (matrix / matrix.sum(dim=1, keepdim=True)).tolist()


	def gen_data_batch_stats(scenario: ScenarioParams) -> dict:
	    """Assemble the data-batch statistics dictionary for a scenario.

	    The result always has the same keys; only the feature statistics,
	    ``class_overlap_score`` and ``duplicate_ratio`` vary with
	    ``scenario.root_cause.value``. The confusion matrix comes from
	    ``_gen_confusion_matrix``.
	    """
	    torch.manual_seed(scenario.seed + 3)
	    fault = scenario.root_cause.value

	    # NOTE(review): _gen_confusion_matrix reseeds the global RNG with
	    # seed + 10, so the scalar draws below follow that call's RNG state
	    # rather than the seed + 3 set above. Left as-is so outputs for
	    # existing seeds do not change.
	    confusion = _gen_confusion_matrix(scenario)

	    # Draw order matters for reproducibility: mean first, then std, then
	    # (default case only) the overlap score.
	    if fault == "data_leakage":
	        mean = 0.45 + torch.randn(1).item() * 0.05
	        std = 0.22 + torch.randn(1).item() * 0.02
	        # Overlap grows with the leakage fraction but is capped at 0.92.
	        overlap = min(0.5 + scenario.leakage_pct * 1.5, 0.92)
	        duplicates = scenario.leakage_pct
	    elif fault == "overfitting":
	        mean = 0.48 + torch.randn(1).item() * 0.03
	        std = 0.25 + torch.randn(1).item() * 0.02
	        overlap = 0.0
	        duplicates = 0.0
	    else:
	        # Any other root cause: ordinary-looking data with a small,
	        # non-negative random overlap.
	        mean = 0.47 + torch.randn(1).item() * 0.03
	        std = 0.24 + torch.randn(1).item() * 0.02
	        overlap = 0.0 + torch.randn(1).abs().item() * 0.05
	        duplicates = 0.0

	    return {
	        "label_distribution": {label: 0.1 for label in range(10)},
	        "feature_mean": mean,
	        "feature_std": std,
	        "null_count": 0,
	        "class_overlap_score": overlap,
	        "batch_size": 64,
	        "duplicate_ratio": duplicates,
	        "confusion_matrix": confusion,
	    }