Spaces:

ujjwalpardeshi
/

pytorch-training-debugger

Sleeping

pytorch-training-debugger / ml_training_debugger /scenarios.py

omkarrr88

minor changes

206438f about 2 months ago

6.61 kB

	"""ScenarioParams and scenario sampling.

	Internal scenario configuration — not exposed to the agent.
	"""

	from __future__ import annotations

	import dataclasses
	from typing import Optional

	import torch

	from ml_training_debugger.models import RootCauseDiagnosis


	@dataclasses.dataclass(frozen=True)
	class ScenarioParams:
	    """Immutable record of the hidden settings behind one scenario.

	    Instances are frozen, so a scenario cannot be altered once it has been
	    built (the original note says they are created at reset() time).
	    Only ``task_id``, ``root_cause`` and ``seed`` are required; every other
	    field has a default and is overridden only by the tasks that use it.
	    """

	    # --- required identity of the scenario ---
	    task_id: str
	    root_cause: RootCauseDiagnosis
	    seed: int

	    # --- optimisation / data / architecture knobs ---
	    learning_rate: float = 0.001
	    weight_decay: float = 0.0001
	    leakage_pct: float = 0.0
	    depth_multiplier: float = 1.0
	    divergence_epoch: int = 5

	    # --- red-herring controls ---
	    red_herring_intensity: float = 1.0
	    red_herring_spike_layer: str = "fc"

	    # --- optional free-text attachments (None when a task has none) ---
	    bug_type: Optional[str] = None
	    notes: Optional[str] = None
	    error_log: Optional[str] = None

	    # --- episode-level settings ---
	    gpu_memory_used_gb: float = 6.2
	    max_steps: int = 20
	    model_type: str = "cnn"
	    difficulty_level: int = 3

	    # --- LR scheduler settings ---
	    scheduler_gamma: float = 0.1
	    scheduler_step_size: int = 10


	def _task_seed(task_id: str, seed: int) -> int:
	    """Derive a deterministic per-task seed from ``task_id`` and ``seed``.

	    The second underscore-separated component of ``task_id`` (``1`` for
	    ``"task_001"``) is parsed as an integer and added to ``seed * 1000``,
	    so different task numbers yield different seeds for the same base seed.

	    Args:
	        task_id: Identifier of the form ``<prefix>_<number>``.
	        seed: Base seed supplied by the caller.

	    Returns:
	        ``seed * 1000 + <task number>``.

	    Raises:
	        ValueError: If ``task_id`` has no integer second component.
	    """
	    try:
	        task_num = int(task_id.split("_")[1])
	    except (IndexError, ValueError) as exc:
	        # An id without an underscore used to surface as a bare IndexError;
	        # report every malformed id uniformly as ValueError, matching the
	        # "Unknown task_id" error raised by sample_scenario().
	        raise ValueError(f"Unknown task_id: {task_id}") from exc
	    return seed * 1000 + task_num


	def _choose(options: list, rng: torch.Generator) -> object:
	    """Return one element of ``options`` selected with the torch generator.

	    Exactly one integer is drawn from ``rng`` per call, in the range
	    ``[0, len(options))``, and used as the index into ``options``.
	    """
	    draw = torch.randint(low=0, high=len(options), size=(1,), generator=rng)
	    position = int(draw.item())
	    return options[position]


	def _pick_model_type(rng: torch.Generator) -> str:
	    """Draw the model architecture name, either ``"cnn"`` or ``"mlp"``.

	    Consumes one draw from ``rng`` via ``_choose``.
	    """
	    architectures = ["cnn", "mlp"]
	    picked = _choose(architectures, rng)
	    return str(picked)


	def sample_scenario(
	    task_id: str, seed: int = 42, difficulty_level: int = 3
	) -> ScenarioParams:
	    """Build the ScenarioParams for ``task_id`` with seeded random faults.

	    A dedicated torch generator is seeded from ``_task_seed(task_id, seed)``,
	    so the same ``(task_id, seed)`` pair always produces the same draws.
	    Every random value is taken from that generator, and the order of the
	    draws within each task branch determines the outcome.

	    Args:
	        task_id: One of task_001 through task_007.
	        seed: Base seed for reproducibility.
	        difficulty_level: 1 (easy signals) to 5 (max ambiguity). Default 3.
	            Stored on the result as given.

	    Returns:
	        ScenarioParams with randomized fault parameters. Its ``seed`` field
	        holds the derived per-task seed, not the ``seed`` argument.

	    Raises:
	        ValueError: If task_id is unknown. The seed is derived before the
	            task id is matched, so anything ``_task_seed`` raises for a
	            malformed id propagates from here as well.
	    """
	    effective_seed = _task_seed(task_id, seed)
	    rng = torch.Generator().manual_seed(effective_seed)

	    # Keyword arguments that are the same for every task.
	    shared = {
	        "task_id": task_id,
	        "seed": effective_seed,
	        "difficulty_level": difficulty_level,
	    }

	    if task_id == "task_001":
	        chosen_lr = _choose([0.05, 0.08, 0.10, 0.15, 0.30], rng)
	        architecture = _pick_model_type(rng)
	        return ScenarioParams(
	            root_cause=RootCauseDiagnosis.LR_TOO_HIGH,
	            learning_rate=float(chosen_lr),
	            error_log=f"RuntimeError: Loss is NaN at epoch 12 (lr={chosen_lr})",
	            max_steps=20,
	            model_type=architecture,
	            **shared,
	        )

	    if task_id == "task_002":
	        chosen_lr = _choose([1e-6, 5e-6, 1e-5], rng)
	        depth_scale = _choose([1.0, 1.5, 2.0], rng)
	        architecture = _pick_model_type(rng)
	        checkpoint_note = (
	            "Training resumed from a checkpoint saved at epoch 0 — "
	            "early learning rate warmup may still be in effect."
	        )
	        return ScenarioParams(
	            root_cause=RootCauseDiagnosis.VANISHING_GRADIENTS,
	            learning_rate=float(chosen_lr),
	            depth_multiplier=float(depth_scale),
	            notes=checkpoint_note,
	            max_steps=20,
	            model_type=architecture,
	            **shared,
	        )

	    if task_id == "task_003":
	        leak_fraction = _choose([0.12, 0.18, 0.22, 0.28], rng)
	        architecture = _pick_model_type(rng)
	        capacity_note = (
	            "Model architecture upgraded from 2-layer to 4-layer CNN "
	            "at epoch 2. Performance improvement may reflect increased "
	            "model capacity."
	        )
	        return ScenarioParams(
	            root_cause=RootCauseDiagnosis.DATA_LEAKAGE,
	            leakage_pct=float(leak_fraction),
	            notes=capacity_note,
	            max_steps=25,
	            model_type=architecture,
	            **shared,
	        )

	    if task_id == "task_004":
	        decay = _choose([0.0, 0.0001, 0.001], rng)
	        split_epoch = _choose([5, 8, 12], rng)
	        architecture = _pick_model_type(rng)
	        augmentation_note = (
	            "Dataset augmentation was disabled for this run to speed "
	            "up training. Re-enabling may improve generalization."
	        )
	        return ScenarioParams(
	            root_cause=RootCauseDiagnosis.OVERFITTING,
	            weight_decay=float(decay),
	            divergence_epoch=int(split_epoch),
	            notes=augmentation_note,
	            max_steps=25,
	            model_type=architecture,
	            **shared,
	        )

	    if task_id == "task_005":
	        # Continuous draw first, then the layer choice; the order is fixed.
	        herring_strength = torch.empty(1).uniform_(0.8, 2.5, generator=rng).item()
	        herring_layer = _choose(["fc", "conv1"], rng)
	        memory_warning = (
	            "Warning: GPU memory pressure detected, consider reducing "
	            "batch size or enabling gradient checkpointing"
	        )
	        return ScenarioParams(
	            root_cause=RootCauseDiagnosis.BATCHNORM_EVAL_MODE,
	            red_herring_intensity=float(herring_strength),
	            red_herring_spike_layer=str(herring_layer),
	            gpu_memory_used_gb=14.56,
	            error_log=memory_warning,
	            max_steps=30,
	            # Always CNN for the BatchNorm eval task: MLP BatchNorm1d
	            # behaves differently, so no architecture is drawn here.
	            model_type="cnn",
	            **shared,
	        )

	    if task_id == "task_006":
	        injected_bug = _choose(
	            ["eval_mode", "detach_loss", "zero_grad_missing", "inplace_relu"],
	            rng,
	        )
	        return ScenarioParams(
	            root_cause=RootCauseDiagnosis.CODE_BUG,
	            bug_type=str(injected_bug),
	            notes="Try adjusting the learning rate schedule.",
	            max_steps=30,
	            # Code templates reference CNN training, so the architecture
	            # stays CNN for consistency and is not drawn.
	            model_type="cnn",
	            **shared,
	        )

	    if task_id == "task_007":
	        decay_factor = _choose([0.01, 0.001, 0.0001], rng)
	        decay_every = _choose([2, 3, 5], rng)
	        architecture = _pick_model_type(rng)
	        return ScenarioParams(
	            root_cause=RootCauseDiagnosis.SCHEDULER_MISCONFIGURED,
	            scheduler_gamma=float(decay_factor),
	            scheduler_step_size=int(decay_every),
	            notes="LR scheduler was recently added to improve convergence.",
	            max_steps=25,
	            model_type=architecture,
	            **shared,
	        )

	    raise ValueError(f"Unknown task_id: {task_id}")