# Pasted VCS metadata preserved as a comment (was breaking the module syntax):
# omkarrr88 — "minor changes" (commit 206438f)
"""Rule-based heuristic baseline for the /baseline endpoint.
Extracted from app.py to keep route definitions clean.
"""
from __future__ import annotations
from ml_training_debugger.models import MLTrainingAction
from server.environment import MLTrainingEnvironment
# The seven debugging tasks the baseline is evaluated against
# (task_001 through task_007, zero-padded to three digits).
ALL_TASK_IDS = [f"task_{i:03d}" for i in range(1, 8)]
def run_baseline_all_tasks() -> dict[str, float]:
    """Run the rule-based heuristic baseline on every known task.

    Each task gets a fresh environment reset with a fixed seed so the
    baseline is deterministic across invocations.

    Returns:
        Mapping of task_id to grader score, rounded to 4 decimal places.
    """
    results: dict[str, float] = {}
    for tid in ALL_TASK_IDS:
        environment = MLTrainingEnvironment()
        # Fixed seed keeps baseline scores reproducible run-to-run.
        environment.reset(seed=42, episode_id=f"baseline_{tid}", task_id=tid)
        results[tid] = round(_run_heuristic_episode(environment), 4)
    return results
def _run_heuristic_episode(
    env: MLTrainingEnvironment, task_id: str = "",
) -> float:
    """Run one heuristic baseline episode. Returns grader score.

    Walks a fixed decision tree of inspections; the first failure mode that
    matches gets its fix applied, the run restarted, and a diagnosis marked,
    after which the grader score is returned. If nothing matches, the episode
    falls back to diagnosing "overfitting" without applying any fix.

    NOTE(review): task_id is accepted but never read in this body —
    presumably reserved for per-task specialization; confirm before removing.
    """
    # Step 1: inspect_gradients
    obs = env.step(MLTrainingAction(action_type="inspect_gradients"))
    if obs.gradient_stats:
        if any(g.is_exploding for g in obs.gradient_stats):
            # Exploding gradients -> lower the learning rate and rerun.
            env.step(MLTrainingAction(
                action_type="modify_config", target="learning_rate", value=0.001,
            ))
            env.step(MLTrainingAction(action_type="restart_run"))
            env.step(MLTrainingAction(
                action_type="mark_diagnosed", diagnosis="lr_too_high",
            ))
            return _get_score(env)
        if any(g.is_vanishing for g in obs.gradient_stats):
            # Vanishing gradients -> raise the learning rate and rerun.
            env.step(MLTrainingAction(
                action_type="modify_config", target="learning_rate", value=0.01,
            ))
            env.step(MLTrainingAction(action_type="restart_run"))
            env.step(MLTrainingAction(
                action_type="mark_diagnosed", diagnosis="vanishing_gradients",
            ))
            return _get_score(env)
    # Step 2: inspect_data_batch
    obs = env.step(MLTrainingAction(action_type="inspect_data_batch"))
    if obs.data_batch_stats and obs.data_batch_stats.class_overlap_score > 0.5:
        # Heavy class overlap in the batch -> treat as data leakage.
        env.step(MLTrainingAction(action_type="patch_data_loader"))
        env.step(MLTrainingAction(action_type="restart_run"))
        env.step(MLTrainingAction(
            action_type="mark_diagnosed", diagnosis="data_leakage",
        ))
        return _get_score(env)
    # Detect overfitting pattern now, while obs still carries data_batch_stats
    # (which _detect_overfitting reads); act on it only after later checks miss.
    looks_like_overfitting = _detect_overfitting(obs)
    # Step 3: inspect_model_modes
    obs = env.step(MLTrainingAction(action_type="inspect_model_modes"))
    if obs.model_mode_info:
        if any(v == "eval" for v in obs.model_mode_info.values()):
            # Some module reports "eval" mode during training -> fix it.
            env.step(MLTrainingAction(action_type="fix_model_mode"))
            env.step(MLTrainingAction(action_type="restart_run"))
            env.step(MLTrainingAction(
                action_type="mark_diagnosed", diagnosis="batchnorm_eval_mode",
            ))
            return _get_score(env)
    # Step 4: inspect_code (for Task 6)
    obs = env.step(MLTrainingAction(action_type="inspect_code"))
    if obs.code_snippet:
        code = obs.code_snippet.code
        _try_code_fix(env, code)
        # Only diagnose a code bug if _try_code_fix actually applied an edit
        # (session state records whether a fix action was taken).
        session = env._get_session()
        if session and session.state.fix_action_taken:
            env.step(MLTrainingAction(action_type="restart_run"))
            env.step(MLTrainingAction(
                action_type="mark_diagnosed", diagnosis="code_bug",
            ))
            return _get_score(env)
    # Step 5: scheduler issue (loss stagnates after initial progress)
    if _detect_scheduler_issue(obs):
        env.step(MLTrainingAction(
            action_type="modify_config", target="learning_rate", value=0.001,
        ))
        env.step(MLTrainingAction(action_type="restart_run"))
        env.step(MLTrainingAction(
            action_type="mark_diagnosed", diagnosis="scheduler_misconfigured",
        ))
        return _get_score(env)
    # Overfitting fallback: add regularization before rerunning.
    if looks_like_overfitting:
        env.step(MLTrainingAction(
            action_type="modify_config", target="weight_decay", value=0.01,
        ))
        env.step(MLTrainingAction(action_type="restart_run"))
        env.step(MLTrainingAction(
            action_type="mark_diagnosed", diagnosis="overfitting",
        ))
        return _get_score(env)
    # Final fallback: best-guess diagnosis with no fix applied.
    env.step(MLTrainingAction(
        action_type="mark_diagnosed", diagnosis="overfitting",
    ))
    return _get_score(env)
def _try_code_fix(env: MLTrainingEnvironment, code: str) -> None:
    """Apply the first matching rule-based fix for a known code-bug pattern.

    At most one fix_code action is issued; if no rule matches, the
    environment is left untouched.
    """
    # Ordered (matched, line, replacement) rules — first hit wins, mirroring
    # an if/elif chain. Predicates are cheap substring tests on the snippet.
    fix_rules = [
        (
            "model.eval()" in code and "model.train()" not in code,
            5,
            "model.train()",
        ),
        (
            ".detach()" in code,
            14,
            "        loss = criterion(output, batch_y)",
        ),
        (
            "inplace=True" in code,
            15,
            "        output = F.relu(output)",
        ),
        (
            "optimizer.zero_grad()" not in code and "optimizer.step()" in code,
            11,
            "        optimizer.zero_grad()",
        ),
    ]
    for matched, line_no, replacement in fix_rules:
        if matched:
            env.step(MLTrainingAction(
                action_type="fix_code", line=line_no, replacement=replacement,
            ))
            return
def _detect_overfitting(obs: object) -> bool:
"""Detect overfitting pattern from observation."""
if not (obs.val_loss_history and obs.training_loss_history
and len(obs.val_loss_history) >= 10):
return False
early_train = sum(obs.training_loss_history[:5]) / 5
late_train = sum(obs.training_loss_history[-5:]) / 5
early_val = sum(obs.val_loss_history[:5]) / 5
late_val = sum(obs.val_loss_history[-5:]) / 5
train_dropped = late_train < early_train * 0.5
train_loss_low = late_train < 0.15
val_not_improving = late_val >= early_val * 0.95
gap_widening = (late_val - late_train) > (early_val - early_train)
return (
(train_dropped or train_loss_low)
and (val_not_improving or gap_widening)
and obs.data_batch_stats
and obs.data_batch_stats.class_overlap_score < 0.3
)
def _detect_scheduler_issue(obs: object) -> bool:
"""Detect scheduler misconfiguration from loss history."""
if not (obs.training_loss_history and len(obs.training_loss_history) >= 10):
return False
early_loss = sum(obs.training_loss_history[:3]) / 3
mid_loss = sum(obs.training_loss_history[5:8]) / 3
finite_late = [v for v in obs.training_loss_history[-3:] if v != float("inf")]
late_loss = sum(finite_late) / max(len(finite_late), 1)
return early_loss > mid_loss and abs(late_loss - mid_loss) < 0.3
def _get_score(env: MLTrainingEnvironment) -> float:
"""Extract the grader score from the environment."""
session = env._get_session()
if session and session.last_score is not None:
return session.last_score
return 0.0