Spaces:

ujjwalpardeshi
/

pytorch-training-debugger

Sleeping

pytorch-training-debugger / openenv.yaml

UjjwalPardeshi

fix: add has_grader: true to all tasks for evaluator grader check

fc3fbaf about 1 month ago

2.73 kB

	spec_version: 1
	name: pytorch-training-debugger
	type: space
	runtime: fastapi
	app: server.app:app
	port: 7860

	version: "1.1.0"
	description: \|
	PyTorch-native fault injection engine for training failure debugging.
	An AI agent investigates, diagnoses, fixes, and verifies broken
	training runs using real torch.nn.Module models (CNN + MLP), torch.autograd
	gradients, state_dict() weight inspection, and PyTorch code-level
	debugging. 7 tasks across 3 difficulty tiers with context-gated
	reward shaping, difficulty scaling (1-5), confusion matrices, and
	a live diagnostic dashboard.
	framework: openenv
	tags:
	- ml-debugging
	- pytorch
	- reinforcement-learning
	- root-cause-analysis
	- fault-injection
	- code-debugging
	- openenv

	observation_space:
	type: MLTrainingObservation
	description: "Training run snapshot with progressive reveal — gradients, weights, data stats, model modes, code snippets, and confusion matrices revealed on inspection"

	action_space:
	type: MLTrainingAction
	description: "Investigation, fix, code-fix, and diagnosis actions with dynamic availability"

	tasks:
	- id: task_001
	difficulty: easy
	max_steps: 20
	has_grader: true
	param_ranges:
	learning_rate: [0.05, 0.08, 0.10, 0.15, 0.30]

	- id: task_002
	difficulty: easy
	max_steps: 20
	has_grader: true
	param_ranges:
	learning_rate: [1e-6, 5e-6, 1e-5]
	depth_multiplier: [1.0, 1.5, 2.0]

	- id: task_003
	difficulty: medium
	max_steps: 25
	has_grader: true
	param_ranges:
	leakage_pct: [0.12, 0.18, 0.22, 0.28]

	- id: task_004
	difficulty: medium
	max_steps: 25
	has_grader: true
	param_ranges:
	weight_decay: [0.0, 0.0001, 0.001]
	divergence_epoch: [5, 8, 12]

	- id: task_005
	difficulty: hard
	max_steps: 30
	has_grader: true
	param_ranges:
	red_herring_intensity: [0.8, 2.5]

	- id: task_006
	difficulty: hard
	max_steps: 30
	has_grader: true
	param_ranges:
	bug_type: [eval_mode, detach_loss, zero_grad_missing, inplace_relu]

	- id: task_007
	difficulty: hard
	max_steps: 25
	has_grader: true
	param_ranges:
	scheduler_gamma: [0.01, 0.001, 0.0001]
	scheduler_step_size: [2, 3, 5]

	reward:
	range: [-1.0, 1.0]
	shaped: true
	step_penalty: -0.01
	investigation_bonus: 0.05
	max_investigation_bonus: 0.25
	correct_diagnosis: 0.50
	terminal_convergence: 0.40

	endpoints:
	websocket: "/ws"
	tasks: "GET /tasks"
	grader: "POST /grader"
	baseline: "POST /baseline"
	health: "GET /health"
	dashboard: "GET /dashboard"
	validation_report: "GET /validation-report"
	curriculum: "GET /curriculum"
	leaderboard: "GET /leaderboard"
	replay: "GET /replay/{episode_id}"