File size: 2,727 Bytes
e2f8b29
 
 
 
 
 
 
0b9b77b
e2f8b29
 
 
0b9b77b
e2f8b29
0b9b77b
 
 
e2f8b29
 
 
 
 
 
 
9e6a926
e2f8b29
 
 
 
0b9b77b
e2f8b29
 
 
9e6a926
e2f8b29
 
 
 
 
fc3fbaf
9e6a926
 
 
 
 
 
fc3fbaf
9e6a926
 
 
 
e2f8b29
 
 
fc3fbaf
9e6a926
 
 
 
 
 
fc3fbaf
9e6a926
 
 
 
e2f8b29
 
 
fc3fbaf
9e6a926
 
 
 
 
 
fc3fbaf
9e6a926
 
e2f8b29
0b9b77b
4414fa9
0b9b77b
fc3fbaf
0b9b77b
 
 
 
e2f8b29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e6a926
4f58e42
0b9b77b
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# Manifest header: schema version, environment name, and launch settings.
spec_version: 1
name: 'pytorch-training-debugger'
# Deployment kind and server runtime.
type: 'space'
runtime: 'fastapi'
# Presumably a "module:attribute" import string for the server app; confirm
# against the launcher. Quoted so the embedded colon is never ambiguous.
app: 'server.app:app'
port: 7860

# Quoted so the version is kept as a string.
version: "1.1.0"
# Literal block scalar (`|`): the line breaks inside the text below are kept
# as part of the value. Keep the task and tier counts quoted in the text in
# sync with the `tasks` list further down.
description: |
  PyTorch-native fault injection engine for training failure debugging.
  An AI agent investigates, diagnoses, fixes, and verifies broken
  training runs using real torch.nn.Module models (CNN + MLP), torch.autograd
  gradients, state_dict() weight inspection, and PyTorch code-level
  debugging. 7 tasks across 3 difficulty tiers with context-gated
  reward shaping, difficulty scaling (1-5), confusion matrices, and
  a live diagnostic dashboard.
framework: openenv  # also listed under `tags`
# Labels attached to this environment; every item is an explicit string.
tags:
  - 'ml-debugging'
  - 'pytorch'
  - 'reinforcement-learning'
  - 'root-cause-analysis'
  - 'fault-injection'
  - 'code-debugging'
  - 'openenv'

# Observation schema. `type` presumably names a model class defined in the
# server code; confirm. The description is a folded scalar (`>-`): the lines
# are joined with single spaces and no trailing newline is kept.
observation_space:
  type: MLTrainingObservation
  description: >-
    Training run snapshot with progressive reveal — gradients, weights,
    data stats, model modes, code snippets, and confusion matrices
    revealed on inspection

# Action schema. `type` presumably names a model class defined in the server
# code; confirm. The description is a folded scalar (`>-`): the lines are
# joined with single spaces and no trailing newline is kept.
action_space:
  type: MLTrainingAction
  description: >-
    Investigation, fix, code-fix, and diagnosis actions
    with dynamic availability

# Task catalogue. Each entry has an id, a difficulty tier (easy / medium /
# hard), a step budget (`max_steps`), a `has_grader` flag, and `param_ranges`
# listing the values given for each fault parameter (presumably sampled per
# episode; confirm against the task generator).
tasks:
  - id: task_001
    difficulty: easy
    max_steps: 20
    has_grader: true
    param_ranges:
      learning_rate: [0.05, 0.08, 0.10, 0.15, 0.30]

  - id: task_002
    difficulty: easy
    max_steps: 20
    has_grader: true
    param_ranges:
      # Exponent floats are written with a decimal point (1.0e-6, not 1e-6):
      # YAML 1.1 loaders such as PyYAML resolve the dot-less form as a string,
      # not a float. The dotted form is a float under both YAML 1.1 and 1.2.
      learning_rate: [1.0e-6, 5.0e-6, 1.0e-5]
      depth_multiplier: [1.0, 1.5, 2.0]

  - id: task_003
    difficulty: medium
    max_steps: 25
    has_grader: true
    param_ranges:
      leakage_pct: [0.12, 0.18, 0.22, 0.28]

  - id: task_004
    difficulty: medium
    max_steps: 25
    has_grader: true
    param_ranges:
      weight_decay: [0.0, 0.0001, 0.001]
      divergence_epoch: [5, 8, 12]

  - id: task_005
    difficulty: hard
    max_steps: 30
    has_grader: true
    param_ranges:
      # NOTE(review): only two values here. Confirm whether they are discrete
      # choices (like the other lists) or [min, max] bounds.
      red_herring_intensity: [0.8, 2.5]

  - id: task_006
    difficulty: hard
    max_steps: 30
    has_grader: true
    param_ranges:
      bug_type: [eval_mode, detach_loss, zero_grad_missing, inplace_relu]

  - id: task_007
    difficulty: hard
    # NOTE(review): the other hard tasks allow 30 steps. Confirm that 25 is
    # intentional.
    max_steps: 25
    has_grader: true
    param_ranges:
      scheduler_gamma: [0.01, 0.001, 0.0001]
      scheduler_step_size: [2, 3, 5]

# Reward configuration: declared bounds plus the per-component amounts.
# NOTE(review): the positive components below can add up to
# 0.25 + 0.50 + 0.40 = 1.15, which is above the upper bound of `range`.
# Presumably the total is clipped; confirm in the reward code.
reward:
  range: [-1.0, 1.0]
  shaped: true
  step_penalty: -0.01
  investigation_bonus: 0.05
  # Equals five `investigation_bonus` increments (5 x 0.05); presumably the cap
  # on the accumulated investigation bonus. Confirm.
  max_investigation_bonus: 0.25
  correct_diagnosis: 0.50
  terminal_convergence: 0.40
# Routes listed for the server: the websocket path first, then one
# '<METHOD> <path>' string per HTTP route. Single-quoted because none of the
# values needs an escape sequence.
endpoints:
  websocket: '/ws'
  tasks: 'GET /tasks'
  grader: 'POST /grader'
  baseline: 'POST /baseline'
  health: 'GET /health'
  dashboard: 'GET /dashboard'
  validation_report: 'GET /validation-report'
  curriculum: 'GET /curriculum'
  leaderboard: 'GET /leaderboard'
  replay: 'GET /replay/{episode_id}'