omkarrr88 committed on
Commit
da20dfa
·
1 Parent(s): 6be6d8e

minor changes

Browse files
Files changed (9) hide show
  1. .gitignore +7 -0
  2. Dockerfile +0 -1
  3. baseline_inference.py +0 -241
  4. docs/PAPER.md +0 -56
  5. docs/PRD.md +0 -367
  6. docs/PROJECT_GUIDE.md +0 -691
  7. docs/ROADMAP.md +0 -441
  8. inference.py +4 -2
  9. run_all_baselines.py +0 -130
.gitignore CHANGED
@@ -20,3 +20,10 @@ CLAUDE.md
20
  uv.lock
21
  deploy-hf.sh
22
  deploy.sh
 
 
 
 
 
 
 
 
20
  uv.lock
21
  deploy-hf.sh
22
  deploy.sh
23
+ AUDIT_REPORT.md
24
+ baseline_inference.py
25
+ run_all_baselines.py
26
+ docs/PAPER.md
27
+ docs/PRD.md
28
+ docs/ROADMAP.md
29
+ docs/PROJECT_GUIDE.md
Dockerfile CHANGED
@@ -96,7 +96,6 @@ COPY ml_training_debugger/ ml_training_debugger/
96
  COPY server/ server/
97
  COPY openenv.yaml .
98
  COPY baseline_heuristic.py .
99
- COPY baseline_inference.py .
100
  COPY inference.py .
101
  COPY demo.py .
102
  COPY README.md .
 
96
  COPY server/ server/
97
  COPY openenv.yaml .
98
  COPY baseline_heuristic.py .
 
99
  COPY inference.py .
100
  COPY demo.py .
101
  COPY README.md .
baseline_inference.py DELETED
@@ -1,241 +0,0 @@
1
- #!/usr/bin/env python3
2
- """LLM baseline agent using Google Gemini (via OpenAI-compatible SDK).
3
-
4
- Requires GEMINI_API_KEY environment variable (or pass via --api-key).
5
- Uses temperature=0.0 for near-deterministic behavior.
6
- Spec reference: Section 17.
7
-
8
- Usage:
9
- GEMINI_API_KEY=... python baseline_inference.py
10
- python baseline_inference.py --api-key YOUR_KEY
11
- """
12
-
13
- from __future__ import annotations
14
-
15
- import argparse
16
- import json
17
- import os
18
- import sys
19
- from pathlib import Path
20
-
21
- # Load .env file if present
22
- _env_path = Path(__file__).parent / ".env"
23
- if _env_path.exists():
24
- for line in _env_path.read_text().splitlines():
25
- line = line.strip()
26
- if line and not line.startswith("#") and "=" in line:
27
- key, _, value = line.partition("=")
28
- os.environ.setdefault(key.strip(), value.strip())
29
-
30
- try:
31
- from openai import OpenAI
32
- except ImportError:
33
- print("Error: openai package not installed. Run: pip install openai")
34
- sys.exit(1)
35
-
36
- from ml_training_debugger.models import MLTrainingAction
37
- from server.environment import MLTrainingEnvironment
38
-
39
- ALL_TASKS = [
40
- "task_001",
41
- "task_002",
42
- "task_003",
43
- "task_004",
44
- "task_005",
45
- "task_006",
46
- "task_007",
47
- ]
48
-
49
- SYSTEM_PROMPT = """You are an expert ML engineer debugging a PyTorch training run.
50
- You are interacting with an environment that simulates a broken training job.
51
-
52
- Available actions (respond with JSON only, no explanation):
53
- - {"action_type": "inspect_gradients"} - View gradient statistics per layer
54
- - {"action_type": "inspect_data_batch"} - View data batch statistics and confusion matrix
55
- - {"action_type": "inspect_model_modes"} - View model layer modes (train/eval)
56
- - {"action_type": "inspect_model_weights"} - View model weight statistics
57
- - {"action_type": "inspect_code"} - View PyTorch training code
58
- - {"action_type": "modify_config", "target": "<field>", "value": <val>} - Change a hyperparameter
59
- - {"action_type": "add_callback"} - Add gradient clipping/scheduler
60
- - {"action_type": "patch_data_loader"} - Fix data pipeline issues
61
- - {"action_type": "fix_model_mode"} - Call model.train()
62
- - {"action_type": "fix_code", "line": <int>, "replacement": "<code>"} - Fix a code line
63
- - {"action_type": "restart_run"} - Restart training (requires a fix first)
64
- - {"action_type": "mark_diagnosed", "diagnosis": "<cause>"} - Submit diagnosis
65
-
66
- Valid diagnoses: lr_too_high, vanishing_gradients, data_leakage, overfitting, batchnorm_eval_mode, code_bug, scheduler_misconfigured
67
-
68
- Strategy:
69
- 1. First investigate by inspecting gradients, data, model modes, and code
70
- 2. Form a hypothesis based on the evidence gathered
71
- 3. Apply the correct fix for the identified root cause
72
- 4. Restart training to verify the fix works
73
- 5. Submit your diagnosis
74
-
75
- IMPORTANT: Respond with ONLY a valid JSON action object. No explanation, no markdown, no code blocks."""
76
-
77
-
78
- def run_llm_episode(task_id: str, client: OpenAI, model_name: str) -> float:
79
- """Run one LLM agent episode."""
80
- env = MLTrainingEnvironment()
81
- obs = env.reset(seed=42, episode_id=f"llm_{task_id}", task_id=task_id)
82
-
83
- initial_obs = {
84
- "training_loss_history": obs.training_loss_history[:5],
85
- "val_accuracy_history": obs.val_accuracy_history[:5],
86
- "current_config": obs.current_config.model_dump(),
87
- "error_log": obs.error_log,
88
- "available_actions": obs.available_actions,
89
- "notes": obs.notes,
90
- "gpu_memory_used_gb": obs.gpu_memory_used_gb,
91
- }
92
-
93
- messages = [
94
- {"role": "system", "content": SYSTEM_PROMPT},
95
- {
96
- "role": "user",
97
- "content": f"New episode started for a broken PyTorch training run.\n\nInitial observation:\n{json.dumps(initial_obs, indent=2, default=str)}",
98
- },
99
- ]
100
-
101
- for step in range(25):
102
- if obs.done:
103
- break
104
-
105
- try:
106
- response = client.chat.completions.create(
107
- model=model_name,
108
- messages=messages,
109
- temperature=0.0,
110
- max_tokens=300,
111
- )
112
- action_text = response.choices[0].message.content.strip()
113
- except Exception as e:
114
- print(f" Step {step}: API error — {e}", file=sys.stderr)
115
- break
116
-
117
- # Clean up common LLM formatting issues
118
- action_text = action_text.strip("`").strip()
119
- if action_text.startswith("json"):
120
- action_text = action_text[4:].strip()
121
-
122
- messages.append({"role": "assistant", "content": action_text})
123
-
124
- try:
125
- action_data = json.loads(action_text)
126
- action = MLTrainingAction(**action_data)
127
- except (json.JSONDecodeError, Exception) as e:
128
- messages.append(
129
- {
130
- "role": "user",
131
- "content": f"Invalid action format: {e}. Respond with ONLY valid JSON.",
132
- }
133
- )
134
- continue
135
-
136
- obs = env.step(action)
137
-
138
- obs_summary: dict = {
139
- "reward": obs.reward,
140
- "done": obs.done,
141
- "step": obs.episode_state.step_count,
142
- "available_actions": obs.available_actions,
143
- }
144
- if obs.error_log:
145
- obs_summary["error_log"] = obs.error_log
146
- if obs.gradient_stats:
147
- obs_summary["gradient_stats"] = [
148
- {
149
- "layer": g.layer_name,
150
- "mean_norm": round(g.mean_norm, 4),
151
- "exploding": g.is_exploding,
152
- "vanishing": g.is_vanishing,
153
- }
154
- for g in obs.gradient_stats
155
- ]
156
- if obs.data_batch_stats:
157
- obs_summary["data_overlap"] = obs.data_batch_stats.class_overlap_score
158
- obs_summary["duplicate_ratio"] = obs.data_batch_stats.duplicate_ratio
159
- if obs.model_mode_info:
160
- obs_summary["model_modes"] = obs.model_mode_info
161
- if obs.code_snippet:
162
- obs_summary["code"] = obs.code_snippet.code[:600]
163
- obs_summary["hint"] = obs.code_snippet.hint
164
-
165
- messages.append(
166
- {
167
- "role": "user",
168
- "content": f"Observation after your action:\n{json.dumps(obs_summary, indent=2, default=str)}",
169
- }
170
- )
171
-
172
- session = env._get_session()
173
- return session.last_score if session and session.last_score is not None else 0.0
174
-
175
-
176
- PROVIDERS = {
177
- "groq": {
178
- "env_key": "GROQ_API_KEY",
179
- "base_url": "https://api.groq.com/openai/v1",
180
- "default_model": "llama-3.3-70b-versatile",
181
- },
182
- "cerebras": {
183
- "env_key": "CEREBRAS_API_KEY",
184
- "base_url": "https://api.cerebras.ai/v1",
185
- "default_model": "llama3.1-8b",
186
- },
187
- "gemini": {
188
- "env_key": "GEMINI_API_KEY",
189
- "base_url": "https://generativelanguage.googleapis.com/v1beta/openai/",
190
- "default_model": "gemini-2.0-flash",
191
- },
192
- "openai": {
193
- "env_key": "OPENAI_API_KEY",
194
- "base_url": None,
195
- "default_model": "gpt-4o",
196
- },
197
- }
198
-
199
-
200
- def main() -> None:
201
- parser = argparse.ArgumentParser(description="LLM baseline agent")
202
- parser.add_argument("--url", default="http://localhost:7860")
203
- parser.add_argument("--api-key", default=None, help="API key")
204
- parser.add_argument(
205
- "--provider",
206
- default="groq",
207
- choices=list(PROVIDERS.keys()),
208
- help="LLM provider (default: groq)",
209
- )
210
- parser.add_argument("--model", default=None, help="Model name (auto-detected from provider)")
211
- args = parser.parse_args()
212
-
213
- prov = PROVIDERS[args.provider]
214
- api_key = args.api_key or os.environ.get(prov["env_key"])
215
- if not api_key:
216
- print(f"Error: Set {prov['env_key']} env var or pass --api-key")
217
- sys.exit(1)
218
-
219
- model_name = args.model or prov["default_model"]
220
- client_kwargs: dict = {"api_key": api_key}
221
- if prov["base_url"]:
222
- client_kwargs["base_url"] = prov["base_url"]
223
- client = OpenAI(**client_kwargs)
224
-
225
- scores: dict[str, float] = {}
226
- print(f"Running LLM baseline with {args.provider}/{model_name}...", file=sys.stderr)
227
-
228
- for task_id in ALL_TASKS:
229
- try:
230
- score = run_llm_episode(task_id, client, model_name)
231
- scores[task_id] = round(score, 4)
232
- print(f" {task_id}: {score:.4f}", file=sys.stderr)
233
- except Exception as e:
234
- print(f" {task_id}: ERROR — {e}", file=sys.stderr)
235
- scores[task_id] = 0.0
236
-
237
- print(json.dumps(scores, indent=2))
238
-
239
-
240
- if __name__ == "__main__":
241
- main()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
docs/PAPER.md DELETED
@@ -1,56 +0,0 @@
1
- # Context-Gated Reward Shaping for Evidence-Based ML Debugging
2
-
3
- ## Abstract
4
-
5
- We present a reinforcement learning environment for training AI agents to debug broken PyTorch training runs. The environment introduces **context-gated reward shaping** — a penalty mechanism that distinguishes between reasonable prior actions (no penalty) and actions that ignore evidence the agent has already gathered (penalty). This single mechanic encodes evidence-based reasoning directly into the reward signal, teaching agents to reason about their accumulated knowledge rather than follow fixed playbooks. The environment covers 7 failure scenarios across 3 difficulty tiers, uses real PyTorch model internals (torch.nn.Module, torch.autograd, state_dict()), and includes a code-level debugging task where agents must read and fix actual Python source code.
6
-
7
- ## Motivation
8
-
9
- ML teams spend 15-25% of engineer time debugging silent training failures — runs that produce no errors, just mysteriously bad metrics. Each misdiagnosed restart wastes GPU compute billed at $2-8 per GPU-hour. Existing RL environments focus on games, navigation, or text tasks. No existing environment trains agents for the diagnostic reasoning process that ML engineers perform daily: gathering evidence from gradients, weights, data, and code; forming hypotheses under uncertainty; and making evidence-based decisions about which fix to apply.
10
-
11
- ## Method: Context-Gated Reward Shaping
12
-
13
- Standard RL environments use stateless rewards: "did action X happen?" Our environment tracks the agent's information state and conditions penalties on what the agent has already observed.
14
-
15
- **Core mechanic:** An agent that adds gradient clipping *before* inspecting gradients follows a reasonable prior — no penalty. An agent that inspects gradients, sees they are normal, and *then* adds gradient clipping is ignoring counter-evidence — **-0.20 penalty**.
16
-
17
- Formally: the penalty fires when `gradients_inspected == True AND gradients_were_normal == True AND action == add_callback`. This gate requires two conditions to be jointly true, both of which depend on prior agent actions.
18
-
19
- This teaches agents a transferable skill: *don't ignore what you've already learned*. In real MLOps, ignoring gathered evidence leads to wasted GPU hours and delayed incident resolution.
20
-
21
- ## Environment Design
22
-
23
- - **7 tasks** across 3 difficulty tiers (easy, medium, hard) with difficulty scaling (1-5)
24
- - **Real PyTorch models**: SimpleCNN (~50K params) and SimpleMLP (~20K params) with real torch.autograd gradients
25
- - **Progressive information reveal**: agents must actively choose what to investigate
26
- - **Code-level debugging** (Task 6): agent reads PyTorch source and submits line-by-line fixes
27
- - **Red herring injection** (Task 5): misleading gradient spikes, GPU memory warnings, near-vanishing layers
28
- - **Confusion matrices** in data batch inspection for richer diagnostic signals
29
- - **7 diagnosis types**: lr_too_high, vanishing_gradients, data_leakage, overfitting, batchnorm_eval_mode, code_bug, scheduler_misconfigured
30
-
31
- ## Results
32
-
33
- Three-agent comparison demonstrates the environment differentiates across agent types:
34
-
35
- | Task | Heuristic | Llama 3.3 70B | Llama 3.1 8B |
36
- |------|-----------|---------------|--------------|
37
- | task_001 | **1.00** | 1.00 | 0.60 |
38
- | task_002 | **1.00** | 1.00 | 0.05 |
39
- | task_003 | **1.00** | 0.40 | 0.40 |
40
- | task_004 | 0.45 | 0.45 | **0.60** |
41
- | task_005 | **1.00** | 1.00 | 1.00 |
42
- | task_006 | **1.00** | — | 0.60 |
43
- | task_007 | **1.00** | — | 0.60 |
44
- | **Average** | **0.92** | 0.69* | 0.55 |
45
-
46
- Key findings: (1) Model size matters — the 70B model's average score (0.69) is roughly 25% higher, in relative terms, than the 8B model's (0.55). (2) The domain-specific heuristic (0.92) outperforms both general-purpose LLMs (0.55-0.69), indicating that the environment rewards systematic debugging. (3) Task 4 is the exception: the 8B LLM (0.60) beats the rigid heuristic (0.45) on subtle real training curves, while the 70B model only matches it (0.45).
47
-
48
- ## Conclusion
49
-
50
- Context-gated reward shaping is a general technique applicable to any RL environment where agents must reason about accumulated evidence. By conditioning penalties on the agent's information state, we create environments that reward systematic investigation over pattern-matching — a capability with direct transfer value to real-world MLOps debugging.
51
-
52
- The environment is deployed as an OpenEnv-compatible Docker container on Hugging Face Spaces with full API documentation, a live diagnostic dashboard, and bit-exact reproducible baselines.
53
-
54
- ---
55
-
56
- *Built for the Meta PyTorch OpenEnv Hackathon x Scaler School of Technology, 2026.*
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
docs/PRD.md DELETED
@@ -1,367 +0,0 @@
1
- # PRD — PyTorch Training Run Debugger
2
-
3
- **Product:** OpenEnv RL environment for ML training failure diagnosis
4
- **Hackathon:** Meta PyTorch OpenEnv Hackathon x Scaler School of Technology, Round 1
5
- **Deadline:** April 8, 2026 (submission window opens March 28)
6
- **Runtime:** Python 3.12 · PyTorch CPU-only · openenv-core v0.2.2
7
- **Source of truth:** `ml-training-debugger-spec.md` for all implementation detail beyond this PRD
8
-
9
- ---
10
-
11
- ## 1. Overview
12
-
13
- ### 1.1 What We Are Building
14
-
15
- An OpenEnv-compliant reinforcement learning environment where an AI agent receives a snapshot of a broken PyTorch training run and must investigate, diagnose, fix, and verify the failure through a multi-step interactive process. The environment exposes real PyTorch model internals (gradients from `torch.autograd`, weights from `model.state_dict()`) and covers 6 failure scenarios across 3 difficulty tiers.
16
-
17
- ### 1.2 Problem Being Solved
18
-
19
- MLOps teams spend 15-25% of engineer time debugging silent training failures — runs that produce no error, no crash, just bad metrics. Each misdiagnosed restart wastes GPU compute at $2-8/hour/card. The diagnostic process is hard because:
20
-
21
- - Multiple symptoms can point to multiple causes simultaneously
22
- - Some bugs produce no error — just mysteriously bad performance
23
- - Fixing the wrong thing wastes hours of compute and restarts
24
- - Static analysis catches some bugs but cannot reason through ambiguous runtime signals
25
-
26
- No existing OpenEnv environment covers this domain. The OpenEnv Hub currently contains a demo echo environment and a code execution environment. This fills a genuine gap.
27
-
28
- ### 1.3 Why This Domain Wins
29
-
30
- 1. **Strategic alignment** — PyTorch debugging for a Meta PyTorch hackathon. Judges from Meta and Hugging Face will see their own framework as the core subject matter.
31
- 2. **Novel reward design** — Context-gated penalties that encode evidence-based reasoning into the reward signal. No existing OpenEnv environment attempts this.
32
- 3. **Code-level debugging** — Task 6 requires the agent to read and fix actual PyTorch code. Directly addresses Meta's interest: can an AI agent debug PyTorch?
33
- 4. **Ecosystem gap** — Zero competition in the OpenEnv ecosystem for ML training failure diagnosis.
34
-
35
- ### 1.4 Key Differentiators
36
-
37
- | Differentiator | What It Is | Why It Matters |
38
- |---|---|---|
39
- | Context-gated reward shaping | Penalty fires only when agent ignores evidence it already gathered; no penalty for reasonable priors | Encodes evidence-based decision making — a capability no other OpenEnv environment has |
40
- | PyTorch-native internals | Real `torch.nn.Module` models, real `torch.autograd` gradients, real `state_dict()` snapshots | Every model-level observation is grounded in real PyTorch computation, not synthetic data |
41
- | Code-level debugging (Task 6) | Agent reads PyTorch code, identifies buggy line, submits code fix | Tests code understanding, not just metric interpretation — aligned with Meta's core interest |
42
-
43
- ---
44
-
45
- ## 2. Target Users
46
-
47
- ### 2.1 Primary: Hackathon Judges (Meta + Hugging Face Engineers)
48
-
49
- **What they evaluate:**
50
- - Real-world utility (30%) — Is this a genuine task? Would someone use this to train/evaluate agents?
51
- - Task & grader quality (25%) — Well-defined tasks, accurate graders, meaningful difficulty progression?
52
- - Environment design (20%) — Clean state management, sensible action/observation spaces, good reward shaping?
53
- - Code quality & spec compliance (15%) — OpenEnv spec, clean structure, typed models, working Dockerfile?
54
- - Creativity & novelty (10%) — Novel domain, interesting mechanics, original approach?
55
-
56
- **What impresses them:**
57
- - Real `import torch` in core modules (not numpy wrappers)
58
- - A live dashboard where they can watch an agent investigate in real time
59
- - Deterministic graders that produce different scores for different agent quality levels
60
- - The context-gated penalty — nuanced reward design that goes beyond standard practice
61
-
62
- **What disqualifies:**
63
- - HF Space doesn't deploy or respond to `reset()`
64
- - Plagiarized or trivially modified existing environments
65
- - Graders that always return the same score
66
- - No baseline inference script
67
- - Dockerfile doesn't build
68
-
69
- ### 2.2 Secondary: RL Researchers and Agent Developers
70
-
71
- **What they need:**
72
- - A challenging benchmark that differentiates heuristic agents from reasoning-capable ones
73
- - Clear, typed action/observation schemas for agent integration
74
- - Reproducible baseline scores for comparison
75
- - Environments that produce meaningful reward signal across the full trajectory (not just sparse terminal reward)
76
-
77
- ### 2.3 Tertiary: Auto-Validation System (Phase 1 Gate)
78
-
79
- A non-human "user" that must pass before any human judge sees the submission:
80
- - Pings HF Space URL — must return 200 and respond to `reset()`
81
- - Validates `openenv.yaml`, typed models, `step()`/`reset()`/`state()` endpoints
82
- - Runs `docker build` on submitted repo
83
- - Runs baseline script twice — scores must be identical
84
- - Enumerates tasks, runs each grader — scores must be in [0.0, 1.0]
85
-
86
- ---
87
-
88
- ## 3. Success Metrics
89
-
90
- ### 3.1 Evaluation Criteria Targets
91
-
92
- | Criterion | Weight | Target Score | How We Hit It |
93
- |---|---|---|---|
94
- | Real-world utility | 30% | 26-30 | ML debugging is a $B+ problem; every PyTorch team encounters these failures; fills a genuine OpenEnv gap |
95
- | Task & grader quality | 25% | 21-25 | 6 tasks (3 MVP), 3 difficulty tiers, deterministic graders, hard tasks challenge frontier models |
96
- | Environment design | 20% | 17-20 | Progressive reveal, context-gated penalties, dynamic `available_actions`, proper episode boundaries |
97
- | Code quality & spec compliance | 15% | 13-15 | Full OpenEnv spec, typed Pydantic models, working Dockerfile + HF Space, two baselines |
98
- | Creativity & novelty | 10% | 9-10 | Context-gated rewards, real PyTorch model internals, code fix task — all new to OpenEnv |
99
- | **Total** | **100%** | **86-100** | |
100
-
101
- ### 3.2 Quantitative Success Criteria
102
-
103
- | Metric | Target | Measurement |
104
- |---|---|---|
105
- | Auto-validation | Pass all 5 gates | `openenv validate` + smoke test sequence |
106
- | Grader score range | Meaningful variance per task | Heuristic baseline ~0.30-0.85 across tasks (not flat) |
107
- | Heuristic-LLM gap | Measurable difference | LLM scores higher than heuristic on Tasks 5 and 6 |
108
- | `reset()` latency | <200ms | Model instantiation + 2 forward passes + parametric curves |
109
- | `step()` latency | <10ms | Action dispatch + reward computation + state update |
110
- | Baseline reproducibility | Bit-exact across runs | `diff run1.json run2.json` produces no output |
111
- | Docker image size | <500MB | PyTorch CPU-only + python:3.12-slim |
112
- | Test coverage | >80% | `pytest --cov` |
113
-
114
- ### 3.3 Qualitative Success Criteria
115
-
116
- - A judge can open `/dashboard`, trigger a baseline run, and understand the agent's reasoning at a glance
117
- - Task 5 (BatchNorm eval mode) visibly differentiates disciplined investigation from red-herring chasing
118
- - Task 6 (code bug) produces a "wow" moment — an agent reading and fixing PyTorch code in front of Meta judges
119
- - The context-gated penalty creates a story: "this agent gathered evidence and then ignored it"
120
-
121
- ---
122
-
123
- ## 4. Functional Requirements
124
-
125
- > **Complete typed specifications for all data models, actions, observations, tasks, reward components, and error handling are in `ml-training-debugger-spec.md` Sections 10-16.** This section provides a product-level summary.
126
-
127
- ### 4.1 Agent Interaction Loop
128
-
129
- ```
130
- reset(task_id) → initial observation (loss curves, config, error log — no gradients/weights/data/code)
131
-
132
- step(action) → updated observation + reward + done flag (progressive reveal)
133
-
134
- ... repeat ...
135
-
136
- step(mark_diagnosed) → terminal observation, done=True, episode scored by grader
137
- ```
138
-
139
- ### 4.2 Observation Space Summary
140
-
141
- The `MLTrainingObservation` extends `Observation` from openenv-core. Key design:
142
- - **Always visible from reset:** loss/accuracy histories, config, error_log, GPU memory, episode state, available actions
143
- - **Progressively revealed:** gradient stats (real torch.autograd), weight stats (real state_dict), data batch stats, model mode info, code snippets — each populated only after the corresponding `inspect_*` action
144
- - All fields are typed Pydantic models with explicit types. See spec Section 10 for complete field definitions.
145
-
146
- ### 4.3 Action Space Summary
147
-
148
- The `MLTrainingAction` extends `Action` from openenv-core. 14 action types in 3 categories:
149
- - **Investigation** (5): `inspect_gradients`, `inspect_data_batch`, `inspect_model_modes`, `inspect_model_weights`, `inspect_code`
150
- - **Fix** (7): `modify_config`, `add_callback`, `replace_optimizer`, `patch_data_loader`, `fix_model_mode`, `fix_code`, `rollback_checkpoint`
151
- - **Terminal** (2): `restart_run`, `mark_diagnosed`
152
-
153
- Dynamic availability: `restart_run` requires `fix_action_taken`, `fix_code` requires `code_inspected`, `mark_diagnosed` disappears after submission. See spec Section 10 for complete action definitions and required fields.
154
-
155
- ### 4.4 Diagnosis Enum (RootCauseDiagnosis)
156
-
157
- Closed set of 6 values. Grader is a single equality check — no fuzzy matching.
158
-
159
- | Value | Description |
160
- |---|---|
161
- | `lr_too_high` | Learning rate too large for the architecture |
162
- | `vanishing_gradients` | LR too low or architecture too deep, gradients decay to near-zero |
163
- | `data_leakage` | Validation samples appearing in training batches |
164
- | `overfitting` | Model memorizing training data, failing to generalize |
165
- | `batchnorm_eval_mode` | Model left in eval mode, BatchNorm using running statistics |
166
- | `code_bug` | Bug in the PyTorch training code (Task 6 — always this, regardless of bug variant) |
167
-
168
- ### 4.5 Reward Function Summary
169
-
170
- Per-step signal. **Separate from the grader** (see 4.6). Range: [-1.0, 1.0] hard cap.
171
-
172
- | Event | Reward | Gate Condition |
173
- |---|---|---|
174
- | Any step taken | -0.01 | Unconditional, flat constant (never multiplied by step_count) |
175
- | First-time inspection (per type) | +0.05 | Not previously inspected for that type |
176
- | `add_callback` after normal gradients | -0.20 | `gradients_inspected == True AND gradients_were_normal == True` |
177
- | Invalid action | -0.05 | Action not in current `available_actions` |
178
- | Wrong code fix | -0.10 | `fix_code` with incorrect line or replacement |
179
- | Correct diagnosis | +0.50 | `diagnosis == true_root_cause` |
180
- | Wrong diagnosis | -0.30 | `diagnosis != true_root_cause` |
181
- | Convergence after fix+restart | +0.40 | `fix_action_taken AND restart_after_fix AND convergence_confirmed` |
182
-
183
- See spec Section 12 for full design rationale.
184
-
185
- ### 4.6 Grader Function
186
-
187
- Returns a single normalized 0.0-1.0 score at episode end. Evaluates `EpisodeState` holistically — checks which key actions were taken, whether the correct fix was applied, whether the diagnosis is correct, and efficiency. **Not a sum of step rewards.** One grader function per task. All graders are deterministic.
188
-
189
- Exposed via `POST /grader`. Returns score for the most recently completed episode.
190
-
191
- ### 4.7 The Six Tasks
192
-
193
- | Task | ID | Difficulty | Root Cause | Key Signal | Heuristic Score |
194
- |---|---|---|---|---|---|
195
- | Exploding Gradients | `task_001` | Easy | `lr_too_high` | All layers `is_exploding: True`, NaN in error_log | ~0.85 |
196
- | Vanishing Gradients | `task_002` | Easy | `vanishing_gradients` | Deeper layers `is_vanishing: True`, flat loss | ~0.80 |
197
- | Silent Data Leakage | `task_003` | Medium | `data_leakage` | High val accuracy from epoch 1, `class_overlap_score` 0.68-0.88 | ~0.70 |
198
- | Overfitting | `task_004` | Medium | `overfitting` | Train-val divergence: train loss→0.01 while val loss climbs | ~0.65 |
199
- | BatchNorm Eval Mode | `task_005` | Hard | `batchnorm_eval_mode` | Slow val degradation + compound red herrings | ~0.45 |
200
- | PyTorch Code Bug | `task_006` | Hard | `code_bug` (always) | Anomalous metrics, root cause only visible in code | ~0.30 |
201
-
202
- **MVP tasks:** 1, 3, 5 (satisfies the 3-task minimum with easy→medium→hard range).
203
-
204
- See spec Section 11 for complete task specifications including fault parameters, red herrings, solution paths, and grader breakdowns.
205
-
206
- ### 4.8 Baseline Agents
207
-
208
- **Rule-based baseline (submission default, `baseline_heuristic.py`):**
209
- - Deterministic decision tree: inspect_gradients → check exploding/vanishing → inspect_data → check leakage → check overfitting → inspect_model_modes → inspect_code → fallback
210
- - No API key required. Bit-exact reproducible.
211
- - Used for Phase 1 auto-validation reproducibility checks.
212
-
213
- **LLM baseline (optional, `baseline_inference.py`):**
214
- - GPT-4o at temperature=0.0, seed=42
215
- - Requires `OPENAI_API_KEY` environment variable
216
- - Supplementary demonstration of heuristic vs. reasoning score gap
217
- - Not used for Phase 1 reproducibility — scores reported only after empirical measurement
218
-
219
- ### 4.9 Required Endpoints
220
-
221
- | Endpoint | Method | Required By | Response |
222
- |---|---|---|---|
223
- | `/ws` | WebSocket | OpenEnv framework | Handles `reset`, `step`, `state` messages |
224
- | `/tasks` | GET | Hackathon | Task list with IDs, difficulties, MLTrainingAction JSON schema |
225
- | `/grader` | POST | Hackathon | `{"score": float, "task_id": str, "steps": int}` for last completed episode |
226
- | `/baseline` | POST | Hackathon | Triggers baseline run, returns `{"scores": {"task_001": float, ...}}` |
227
- | `/health` | GET | Hackathon | `{"status": "ready", "tasks": N}` — N is active task count |
228
- | `/dashboard` | GET | Bonus | Live diagnostic dashboard (HTML/JS, Plotly.js via CDN) |
229
- | `/validation-report` | GET | Bonus | Pre-computed PyTorch fidelity reports |
230
-
231
- Framework auto-provides: `POST /reset`, `POST /step`, `GET /state`, `GET /schema`, `GET /docs`, `/mcp`.
232
-
233
- ### 4.10 Error Handling
234
-
235
- `step()` must never raise an unhandled exception. All invalid actions return a valid observation with -0.05 penalty and an error note. See spec Section 16 for the complete error handling matrix covering all edge cases (invalid actions, malformed JSON, step before reset, etc.).
236
-
237
- ---
238
-
239
- ## 5. Non-Functional Requirements
240
-
241
- ### 5.1 OpenEnv Spec Compliance
242
-
243
- | Requirement | Implementation |
244
- |---|---|
245
- | `openenv.yaml` present | Name, version, description, framework, tags, observation/action space, tasks with IDs+difficulties+max_steps, reward config, endpoints |
246
- | Typed Pydantic models | `MLTrainingAction` extends `Action`, `MLTrainingObservation` extends `Observation`, all fields explicitly typed |
247
- | `step()`/`reset()`/`state()` | Implemented in `MLTrainingEnvironment` extending `Environment` from `openenv.core.env_server.interfaces` |
248
- | `openenv validate` passes | Tested before every submission |
249
-
250
- ### 5.2 Framework Integration
251
-
252
- | Requirement | Implementation |
253
- |---|---|
254
- | `openenv-core` v0.2.2 | `create_app()` returns standard FastAPI instance — **verified** |
255
- | Custom routes compose | `/tasks`, `/grader`, `/baseline`, `/health` added via `@app.get()`/`@app.post()` on the returned FastAPI app |
256
- | Framework-provided routes | `/reset`, `/step`, `/state`, `/ws`, `/schema`, `/docs`, `/mcp` — do not reimplement |
257
- | Factory pattern | `create_app(MLTrainingEnvironment, ...)` takes the class, not an instance |
258
- | Concurrent sessions | `SUPPORTS_CONCURRENT_SESSIONS = True`, session state keyed by session ID |
259
- | Typed client | `client.py` extends `EnvClient` with typed action/observation — used by baseline scripts |
260
-
261
- ### 5.3 Docker & Deployment
262
-
263
- | Requirement | Target |
264
- |---|---|
265
- | Base image | `python:3.12-slim` |
266
- | PyTorch | CPU-only wheel (`--index-url https://download.pytorch.org/whl/cpu`), ~150MB |
267
- | Total image size | <500MB |
268
- | Build time | <5 min (no real training during build; validation reports pre-computed) |
269
- | HF Spaces | Tagged with `openenv`, port 7860 |
270
- | Health check | `/health` returns `{"status": "ready", "tasks": N}` within 60s of container start |
271
-
272
- ### 5.4 Reproducibility
273
-
274
- | Requirement | Implementation |
275
- |---|---|
276
- | Deterministic episodes | `torch.manual_seed(seed)` at every `reset()`, seed derived deterministically from task ID |
277
- | Baseline bit-exact | Rule-based baseline produces identical scores on two consecutive runs |
278
- | Exploit resistance | Parameters randomized per `reset()` from defined ranges; opaque task IDs |
279
- | Grader determinism | Same `EpisodeState` always produces same score |
280
-
281
- ### 5.5 Performance
282
-
283
- | Requirement | Target |
284
- |---|---|
285
- | `reset()` latency | <200ms (model instantiation + 2 forward passes + parametric curves) |
286
- | `step()` latency | <10ms (action dispatch + reward + state update) |
287
- | Memory | <512MB RSS (small CNN ~50K params, no GPU, no large datasets) |
288
-
289
- ### 5.6 Code Quality
290
-
291
- | Requirement | Standard |
292
- |---|---|
293
- | Formatting | black (line length 88) |
294
- | Linting | ruff |
295
- | Import ordering | isort (profile=black) |
296
- | Type hints | Every function signature and return type |
297
- | Tests | pytest, >80% coverage, every module has corresponding test file |
298
- | PyTorch-native | All core computation uses `torch.Tensor`, zero numpy in core modules |
299
-
300
- ---
301
-
302
- ## 6. Prioritized Scope
303
-
304
- ### Tier 1: MVP (Must Ship First)
305
-
306
- **Deadline within deadline:** Deploy to HF Spaces by Day 6 (April 2). Everything after is additive.
307
-
308
- | Deliverable | Description | DQ Risk if Missing |
309
- |---|---|---|
310
- | Task 1 (`task_001`) | Exploding gradients — easy | Yes (need 3+ tasks) |
311
- | Task 3 (`task_003`) | Silent data leakage — medium | Yes (need 3+ tasks) |
312
- | Task 5 (`task_005`) | BatchNorm eval mode — hard | Yes (need easy→hard range) |
313
- | Context-gated penalty | -0.20 for `add_callback` after `gradients_were_normal` | No (but kills differentiation) |
314
- | Rule-based baseline | `baseline_heuristic.py`, deterministic, no API key | Yes (baseline required) |
315
- | Reward engine | All 7 reward components implemented exactly | Yes (reward logic required) |
316
- | Graders (3) | One per MVP task, 0.0-1.0, deterministic | Yes (graders required) |
317
- | `openenv.yaml` | Full metadata, 3+ tasks listed | Yes (spec compliance) |
318
- | Required endpoints | `/tasks`, `/grader`, `/baseline`, `/health` | Yes (auto-validator checks) |
319
- | Dockerfile | Builds and runs, port 7860 | Yes (auto-validator checks) |
320
- | HF Space | Deployed, tagged `openenv`, responds to `reset()` | Yes (auto-validator pings) |
321
- | README | Environment description, action/observation spaces, task descriptions, setup instructions, baseline scores | Yes (submission requirement) |
322
-
323
- ### Tier 2: Strongest Differentiator (Add Immediately After MVP)
324
-
325
- | Deliverable | Description | Why This Order |
326
- |---|---|---|
327
- | Task 6 (`task_006`) | PyTorch code bug — hard, code-level debugging | Single highest-impact feature for Meta judges |
328
- | Code fix validation | Multi-strategy pipeline (tokenize, AST, semantic patterns) | Required for Task 6 to work with LLM agents |
329
- | Grader for Task 6 | `code_bug` diagnosis, code fix scoring | Completes Task 6 |
330
-
331
- ### Tier 3: Full Task Coverage (Time Permitting)
332
-
333
- | Deliverable | Description |
334
- |---|---|
335
- | Task 2 (`task_002`) | Vanishing gradients — easy (similar to Task 1, fast to implement) |
336
- | Task 4 (`task_004`) | Overfitting — medium (train-val divergence, regularization fix) |
337
- | Graders for Tasks 2 & 4 | Same pattern as existing graders |
338
-
339
- ### Tier 4: Polish & Extras (Only After Tiers 1-3 Complete)
340
-
341
- | Deliverable | Description | Priority Within Tier |
342
- |---|---|---|
343
- | Live dashboard | HTML/JS at `/dashboard`, Plotly.js via CDN, 4-panel layout | 1st — transforms judging experience |
344
- | PyTorch validation suite | 6 scripts proving parametric curves match real training, R² > 0.85 | 2nd — answers "how realistic?" |
345
- | Validation report endpoint | `GET /validation-report` serving pre-computed fidelity plots | With validation suite |
346
- | LLM baseline | `baseline_inference.py`, GPT-4o, measures heuristic-LLM gap | 3rd — supplementary demonstration |
347
-
348
- ### Implementation Timeline (11 days: March 28 - April 8)
349
-
350
- | Days | Focus | Exit Criteria |
351
- |---|---|---|
352
- | 1-2 | Skeleton server + Task 1 end-to-end | `reset()` → `step()` → `grader` works for one task, Docker builds |
353
- | 3-5 | Tasks 3 & 5 + reward engine + baseline | All 3 MVP tasks pass grader, `baseline_heuristic.py` reproduces |
354
- | 6 | **Deploy MVP to HF Spaces** | Auto-validation passes. This is the insurance policy. |
355
- | 7-8 | Task 6 (code debugging) | Code fix validation works for all 4 bug variants |
356
- | 9-10 | Tasks 2 & 4 + dashboard | Full 6-task environment, dashboard shows agent behavior |
357
- | 11 | Polish, README, final smoke test | Submission-ready |
358
-
359
- ### What We Will NOT Build (Explicit Exclusions)
360
-
361
- - No game or toy environments
362
- - No numpy in core modules (torch.Tensor only)
363
- - No free-text diagnosis (closed enum only)
364
- - No grader that sums step rewards (holistic evaluation only)
365
- - No cumulative step penalty (flat -0.01 only, never -0.01 * step_count)
366
- - No accommodation support or non-RL features
367
- - No multi-GPU or CUDA dependencies (CPU-only PyTorch)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
docs/PROJECT_GUIDE.md DELETED
@@ -1,691 +0,0 @@
1
- # PyTorch Training Run Debugger — Complete Project Guide
2
-
3
- ## What Is This?
4
-
5
- A game where an AI agent plays detective to fix broken PyTorch training runs. The agent sees a failing training run, investigates clues (gradients, data, code), applies a fix, and submits a diagnosis. Built as an [OpenEnv](https://github.com/openenv) RL environment for the **Meta PyTorch OpenEnv Hackathon**.
6
-
7
- ---
8
-
9
- ## How a Game Works
10
-
11
- ```
12
- 1. Agent receives a broken training run (loss curves, config, error log)
13
- 2. Agent investigates (inspect gradients, data, weights, model modes, code)
14
- 3. Agent applies a fix (reduce LR, patch data, fix code, etc.)
15
- 4. Agent restarts training and confirms recovery
16
- 5. Agent submits diagnosis ("the problem was lr_too_high")
17
- 6. Grader scores the agent 0.0 to 1.0
18
- ```
19
-
20
- ---
21
-
22
- ## The 7 Tasks
23
-
24
- | Task | Problem | Difficulty | Root Cause | Key Clue |
25
- |------|---------|-----------|------------|----------|
26
- | `task_001` | Gradients explode | Easy | `lr_too_high` | All layers `is_exploding: true` |
27
- | `task_002` | Gradients vanish | Easy | `vanishing_gradients` | Deep layers `is_vanishing: true` |
28
- | `task_003` | Test data leaked into training | Medium | `data_leakage` | `class_overlap_score > 0.5` |
29
- | `task_004` | Model memorizes, doesn't learn | Medium | `overfitting` | Train loss drops, val loss rises |
30
- | `task_005` | BatchNorm stuck in eval mode | Hard | `batchnorm_eval_mode` | Model modes show "eval" + red herrings |
31
- | `task_006` | Bug in Python training code | Hard | `code_bug` | Bug visible in code snippet |
32
- | `task_007` | LR scheduler decays too fast | Medium-Hard | `scheduler_misconfigured` | Early progress then stagnation |
33
-
34
- ---
35
-
36
- ## Reward System
37
-
38
- Every action earns or costs points (capped at -1.0 to 1.0):
39
-
40
- | Event | Reward | When |
41
- |-------|--------|------|
42
- | Any step taken | **-0.01** | Always (encourages efficiency) |
43
- | First-time inspection | **+0.05** | Once per inspection type |
44
- | Correct diagnosis | **+0.50** | Diagnosis matches root cause |
45
- | Wrong diagnosis | **-0.30** | Diagnosis doesn't match |
46
- | Fix works + training recovers | **+0.40** | After fix + restart + convergence |
47
- | Invalid action | **-0.05** | Action not available |
48
- | Wrong code fix | **-0.10** | `fix_code` with wrong line/replacement |
49
- | **Context-gated penalty** | **-0.20** | Inspected gradients, saw they're normal, then added gradient clipping anyway |
50
-
51
- ### The Context-Gated Penalty (Core Innovation)
52
-
53
- - Agent checks gradients -> finds them **normal** -> adds gradient clipping = **-0.20 penalty** (ignoring evidence)
54
- - Agent adds gradient clipping **before** checking gradients = **no penalty** (reasonable prior)
55
-
56
- This teaches: *don't ignore what you've already learned*.
57
-
58
- ---
59
-
60
- ## Architecture
61
-
62
- ```
63
- ml_training_debugger/ # Core logic
64
- models.py # All data types (Pydantic)
65
- scenarios.py # Creates the 7 tasks with random params
66
- pytorch_engine.py # Real PyTorch model + fault injection
67
- simulation.py # Loss/accuracy curve generation
68
- reward_engine.py # Per-step reward calculation
69
- graders.py # Final 0.0-1.0 scoring per task
70
- code_templates.py # Buggy code for Task 6
71
- client.py # Client for connecting to the environment
72
-
73
- server/ # Web server
74
- app.py # FastAPI + all endpoints
75
- environment.py # Game logic (reset, step, state)
76
-
77
- tests/ # 183 tests, 97% coverage
78
- baseline_heuristic.py # Rule-based agent (deterministic)
79
- baseline_inference.py # LLM agent (Llama/GPT-4o)
80
- ```
81
-
82
- ---
83
-
84
- ## API Endpoints
85
-
86
- ### GET /health
87
-
88
- Server status check.
89
-
90
- **Response:**
91
- ```json
92
- {
93
- "status": "ready",
94
- "tasks": 7
95
- }
96
- ```
97
-
98
- ---
99
-
100
- ### GET /tasks
101
-
102
- List all available tasks with action schema.
103
-
104
- **Response:**
105
- ```json
106
- [
107
- {
108
- "id": "task_001",
109
- "difficulty": "easy",
110
- "max_steps": 20,
111
- "action_schema": {
112
- "title": "MLTrainingAction",
113
- "type": "object",
114
- "properties": {
115
- "action_type": { "type": "string" },
116
- "target": { "type": ["string", "null"] },
117
- "value": { "type": ["number", "integer", "string", "null"] },
118
- "diagnosis": { "type": ["string", "null"] },
119
- "line": { "type": ["integer", "null"] },
120
- "replacement": { "type": ["string", "null"] }
121
- },
122
- "required": ["action_type"]
123
- }
124
- }
125
- ]
126
- ```
127
-
128
- ---
129
-
130
- ### POST /baseline
131
-
132
- Run the heuristic baseline agent on all 7 tasks.
133
-
134
- **Response:**
135
- ```json
136
- {
137
- "scores": {
138
- "task_001": 1.00,
139
- "task_002": 1.00,
140
- "task_003": 1.00,
141
- "task_004": 0.45,
142
- "task_005": 0.35,
143
- "task_006": 1.00,
144
- "task_007": 1.00
145
- }
146
- }
147
- ```
148
-
149
- Returns `409` if baseline is already running.
150
-
151
- ---
152
-
153
- ### POST /grader
154
-
155
- Get the grader score for the last completed episode.
156
-
157
- **Query params:** `session_id` (optional)
158
-
159
- **Response:**
160
- ```json
161
- {
162
- "score": 0.85,
163
- "task_id": "task_001",
164
- "steps": 5
165
- }
166
- ```
167
-
168
- If no episode completed:
169
- ```json
170
- {
171
- "score": null,
172
- "error": "no_completed_episode"
173
- }
174
- ```
175
-
176
- ---
177
-
178
- ### GET /dashboard
179
-
180
- Live diagnostic dashboard (HTML page with Plotly.js charts). Open in a browser.
181
-
182
- **Panels:**
183
- 1. Training metrics (loss/accuracy curves)
184
- 2. Gradient & weight heatmap
185
- 3. Action timeline with rewards
186
- 4. Episode summary with state flags
187
-
188
- ---
189
-
190
- ### GET /validation-report
191
-
192
- Pre-computed fidelity report comparing parametric curves to real PyTorch training runs.
193
-
194
- ---
195
-
196
- ### GET /curriculum
197
-
198
- Recommended task order for progressive training: tasks run from easy to hard, and each task appears at 3 difficulty levels (21 episodes in total for the 7 tasks).
199
-
200
- **Response:**
201
- ```json
202
- {
203
- "curriculum": [
204
- { "task_id": "task_001", "difficulty": "easy", "difficulty_level": 1, "max_steps": 20 },
205
- { "task_id": "task_001", "difficulty": "easy", "difficulty_level": 3, "max_steps": 20 },
206
- { "task_id": "task_001", "difficulty": "easy", "difficulty_level": 5, "max_steps": 20 }
207
- ],
208
- "total_episodes": 21
209
- }
210
- ```
211
-
212
- ---
213
-
214
- ### GET /leaderboard
215
-
216
- Sorted episode scores from baseline runs.
217
-
218
- **Response:**
219
- ```json
220
- {
221
- "entries": [
222
- { "score": 1.00, "task_id": "task_001", "steps": 5, "episode_id": "baseline_task_001" }
223
- ],
224
- "total": 7
225
- }
226
- ```
227
-
228
- ---
229
-
230
- ### GET /replay/{episode_id}
231
-
232
- Full action/observation trace for a completed episode.
233
-
234
- **Response:**
235
- ```json
236
- {
237
- "episode_id": "baseline_task_001",
238
- "score": 1.00,
239
- "task_id": "task_001",
240
- "steps": 5
241
- }
242
- ```
243
-
244
- ---
245
-
246
- ## WebSocket Interface (Primary Agent Interface)
247
-
248
- **Endpoint:** `ws://localhost:7860/ws`
249
-
250
- This is the main way agents interact with the environment. HTTP endpoints are stateless — WebSocket maintains session state across a full episode.
251
-
252
- ### Reset (Start New Episode)
253
-
254
- **Send:**
255
- ```json
256
- {
257
- "type": "reset",
258
- "seed": 42,
259
- "kwargs": {
260
- "task_id": "task_003",
261
- "difficulty_level": 3
262
- }
263
- }
264
- ```
265
-
266
- Without `kwargs`, defaults to `task_001`.
267
-
268
- **Receive:**
269
- ```json
270
- {
271
- "type": "observation",
272
- "observation": {
273
- "run_id": "ep_12345",
274
- "framework": "pytorch",
275
- "epoch": 20,
276
- "training_loss_history": [2.3, 2.1, 1.9, ...],
277
- "val_loss_history": [2.4, 2.2, 2.0, ...],
278
- "val_accuracy_history": [0.3, 0.35, 0.4, ...],
279
- "gradient_stats": [],
280
- "model_weight_stats": null,
281
- "data_batch_stats": null,
282
- "model_mode_info": null,
283
- "code_snippet": null,
284
- "current_config": {
285
- "learning_rate": 0.001,
286
- "weight_decay": 0.0001,
287
- "batch_size": 64,
288
- "hidden_dim": 64,
289
- "num_layers": 3,
290
- "optimizer": "adam",
291
- "dropout_rate": 0.0,
292
- "gradient_clip_norm": null
293
- },
294
- "error_log": null,
295
- "gpu_memory_used_gb": 6.2,
296
- "gpu_memory_total_gb": 16.0,
297
- "available_actions": [
298
- "inspect_gradients",
299
- "inspect_data_batch",
300
- "inspect_model_modes",
301
- "inspect_model_weights",
302
- "inspect_code",
303
- "modify_config",
304
- "add_callback",
305
- "replace_optimizer",
306
- "patch_data_loader",
307
- "fix_model_mode",
308
- "mark_diagnosed"
309
- ],
310
- "episode_state": {
311
- "step_count": 0,
312
- "gradients_inspected": false,
313
- "gradients_were_normal": false,
314
- "data_inspected": false,
315
- "model_modes_inspected": false,
316
- "model_weights_inspected": false,
317
- "code_inspected": false,
318
- "fix_action_taken": false,
319
- "restart_after_fix": false,
320
- "diagnosis_submitted": false,
321
- "actions_taken": []
322
- },
323
- "notes": null,
324
- "done": false,
325
- "reward": null,
326
- "metadata": {}
327
- }
328
- }
329
- ```
330
-
331
- ### Step (Take an Action)
332
-
333
- **Investigation actions** (no extra fields needed):
334
- ```json
335
- {"type": "step", "action": {"action_type": "inspect_gradients"}}
336
- {"type": "step", "action": {"action_type": "inspect_data_batch"}}
337
- {"type": "step", "action": {"action_type": "inspect_model_modes"}}
338
- {"type": "step", "action": {"action_type": "inspect_model_weights"}}
339
- {"type": "step", "action": {"action_type": "inspect_code"}}
340
- ```
341
-
342
- **Fix actions:**
343
- ```json
344
- {"type": "step", "action": {"action_type": "modify_config", "target": "learning_rate", "value": 0.001}}
345
- {"type": "step", "action": {"action_type": "add_callback"}}
346
- {"type": "step", "action": {"action_type": "replace_optimizer"}}
347
- {"type": "step", "action": {"action_type": "patch_data_loader"}}
348
- {"type": "step", "action": {"action_type": "fix_model_mode"}}
349
- {"type": "step", "action": {"action_type": "fix_code", "line": 5, "replacement": "model.train()"}}
350
- ```
351
-
352
- **Restart and diagnosis actions** (`restart_run` continues the episode; `mark_diagnosed` ends it):
353
- ```json
354
- {"type": "step", "action": {"action_type": "restart_run"}}
355
- {"type": "step", "action": {"action_type": "mark_diagnosed", "diagnosis": "lr_too_high"}}
356
- ```
357
-
358
- **Receive (after each step):**
359
- ```json
360
- {
361
- "type": "observation",
362
- "observation": {
363
- "...same structure as reset response...",
364
- "gradient_stats": [
365
- {
366
- "layer_name": "conv1",
367
- "norm_history": [0.5, 0.6, 0.7],
368
- "mean_norm": 51.1,
369
- "max_norm": 98.3,
370
- "is_exploding": true,
371
- "is_vanishing": false
372
- }
373
- ],
374
- "episode_state": {
375
- "step_count": 1,
376
- "gradients_inspected": true,
377
- "actions_taken": ["inspect_gradients"]
378
- },
379
- "done": false,
380
- "reward": 0.04
381
- }
382
- }
383
- ```
384
-
385
- When `done: true`, the episode is over.
386
-
387
- ---
388
-
389
- ## All 14 Action Types
390
-
391
- | Action | Required Fields | Description |
392
- |--------|----------------|-------------|
393
- | `inspect_gradients` | none | View per-layer gradient stats |
394
- | `inspect_data_batch` | none | View data batch statistics |
395
- | `inspect_model_modes` | none | View train/eval mode per layer |
396
- | `inspect_model_weights` | none | View per-layer weight stats |
397
- | `inspect_code` | none | View source code (Task 6) |
398
- | `modify_config` | `target`, `value` | Change a hyperparameter |
399
- | `add_callback` | none | Add gradient clipping callback |
400
- | `replace_optimizer` | none | Switch optimizer |
401
- | `patch_data_loader` | none | Fix data pipeline |
402
- | `fix_model_mode` | none | Switch model to train mode |
403
- | `fix_code` | `line`, `replacement` | Fix a line of code |
404
- | `restart_run` | none | Restart training (requires fix first) |
405
- | `mark_diagnosed` | `diagnosis` | Submit final diagnosis |
406
- | `rollback_checkpoint` | none | Rollback to checkpoint |
407
-
408
- ### Valid `target` values for modify_config
409
- `learning_rate`, `weight_decay`, `batch_size`, `hidden_dim`, `num_layers`, `optimizer`, `dropout_rate`, `gradient_clip_norm`
410
-
411
- ### Valid `diagnosis` values for mark_diagnosed
412
- `lr_too_high`, `vanishing_gradients`, `data_leakage`, `overfitting`, `batchnorm_eval_mode`, `code_bug`, `scheduler_misconfigured`
413
-
414
- ---
415
-
416
- ## Dynamic Action Availability
417
-
418
- Actions appear/disappear based on episode state:
419
-
420
- | Action | Available When |
421
- |--------|---------------|
422
- | `fix_code` | Only after `inspect_code` (code_inspected = true) |
423
- | `restart_run` | Only after a fix action (fix_action_taken = true) |
424
- | `rollback_checkpoint` | Only after restart (restart_after_fix = true) |
425
- | `mark_diagnosed` | Only while diagnosis_submitted = false |
426
-
427
- ---
428
-
429
- ## Observation Fields — Progressive Reveal
430
-
431
- On reset, the agent sees loss curves, config, and error log. Everything else is empty (`null`, or `[]` for `gradient_stats`) until inspected:
432
-
433
- | Field | Starts As | Populated After |
434
- |-------|-----------|----------------|
435
- | `training_loss_history` | 20 floats | Always visible |
436
- | `val_accuracy_history` | 20 floats | Always visible |
437
- | `val_loss_history` | 20 floats | Always visible |
438
- | `current_config` | Full config | Always visible |
439
- | `error_log` | String or null | Always visible |
440
- | `gradient_stats` | `[]` | `inspect_gradients` |
441
- | `model_weight_stats` | `null` | `inspect_model_weights` |
442
- | `data_batch_stats` | `null` | `inspect_data_batch` |
443
- | `model_mode_info` | `null` | `inspect_model_modes` |
444
- | `code_snippet` | `null` | `inspect_code` |
445
-
446
- ---
447
-
448
- ## Data Types
449
-
450
- ### GradientStats (per layer)
451
- ```json
452
- {
453
- "layer_name": "conv1",
454
- "norm_history": [0.5, 0.6, 0.7],
455
- "mean_norm": 12.5,
456
- "max_norm": 25.3,
457
- "is_exploding": true,
458
- "is_vanishing": false
459
- }
460
- ```
461
- - Exploding: `mean_norm > 10.0`
462
- - Vanishing: `mean_norm < 0.000001`
463
-
464
- ### ModelWeightStats (per layer)
465
- ```json
466
- {
467
- "layer_name": "conv1",
468
- "weight_norm": 1.234,
469
- "weight_mean": 0.001,
470
- "weight_std": 0.05,
471
- "weight_min": -0.15,
472
- "weight_max": 0.16,
473
- "dead_neuron_pct": 0.0,
474
- "has_nan": false,
475
- "has_inf": false
476
- }
477
- ```
478
-
479
- ### DataBatchStats
480
- ```json
481
- {
482
- "label_distribution": {"0": 0.25, "1": 0.25, "2": 0.25, "3": 0.25},
483
- "feature_mean": 0.5,
484
- "feature_std": 0.2,
485
- "null_count": 0,
486
- "class_overlap_score": 0.15,
487
- "batch_size": 64,
488
- "duplicate_ratio": 0.0,
489
- "confusion_matrix": [[10, 2, 1], [1, 9, 3], [2, 1, 11]]
490
- }
491
- ```
492
-
493
- ### CodeSnippet (Task 6 only)
494
- ```json
495
- {
496
- "code": "import torch\nimport torch.nn as nn\n...",
497
- "filename": "train.py",
498
- "line_count": 50,
499
- "imports": ["torch", "torch.nn", "torch.optim"],
500
- "hint": "Look for .detach() preventing gradient flow"
501
- }
502
- ```
503
-
504
- ### EpisodeState
505
- ```json
506
- {
507
- "step_count": 0,
508
- "gradients_inspected": false,
509
- "gradients_were_normal": false,
510
- "data_inspected": false,
511
- "model_modes_inspected": false,
512
- "model_weights_inspected": false,
513
- "code_inspected": false,
514
- "fix_action_taken": false,
515
- "restart_after_fix": false,
516
- "diagnosis_submitted": false,
517
- "actions_taken": []
518
- }
519
- ```
520
-
521
- ---
522
-
523
- ## Grading Breakdown (per task)
524
-
525
- Each task has its own grader that scores 0.0 to 1.0 based on what the agent did:
526
-
527
- ### Task 1 — Exploding Gradients
528
- | Component | Points |
529
- |-----------|--------|
530
- | Inspected gradients | +0.05 |
531
- | Applied config fix | +0.20 |
532
- | Restarted training | +0.35 |
533
- | Correct diagnosis (`lr_too_high`) | +0.40 |
534
-
535
- ### Task 2 — Vanishing Gradients
536
- | Component | Points |
537
- |-----------|--------|
538
- | Inspected gradients | +0.05 |
539
- | Applied config fix | +0.20 |
540
- | Restarted training | +0.35 |
541
- | Correct diagnosis (`vanishing_gradients`) | +0.40 |
542
-
543
- ### Task 3 — Data Leakage
544
- | Component | Points |
545
- |-----------|--------|
546
- | Inspected data | +0.05 |
547
- | Patched data loader | +0.30 |
548
- | Restarted training | +0.30 |
549
- | Correct diagnosis (`data_leakage`) | +0.35 |
550
-
551
- ### Task 4 — Overfitting
552
- | Component | Points |
553
- |-----------|--------|
554
- | Inspected data | +0.05 |
555
- | Applied fix (config or callback) | +0.25 |
556
- | Restarted training | +0.30 |
557
- | Correct diagnosis (`overfitting`) | +0.40 |
558
-
559
- ### Task 5 — BatchNorm Eval Mode (with red herrings)
560
- | Component | Points |
561
- |-----------|--------|
562
- | Inspected gradients | +0.05 |
563
- | Inspected model modes | +0.05 |
564
- | **Fell for red herring** (add_callback after normal gradients) | **-0.20** |
565
- | Fixed model mode | +0.25 |
566
- | Restarted training | +0.30 |
567
- | Correct diagnosis (`batchnorm_eval_mode`) | +0.40 |
568
-
569
- ### Task 6 — Code Bug
570
- | Component | Points |
571
- |-----------|--------|
572
- | Inspected code | +0.05 |
573
- | Fixed code correctly | +0.30 |
574
- | Restarted training | +0.25 |
575
- | Correct diagnosis (`code_bug`) | +0.40 |
576
-
577
- ### Task 7 — Scheduler Misconfigured
578
- | Component | Points |
579
- |-----------|--------|
580
- | Inspected gradients | +0.05 |
581
- | Inspected data | +0.05 |
582
- | Applied config fix | +0.25 |
583
- | Restarted training | +0.25 |
584
- | Correct diagnosis (`scheduler_misconfigured`) | +0.40 |
585
-
586
- ---
587
-
588
- ## Baseline Scores
589
-
590
- | Task | Heuristic | Llama 3.3 70B | Llama 3.1 8B |
591
- |------|-----------|---------------|--------------|
592
- | task_001 | **1.00** | 1.00 | 0.60 |
593
- | task_002 | **1.00** | 1.00 | 0.05 |
594
- | task_003 | **1.00** | 0.40 | 0.40 |
595
- | task_004 | 0.45 | 0.45 | **0.60** |
596
- | task_005 | **1.00** | 1.00 | 1.00 |
597
- | task_006 | **1.00** | — | 0.60-1.00 |
598
- | task_007 | **1.00** | — | 0.60 |
599
- | **Average** | **0.92** | ~0.69 | 0.55 |
600
-
601
- ---
602
-
603
- ## Walkthrough: Solving Task 1 (Exploding Gradients)
604
-
605
- ```
606
- Step 1: Reset
607
- Send: {"type": "reset", "kwargs": {"task_id": "task_001"}}
608
- See: Loss history going to infinity, error_log says "NaN at epoch 12"
609
-
610
- Step 2: Inspect gradients
611
- Send: {"type": "step", "action": {"action_type": "inspect_gradients"}}
612
- See: All layers is_exploding: true, mean_norm > 10.0
613
- Reward: +0.04 (-0.01 step + 0.05 investigation)
614
-
615
- Step 3: Reduce learning rate
616
- Send: {"type": "step", "action": {"action_type": "modify_config", "target": "learning_rate", "value": 0.001}}
617
- Reward: -0.01 (step penalty)
618
-
619
- Step 4: Restart training
620
- Send: {"type": "step", "action": {"action_type": "restart_run"}}
621
- See: Convergence detected!
622
- Reward: +0.39 (-0.01 step + 0.40 convergence)
623
-
624
- Step 5: Submit diagnosis
625
- Send: {"type": "step", "action": {"action_type": "mark_diagnosed", "diagnosis": "lr_too_high"}}
626
- See: done: true
627
- Reward: +0.49 (-0.01 step + 0.50 correct diagnosis)
628
-
629
- Grader score: 1.0 (perfect)
630
- ```
631
-
632
- ---
633
-
634
- ## Walkthrough: Task 5 Trap (Red Herring)
635
-
636
- ```
637
- Step 1: Reset task_005
638
- Step 2: Inspect gradients
639
- -> FC layer has a spike (mean_norm=4.2, but is_exploding: false)
640
- -> gradients_were_normal is set to TRUE (nothing actually exploding)
641
-
642
- Step 3 (BAD): Add gradient clipping
643
- -> Reward: -0.21 (-0.01 step - 0.20 context-gated penalty!)
644
- -> Agent IGNORED the evidence that gradients were normal
645
-
646
- Step 3 (GOOD): Inspect model modes instead
647
- -> Sees all layers in "eval" mode — that's the real problem!
648
-
649
- Step 4: Fix model mode
650
- Step 5: Restart training
651
- Step 6: Diagnose batchnorm_eval_mode -> correct!
652
- ```
653
-
654
- ---
655
-
656
- ## Quick Start
657
-
658
- ```bash
659
- # Setup
660
- python3 -m venv .venv && source .venv/bin/activate
661
- pip install torch --index-url https://download.pytorch.org/whl/cpu
662
- pip install -r requirements.txt
663
- pip install pytest pytest-cov
664
-
665
- # Run server
666
- uvicorn server.app:app --host 0.0.0.0 --port 7860
667
-
668
- # Test
669
- pytest tests/ -v --cov=ml_training_debugger
670
- curl http://localhost:7860/health
671
- curl http://localhost:7860/tasks | python3 -m json.tool
672
- curl -X POST http://localhost:7860/baseline | python3 -m json.tool
673
-
674
- # Docker
675
- docker build -t pytorch-debugger .
676
- docker run -p 7860:7860 pytorch-debugger
677
- ```
678
-
679
- ---
680
-
681
- ## Tech Stack
682
-
683
- | Component | Purpose |
684
- |-----------|---------|
685
- | Python 3.12 | Runtime |
686
- | PyTorch (CPU-only) | Real neural networks, real gradients |
687
- | FastAPI | Web server |
688
- | OpenEnv | RL environment framework (step/reset/state API) |
689
- | Pydantic v2 | Typed data models |
690
- | Plotly.js | Dashboard charts |
691
- | Docker | Containerized deployment |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
docs/ROADMAP.md DELETED
@@ -1,441 +0,0 @@
1
- # ROADMAP — PyTorch Training Run Debugger
2
-
3
- **Timeline:** March 28 - April 8, 2026 (11 days)
4
- **Runtime:** Python 3.12 · PyTorch CPU-only · openenv-core v0.2.2
5
- **Governing documents:** `ml-training-debugger-spec.md` (source of truth), `PRD.md` (requirements), `CLAUDE.md` (coding rules)
6
- **Iron rule:** No phase begins until the previous phase's acceptance criteria are met. The single exception: Phase 0 and Phase 1 file creation can overlap on Day 1.
7
-
8
- ---
9
-
10
- ## Phase 0: Setup & Validation (Days 1-2)
11
-
12
- **Goal:** A running skeleton server that proves the toolchain works end-to-end. Zero business logic — just plumbing.
13
-
14
- ### 0.1 Files to Create
15
-
16
- | File | Purpose | Lines (est.) |
17
- |---|---|---|
18
- | `ML Debugger/` (this directory) | Project root directory (git init here) | — |
19
- | `pyproject.toml` | Project metadata, dependencies (torch CPU, openenv-core, pydantic>=2.0, fastapi, uvicorn, pytest, black, ruff, isort) | ~40 |
20
- | `requirements.txt` | Flat dependency list mirroring pyproject.toml (Docker uses this). **Exclude openai** — deferred to Phase 3. | ~10 |
21
- | `.python-version` | `3.12` | 1 |
22
- | `openenv.yaml` | Full metadata — start with 3 MVP tasks (task_001, task_003, task_005), expand later | ~50 |
23
- | `Dockerfile` | `python:3.12-slim`, torch CPU-only, openenv-core, app deps, port 7860 | ~15 |
24
- | `.dockerignore` | Exclude `.venv/`, `__pycache__/`, `.git/`, `validation/reports/*.png` | ~10 |
25
- | `.gitignore` | `.venv/`, `__pycache__/`, `*.pyc`, `.env`, `run*.json` | ~15 |
26
- | `ml_training_debugger/__init__.py` | Package init, version string | ~3 |
27
- | `ml_training_debugger/models.py` | **Stub only:** `RootCauseDiagnosis` enum, `EpisodeState`, `TrainingConfig`, `GradientStats`, `DataBatchStats`, `ModelWeightStats`, `CodeSnippet`, `MLTrainingObservation` (extends `Observation`), `MLTrainingAction` (extends `Action`). All fields typed, all values defaulted. | ~200 |
28
- | `ml_training_debugger/client.py` | **Stub:** `MLTrainingEnvClient` extending `EnvClient` with `action_type = MLTrainingAction` and `observation_type = MLTrainingObservation`. Used by baseline scripts. | ~20 |
29
- | `server/__init__.py` | Empty | 0 |
30
- | `server/environment.py` | **Stub:** `MLTrainingEnvironment(Environment)` with `reset()` returning a hardcoded observation, `step()` echoing back, `state` property | ~50 |
31
- | `server/app.py` | `create_app(MLTrainingEnvironment, MLTrainingAction, MLTrainingObservation)` + stub routes for `/tasks`, `/grader`, `/baseline`, `/health` | ~60 |
32
- | `tests/__init__.py` | Empty | 0 |
33
- | `tests/test_models.py` | Validate all Pydantic models instantiate, serialize to JSON, and round-trip | ~60 |
34
- | `tests/conftest.py` | Shared fixtures: sample `EpisodeState`, sample `ScenarioParams`, sample observation | ~40 |
35
-
36
- ### 0.2 Dependencies to Install
37
-
38
- ```bash
39
- # Create venv inside ML Debugger/ project root
40
- python3 -m venv .venv && source .venv/bin/activate
41
-
42
- # Core runtime
43
- pip install torch --index-url https://download.pytorch.org/whl/cpu
44
- pip install openenv-core "pydantic>=2.0" fastapi uvicorn
45
-
46
- # Dev tools
47
- pip install pytest pytest-cov pytest-asyncio black ruff isort httpx websockets
48
-
49
- # NOTE: openai is deferred to Phase 3 (LLM baseline). Do NOT install now.
50
- ```
51
-
52
- ### 0.3 Validation Steps (Must All Pass)
53
-
54
- | # | Command | Expected Result |
55
- |---|---|---|
56
- | 1 | `python -c "import torch; print(torch.__version__)"` | Version string, no CUDA |
57
- | 2 | `python -c "from openenv.core.env_server.http_server import create_app"` | No import error |
58
- | 3 | `python -c "from ml_training_debugger.models import MLTrainingAction, MLTrainingObservation"` | No import error |
59
- | 4 | `python -c "from ml_training_debugger.client import MLTrainingEnvClient"` | No import error |
60
- | 5 | `uvicorn server.app:app --host 0.0.0.0 --port 7860` | Server starts, no crash |
61
- | 6 | `curl http://localhost:7860/health` | `{"status": "ready", "tasks": 3}` |
62
- | 7 | `curl http://localhost:7860/tasks` | JSON with task list |
63
- | 8 | `curl http://localhost:7860/docs` | Swagger UI loads |
64
- | 9 | `pytest tests/test_models.py -v` | All pass |
65
- | 10 | `docker build -t pytorch-debugger .` | Builds in <5min, image <500MB |
66
- | 11 | `docker run -p 7860:7860 pytorch-debugger` then `curl /health` | Returns `{"status": "ready", "tasks": 3}` |
67
- | 12 | `openenv validate` | Passes (or identify what needs fixing) |
68
- | 13 | `black --check . && ruff check . && isort --check .` | Clean |
69
-
70
- ### 0.4 Acceptance Criteria
71
-
72
- - [ ] Skeleton server starts on port 7860 and responds to `/health`, `/tasks`, `/docs`, `/ws`
73
- - [ ] `/health` returns `{"status": "ready", "tasks": 3}` (task count matches active tasks)
74
- - [ ] All Pydantic models instantiate without error and serialize to valid JSON
75
- - [ ] `client.py` imports without error
76
- - [ ] Docker image builds under 500MB and container starts cleanly
77
- - [ ] `openenv validate` passes or all failures are documented with a fix plan
78
- - [ ] `pytest` runs with zero failures
79
- - [ ] Git repo initialized, first commit made
80
-
81
- ---
82
-
83
- ## Phase 1: MVP — Tasks 1, 3, 5 + Core Engine (Days 2-6)
84
-
85
- **Goal:** A fully functional 3-task environment that passes all auto-validation gates, deployed to HF Spaces. This is the survival milestone — everything after this is differentiation.
86
-
87
- ### 1.1 Files to Create
88
-
89
- | File | Purpose | Lines (est.) | Depends On |
90
- |---|---|---|---|
91
- | `ml_training_debugger/scenarios.py` | `ScenarioParams` dataclass, `sample_scenario(task_id, seed)` for tasks 001/003/005. Parameter ranges from spec Section 11. | ~120 | `models.py` |
92
- | `ml_training_debugger/pytorch_engine.py` | `SimpleCNN(torch.nn.Module)`, `inject_fault(model, scenario)`, `extract_gradient_stats(model)`, `extract_weight_stats(model)`. Real torch.autograd. | ~250 | `scenarios.py` |
93
- | `ml_training_debugger/simulation.py` | `gen_loss_history(scenario)`, `gen_val_accuracy_history(scenario)`, `gen_val_loss_history(scenario)`. All `torch.Tensor` ops. Parametric curves per spec Section 6. | ~180 | `scenarios.py` |
94
- | `ml_training_debugger/reward_engine.py` | `compute_reward(action, episode_state, scenario) -> float`. All 7 reward components per spec Section 12. Context-gated penalty logic. | ~100 | `models.py` |
95
- | `ml_training_debugger/graders.py` | `grade_task_001(state, scenario)`, `grade_task_003(...)`, `grade_task_005(...)`. Each returns float in [0.0, 1.0]. Per spec Section 11 grader breakdowns. | ~150 | `models.py` |
96
- | `baseline_heuristic.py` | Deterministic decision tree agent using `MLTrainingEnvClient`. Runs all MVP tasks, prints JSON scores. | ~150 | `client.py`, server running |
97
- | `README.md` | Environment description, action/observation spaces, task descriptions with difficulty, setup instructions, baseline scores table | ~200 | Everything |
98
-
99
- ### 1.2 Files to Edit
100
-
101
- | File | Changes | Why |
102
- |---|---|---|
103
- | `ml_training_debugger/models.py` | Finalize all field types, add `available_actions` computation logic to `EpisodeState`, add red herring fields (notes, gpu_memory) | Stubs from Phase 0 become real |
104
- | `ml_training_debugger/client.py` | Wire typed client to connect via WebSocket or HTTP as needed by baseline | Stub becomes functional |
105
- | `server/environment.py` | Full `reset()` and `step()` implementations. See spec Sections 9, 13 for lifecycle. | Stubs become real |
106
- | `server/app.py` | Wire `/tasks`, `/grader`, `/baseline`, `/health` to return real data. `/health` returns `{"status": "ready", "tasks": 3}`. | Stubs become real |
107
- | `openenv.yaml` | Finalize observation_space, action_space, reward section. Verify task IDs and max_steps per spec Section 14. | Was skeletal in Phase 0 |
108
- | `Dockerfile` | Add `COPY` for all new source files. Verify build still works. | New files added |
109
-
110
- ### 1.3 Tests to Create
111
-
112
- | Test File | What It Covers | Critical Assertions |
113
- |---|---|---|
114
- | `tests/test_scenarios.py` | `sample_scenario()` for each MVP task | Returns correct root cause enum; params within defined ranges; different seeds produce different params |
115
- | `tests/test_pytorch_engine.py` | Model instantiation, fault injection, gradient/weight extraction | `SimpleCNN` is a real `torch.nn.Module`; `extract_gradient_stats` returns `GradientStats` with real float norms; exploding fault produces `is_exploding=True`; batchnorm eval fault produces `model.training==False` |
116
- | `tests/test_simulation.py` | Parametric curve generators | All outputs are `list[float]` of length 20; exploding LR produces diverging loss; leakage produces inflated val_acc; batchnorm produces slow val_acc degradation |
117
- | `tests/test_reward_engine.py` | All 7 reward components | **Critical:** context-gated penalty fires when `gradients_inspected=True AND gradients_were_normal=True` then `add_callback`; does NOT fire when `add_callback` without prior inspection; step penalty is flat -0.01; investigation bonus is +0.05 first-time only |
118
- | `tests/test_graders.py` | Graders for tasks 001, 003, 005 | Each returns float in [0.0, 1.0]; correct diagnosis + fix + restart = 1.0; wrong diagnosis < 0.5; partial completion scores between 0 and 1 |
119
- | `tests/test_episode_lifecycle.py` | Full reset→inspect→fix→restart→diagnose flow | State transitions match spec Section 13; `available_actions` updates correctly; `done=True` after `mark_diagnosed`; step limit triggers `done=True` |
120
-
121
- ### 1.4 Task-Specific Implementation
122
-
123
- See spec Section 11 for complete task specifications. Key implementation notes per task:
124
-
125
- **Task 1 (`task_001`, easy):** Unambiguous signal. LR from spec ranges → real gradients explode → `is_exploding=True` on all layers. Straightforward grader.
126
-
127
- **Task 3 (`task_003`, medium):** Red herring note about architecture upgrade. Data leakage confirmed via `class_overlap_score`. Normal model (no gradient/weight anomaly). Mild gradient elevation on one layer (`is_exploding=False`).
128
-
129
- **Task 5 (`task_005`, hard):** The differentiator task. `gradients_were_normal=True` set inside `inspect_gradients` handler because `is_exploding=False` on ALL layers (FC spike mean_norm < 10.0). Context-gated penalty fires when agent then calls `add_callback`. Red herrings: FC spike, GPU 91%, conv1 near-vanishing, error_log warning.
130
-
131
- ### 1.5 Endpoint Responses
132
-
133
- **`GET /health`:** `{"status": "ready", "tasks": 3}` (200) — or `{"status": "initializing"}` (503) during startup.
134
-
135
- **`GET /tasks`:** Task list with IDs, difficulties, max_steps, and MLTrainingAction JSON schema.
136
-
137
- **`POST /grader`:** `{"score": float, "task_id": str, "steps": int}` (200) — or `{"score": null, "error": "no_completed_episode"}` (200) if no episode. See spec Section 14 for edge cases.
138
-
139
- **`POST /baseline`:** Runs baseline logic internally, returns `{"scores": {"task_001": float, "task_003": float, "task_005": float}}`. Returns 409 if already running.
140
-
141
- ### 1.6 Baseline Heuristic Decision Tree
142
-
143
- See spec Section 17 for the complete decision tree. Summary:
144
- ```
145
- 1. reset(task_id)
146
- 2. inspect_gradients
147
- 3. IF any layer is_exploding → fix LR → restart → diagnose lr_too_high
148
- 4. IF any layer is_vanishing → fix LR → restart → diagnose vanishing_gradients
149
- 5. inspect_data_batch
150
- 6. IF class_overlap_score > 0.5 → patch_data_loader → restart → diagnose data_leakage
151
- 7. IF val_loss diverging → modify weight_decay → restart → diagnose overfitting
152
- 8. inspect_model_modes
153
- 9. IF any layer in "eval" → fix_model_mode → restart → diagnose batchnorm_eval_mode
154
- 10. inspect_code → attempt fix → restart → diagnose code_bug
155
- 11. FALLBACK: diagnose overfitting
156
- ```
157
-
158
- ### 1.7 Deploy to HF Spaces
159
-
160
- | Step | Action | Verification |
161
- |---|---|---|
162
- | 1 | Create HF Space (Docker type), tag with `openenv` | Space page shows openenv tag |
163
- | 2 | Push Dockerfile + source to Space repo | Build triggers automatically |
164
- | 3 | Wait for build to complete | Build log shows success |
165
- | 4 | Test health endpoint | `curl https://<space-url>/health` returns `{"status": "ready", "tasks": 3}` |
166
- | 5 | Test reset via WebSocket | `wscat -c wss://<space-url>/ws` then send `{"type": "reset", "task_id": "task_001"}` |
167
- | 6 | Run `openenv validate` against deployed space | All checks pass |
168
-
169
- ### 1.8 Acceptance Criteria
170
-
171
- - [ ] `reset(task_id)` for tasks 001, 003, 005 returns valid `MLTrainingObservation` with correct initial state
172
- - [ ] `step()` dispatches all 14 action types correctly (investigation, fix, terminal)
173
- - [ ] `inspect_gradients` on Task 1 → `is_exploding=True` on all layers (real torch.autograd)
174
- - [ ] `inspect_gradients` on Task 5 → `is_exploding=False` on all layers, `gradients_were_normal=True`
175
- - [ ] `inspect_data_batch` on Task 3 → `class_overlap_score > 0.5`
176
- - [ ] `inspect_model_modes` on Task 5 → all layers in "eval" mode
177
- - [ ] Context-gated penalty: `inspect_gradients`(normal) then `add_callback` → reward includes -0.20
178
- - [ ] Context-gated penalty: `add_callback` without prior inspection → NO -0.20 penalty
179
- - [ ] Grader for Task 1: correct path scores 1.0, wrong diagnosis scores < 0.5
180
- - [ ] Grader for Task 5: agent that chases red herring scores 0.80-0.85 (penalty applied)
181
- - [ ] `baseline_heuristic.py` runs twice → `diff run1.json run2.json` is empty
182
- - [ ] `POST /baseline` returns scores for all 3 tasks, all in [0.0, 1.0]
183
- - [ ] `POST /grader` returns score after completed episode
184
- - [ ] `GET /tasks` returns 3 tasks with action schema
185
- - [ ] `GET /health` returns `{"status": "ready", "tasks": 3}`
186
- - [ ] Docker builds <500MB, starts <60s, serves on port 7860
187
- - [ ] HF Space deployed, responds to `reset()`, tagged `openenv`
188
- - [ ] `openenv validate` passes
189
- - [ ] `pytest --cov` shows >80% coverage on all Phase 1 modules
190
- - [ ] `import torch` in every core module; zero `import numpy` in core
191
- - [ ] README has: description, action/observation spaces, 3 task descriptions, setup instructions, baseline scores
192
-
193
- ---
194
-
195
- ## Phase 2: Stretch — Tasks 2, 4, 6 + Code Debugging (Days 7-9)
196
-
197
- **Goal:** Full 6-task environment with code-level debugging. Task 6 is the single highest-impact differentiator for Meta judges.
198
-
199
- **Prerequisites:** Phase 1 acceptance criteria ALL met. HF Space deployed and passing auto-validation.
200
-
201
- ### 2.1 Priority Order (Strict)
202
-
203
- 1. **Task 6** first — it is the strongest differentiator and the hardest to implement
204
- 2. **Task 2** second — structurally identical to Task 1 (vanishing vs. exploding), fastest to add
205
- 3. **Task 4** third — medium difficulty overfitting, similar pattern to existing tasks
206
-
207
- ### 2.2 Files to Create
208
-
209
- | File | Purpose | Lines (est.) | Depends On |
210
- |---|---|---|---|
211
- | `ml_training_debugger/code_templates.py` | 4 bug variant templates, `generate_code_snippet(bug_type, seed)`, `validate_fix(bug_type, line, replacement)` with multi-strategy pipeline per spec Section 22 | ~250 | `models.py` |
212
- | `tests/test_code_templates.py` | All 4 variants generate valid code; fix validation accepts correct fixes; rejects wrong fixes; handles whitespace/comment variations | ~150 | `code_templates.py` |
213
-
214
- ### 2.3 Files to Edit
215
-
216
- | File | Changes | Complexity |
217
- |---|---|---|
218
- | `ml_training_debugger/scenarios.py` | Add `sample_scenario` cases for task_002, task_004, task_006. Task 006 includes `bug_type` field. | Low |
219
- | `ml_training_debugger/pytorch_engine.py` | Add fault injection for vanishing gradients, overfitting, code bug variants. | Medium |
220
- | `ml_training_debugger/simulation.py` | Add curve generators for vanishing (flat loss), overfitting (train-val divergence), code bug variants. | Medium |
221
- | `ml_training_debugger/reward_engine.py` | Add wrong code fix penalty (-0.10). No other changes. | Low |
222
- | `ml_training_debugger/graders.py` | Add `grade_task_002`, `grade_task_004`, `grade_task_006`. Task 006: diagnosis must be `code_bug` always. | Medium |
223
- | `server/environment.py` | `step()` handlers for `inspect_code` and `fix_code`. Update `available_actions`. | Medium |
224
- | `server/app.py` | Update `/tasks` to return 6 tasks. Update `/health` to return `"tasks": 6`. | Low |
225
- | `openenv.yaml` | Add task_002, task_004, task_006. | Low |
226
- | `baseline_heuristic.py` | Extend decision tree for vanishing, overfitting, code bug. | Medium |
227
- | `README.md` | Add descriptions for Tasks 2, 4, 6. Update baseline scores. | Low |
228
-
229
- ### 2.4 Task 6 Code Fix Validation
230
-
231
- The `validate_fix()` pipeline is defined in spec Section 22 (Known Risks). Key layers:
232
-
233
- 1. **Normalize:** strip whitespace + inline comments → compare against known correct strings
234
- 2. **Tokenize:** Python `tokenize` module, filter noise tokens, compare streams
235
- 3. **Semantic patterns:** 2-3 per variant (e.g. `"criterion("` present AND `".detach()"` absent)
236
- 4. **AST fallback:** `ast.parse()` full code with replacement, verify buggy pattern absent
237
-
238
- Test cases that MUST pass: correct fix, trailing whitespace, inline comments, different indentation.
239
- Test cases that MUST fail: bug still present, `pass`, wrong line number.
240
-
241
- ### 2.5 Tests to Create/Extend
242
-
243
- | Test File | New Coverage |
244
- |---|---|
245
- | `tests/test_code_templates.py` | **New file.** All 4 variants, validate_fix accepts/rejects correctly, 5+ whitespace/comment variations per variant |
246
- | `tests/test_scenarios.py` | Extend: sample_scenario for task_002, 004, 006 |
247
- | `tests/test_simulation.py` | Extend: vanishing flat loss, overfitting divergence, code bug symptoms |
248
- | `tests/test_graders.py` | Extend: graders 002, 004, 006. Task 006: `code_bug` required; `batchnorm_eval_mode` on eval_mode variant = wrong |
249
- | `tests/test_reward_engine.py` | Extend: wrong code fix penalty (-0.10) |
250
- | `tests/test_episode_lifecycle.py` | Extend: `inspect_code` → `fix_code` available; `fix_code` before `inspect_code` → invalid |
251
-
252
- ### 2.6 Acceptance Criteria
253
-
254
- - [ ] All 6 tasks return valid observations from `reset()` and process all action types in `step()`
255
- - [ ] Task 6: `inspect_code` returns `CodeSnippet` with real PyTorch code containing the sampled bug
256
- - [ ] Task 6: `fix_code` correct → `fix_action_taken=True`, no penalty
257
- - [ ] Task 6: `fix_code` wrong → -0.10 penalty
258
- - [ ] Task 6: `mark_diagnosed(code_bug)` → correct (+0.50)
259
- - [ ] Task 6: `mark_diagnosed(batchnorm_eval_mode)` on eval_mode variant → wrong (-0.30)
260
- - [ ] `validate_fix` accepts 5+ whitespace/comment variations per variant
261
- - [ ] `validate_fix` rejects all invalid fixes
262
- - [ ] Graders for all 6 tasks return [0.0, 1.0] with meaningful variance
263
- - [ ] `baseline_heuristic.py` handles all 6 tasks, still bit-exact reproducible
264
- - [ ] `POST /baseline` returns scores for all 6 tasks
265
- - [ ] `GET /tasks` returns 6 tasks
266
- - [ ] `GET /health` returns `{"status": "ready", "tasks": 6}`
267
- - [ ] All new tests pass; overall coverage >80%
268
- - [ ] Updated openenv.yaml lists all 6 tasks
269
- - [ ] HF Space redeployed with 6 tasks, auto-validation still passes
270
-
271
- ---
272
-
273
- ## Phase 3: Polish — Dashboard, Validation Suite, LLM Baseline (Days 10-11)
274
-
275
- **Goal:** Transform a technically correct submission into a visually impressive, deeply validated, winning submission.
276
-
277
- **Prerequisites:** Phase 2 acceptance criteria ALL met. 6-task environment deployed.
278
-
279
- ### 3.1 Priority Order Within Phase 3
280
-
281
- 1. **Dashboard** — transforms judging experience (highest ROI for judges)
282
- 2. **Full test suite + README polish** — ensures no auto-validation failure
283
- 3. **Validation suite** — answers "how realistic are your curves?"
284
- 4. **LLM baseline** — demonstrates heuristic-reasoning gap (lowest priority)
285
-
286
- ### 3.2 Files to Create
287
-
288
- | File | Purpose | Lines (est.) | Priority |
289
- |---|---|---|---|
290
- | `server/dashboard.html` | Single-file SPA. 4 panels per spec Section 19. Plotly.js via CDN. | ~400 | 1st |
291
- | `validation/requirements.txt` | `torch`, `matplotlib`, `scipy` | ~3 | 3rd |
292
- | `validation/conftest.py` | Shared fixtures: CIFAR-10 subset loader, model definitions | ~50 | 3rd |
293
- | `validation/validate_exploding_gradients.py` | Real training, compare to parametric curve, R² > 0.85 | ~80 | 3rd |
294
- | `validation/validate_data_leakage.py` | Real training with leakage, compare | ~80 | 3rd |
295
- | `validation/validate_batchnorm_eval.py` | Real training with `model.eval()`, compare | ~80 | 3rd |
296
- | `validation/validate_vanishing_gradients.py` | Real gradient decay, compare | ~80 | 3rd |
297
- | `validation/validate_overfitting.py` | Real train-val divergence, compare | ~80 | 3rd |
298
- | `validation/validate_code_bugs.py` | Run 4 bug variants, confirm symptoms | ~80 | 3rd |
299
- | `validation/reports/` | Pre-computed fidelity scores + comparison plots | — | 3rd |
300
- | `baseline_inference.py` | LLM agent (GPT-4o, temp=0.0, seed=42). Runs all 6 tasks. **Now install openai.** | ~200 | 4th |
301
-
302
- ### 3.3 Files to Edit
303
-
304
- | File | Changes | Priority |
305
- |---|---|---|
306
- | `server/app.py` | Add `GET /dashboard` and `GET /validation-report` routes | 1st/3rd |
307
- | `requirements.txt` | Add `openai` (only now, for LLM baseline) | 4th |
308
- | `Dockerfile` | `COPY validation/reports/` and `COPY server/dashboard.html` | 1st |
309
- | `README.md` | Final polish: dashboard description, validation suite, measured baseline scores | 2nd |
310
- | `openenv.yaml` | Add dashboard and validation-report to endpoints | 1st |
311
-
312
- ### 3.4 Dashboard Panels
313
-
314
- See spec Section 19 for full specification. Summary:
315
- 1. **Training Metrics** — Plotly.js line charts for loss/accuracy with restart markers
316
- 2. **Gradient & Weight Heatmap** — color-coded per-layer grid (green/yellow/red/blue)
317
- 3. **Action Timeline** — horizontal bars per step, color-coded by type, reward bars
318
- 4. **Episode Summary** — task ID, state flags, available actions, grader score
319
-
320
- Tech: single HTML file, Plotly.js CDN, native WebSocket, CSS Grid. Zero Docker bloat.
321
-
322
- ### 3.5 Validation Suite
323
-
324
- Run locally (NOT in Docker build). Each script: real training → capture metrics → compare to parametric → assert R² > 0.85 → save plots. Pre-computed reports committed to git and served via `/validation-report`. See spec Section 18.
325
-
326
- ### 3.6 Tests to Create/Extend
327
-
328
- | Test File | Coverage |
329
- |---|---|
330
- | `tests/test_dashboard.py` | `GET /dashboard` returns 200 with HTML containing "Plotly" and "WebSocket" |
331
- | `tests/test_endpoints.py` | Integration: full episode via HTTP (reset→step→grader), verify response schemas |
332
- | `tests/test_baseline_reproducibility.py` | Run baseline twice, assert identical JSON |
333
- | Existing test files | Fill coverage gaps to >80% on every module |
334
-
335
- ### 3.7 Acceptance Criteria
336
-
337
- - [ ] `GET /dashboard` serves HTML that renders in a browser with 4 panels
338
- - [ ] Dashboard connects to WebSocket and updates in real time during a baseline run
339
- - [ ] Validation suite passes all scripts with R² > 0.85 (run locally)
340
- - [ ] Pre-computed validation reports exist in `validation/reports/`
341
- - [ ] `GET /validation-report` serves fidelity data
342
- - [ ] LLM baseline runs, scores higher than heuristic on Tasks 5 and 6 (if implemented)
343
- - [ ] README is complete: all 6 tasks, both baselines, dashboard description, setup instructions
344
- - [ ] `pytest --cov` shows >80% coverage across all modules
345
- - [ ] Final `openenv validate` passes
346
- - [ ] Final Docker build <500MB, starts <60s
347
- - [ ] HF Space redeployed with dashboard + all features
348
-
349
- ---
350
-
351
- ## Pre-Submission Gate Checklist
352
-
353
- **Every item must be checked before submitting. Failure on any starred (*) item = disqualification.**
354
-
355
- ### Auto-Validation Gates (*)
356
-
357
- - [ ] * **HF Space deploys** — `curl https://<space-url>/health` returns `{"status": "ready", "tasks": N}` with HTTP 200
358
- - [ ] * **HF Space responds to reset** — WebSocket connection to `/ws`, send reset message, receive valid observation
359
- - [ ] * **OpenEnv spec compliance** — `openenv validate` passes (openenv.yaml present, typed models, step/reset/state work)
360
- - [ ] * **Dockerfile builds** — `docker build -t pytorch-debugger .` succeeds
361
- - [ ] * **Docker runs** — `docker run -p 7860:7860 pytorch-debugger` starts and serves on port 7860
362
- - [ ] * **Baseline reproduces** — `python baseline_heuristic.py > run1.json && python baseline_heuristic.py > run2.json && diff run1.json run2.json` produces no output
363
- - [ ] * **3+ tasks with graders** — `GET /tasks` returns ≥3 tasks; `POST /grader` returns score in [0.0, 1.0] after each task completes
364
- - [ ] * **Graders produce varying scores** — different agent behaviors produce different scores (not always same value)
365
-
366
- ### Required Endpoint Gates (*)
367
-
368
- - [ ] * **`GET /tasks`** — returns JSON with task IDs, difficulties, action schema
369
- - [ ] * **`POST /grader`** — returns `{"score": float}` after a completed episode
370
- - [ ] * **`POST /baseline`** — triggers baseline, returns scores for all tasks
371
- - [ ] * **`GET /health`** — returns `{"status": "ready", "tasks": N}`
372
-
373
- ### Submission Artifacts (*)
374
-
375
- - [ ] * **Public GitHub repo** — contains all code, README, requirements, openenv.yaml
376
- - [ ] * **HF Spaces demo link** — deployed, tagged `openenv`, accessible
377
- - [ ] * **README complete** — environment description, action/observation space definitions, task descriptions with difficulty, setup instructions, baseline scores
378
-
379
- ### Quality Gates (Not DQ, but impact scoring)
380
-
381
- - [ ] All typed Pydantic models — no `Dict[str, Any]`
382
- - [ ] `import torch` in every core module — zero `import numpy` in core
383
- - [ ] Context-gated penalty fires correctly (manually tested both paths)
384
- - [ ] Task 5 red herrings present: FC spike, GPU 91%, conv1 near-vanishing, error_log warning
385
- - [ ] Task 6 code fix validation handles whitespace and comment variations
386
- - [ ] Task 6 diagnosis is always `code_bug` regardless of bug variant
387
- - [ ] Grader and reward function are separate modules
388
- - [ ] Step penalty is flat -0.01 (not multiplied by step_count)
389
- - [ ] Episode state is isolated per WebSocket session
390
- - [ ] Test suite passes with >80% coverage
391
- - [ ] Code formatted with black, linted with ruff, imports sorted with isort
392
-
393
- ### Final Smoke Test Sequence
394
-
395
- Run this entire sequence the night before submission:
396
-
397
- ```bash
398
- # 1. Clean build
399
- docker build --no-cache -t pytorch-debugger .
400
- docker run -d -p 7860:7860 --name smoke-test pytorch-debugger
401
-
402
- # 2. Wait for startup
403
- sleep 10
404
- curl -f http://localhost:7860/health || echo "FAIL: health"
405
-
406
- # 3. Tasks endpoint
407
- curl -f http://localhost:7860/tasks | python -m json.tool || echo "FAIL: tasks"
408
-
409
- # 4. Baseline reproducibility
410
- python baseline_heuristic.py > run1.json 2>/dev/null
411
- python baseline_heuristic.py > run2.json 2>/dev/null
412
- diff run1.json run2.json && echo "PASS: reproducible" || echo "FAIL: non-reproducible"
413
-
414
- # 5. Baseline via endpoint
415
- curl -f -X POST http://localhost:7860/baseline | python -m json.tool || echo "FAIL: baseline endpoint"
416
-
417
- # 6. Grader via endpoint (after baseline has completed episodes)
418
- curl -f -X POST http://localhost:7860/grader | python -m json.tool || echo "FAIL: grader endpoint"
419
-
420
- # 7. OpenEnv validation
421
- openenv validate || echo "FAIL: openenv validate"
422
-
423
- # 8. Test suite
424
- pytest tests/ -v --cov=ml_training_debugger --cov-report=term-missing
425
-
426
- # 9. Cleanup
427
- docker stop smoke-test && docker rm smoke-test
428
-
429
- echo "=== Smoke test complete ==="
430
- ```
431
-
432
- ### If Something Fails at Submission Time
433
-
434
- | Failure | Triage |
435
- |---|---|
436
- | HF Space won't deploy | Check Dockerfile CMD, port 7860, build logs. Redeploy. |
437
- | Baseline non-reproducible | Check `torch.manual_seed()` in `reset()`. Check for `random` module usage. |
438
- | Grader returns same score | Check that `sample_scenario` uses different seeds. Check grader logic has branching. |
439
- | `openenv validate` fails | Read error message. Usually missing field in openenv.yaml or wrong model base class. |
440
- | Docker image >500MB | Check `docker images` size. Remove unused deps. Ensure torch is CPU-only. |
441
- | Test coverage <80% | Run `pytest --cov` with `--cov-report=html`. Find uncovered branches. Add targeted tests. |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
inference.py CHANGED
@@ -46,8 +46,10 @@ TASK_NAME = os.environ.get("TASK_NAME", "task_001")
46
  BENCHMARK = "pytorch-training-debugger"
47
 
48
  MAX_STEPS = 25
49
- MAX_TOTAL_REWARD = 1.0
50
- SUCCESS_SCORE_THRESHOLD = 0.6
 
 
51
  TEMPERATURE = 0.0
52
  MAX_TOKENS = 300
53
  FALLBACK_ACTION = '{"action_type": "inspect_gradients"}'
 
46
  BENCHMARK = "pytorch-training-debugger"
47
 
48
  MAX_STEPS = 25
49
+ # Max achievable reward: +0.50 (diagnosis) +0.40 (convergence) +5*0.05 (investigations)
50
+ # minus step penalties. Use 1.15 as the theoretical ceiling for normalization.
51
+ MAX_TOTAL_REWARD = 1.15
52
+ SUCCESS_SCORE_THRESHOLD = 0.5
53
  TEMPERATURE = 0.0
54
  MAX_TOKENS = 300
55
  FALLBACK_ACTION = '{"action_type": "inspect_gradients"}'
run_all_baselines.py DELETED
@@ -1,130 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Run heuristic + multiple LLM baselines and show comparison table.
3
-
4
- Usage:
5
- python3 run_all_baselines.py
6
- """
7
-
8
- from __future__ import annotations
9
-
10
- import json
11
- import os
12
- import sys
13
- import time
14
- from concurrent.futures import ThreadPoolExecutor, as_completed
15
- from pathlib import Path
16
-
17
- # Load .env
18
- _env_path = Path(__file__).parent / ".env"
19
- if _env_path.exists():
20
- for line in _env_path.read_text().splitlines():
21
- line = line.strip()
22
- if line and not line.startswith("#") and "=" in line:
23
- key, _, value = line.partition("=")
24
- os.environ.setdefault(key.strip(), value.strip())
25
-
26
- from baseline_heuristic import ALL_TASKS
27
- from baseline_heuristic import run_heuristic_episode
28
- from baseline_inference import PROVIDERS, run_llm_episode
29
-
30
- try:
31
- from openai import OpenAI
32
- except ImportError:
33
- print("Error: pip install openai")
34
- sys.exit(1)
35
-
36
-
37
- def run_heuristic() -> dict[str, float]:
38
- scores = {}
39
- for task_id in ALL_TASKS:
40
- scores[task_id] = round(run_heuristic_episode(task_id), 4)
41
- return scores
42
-
43
-
44
- def run_llm_provider(provider_name: str, model: str | None = None) -> dict[str, float]:
45
- prov = PROVIDERS[provider_name]
46
- api_key = os.environ.get(prov["env_key"])
47
- if not api_key:
48
- return {t: -1.0 for t in ALL_TASKS} # -1 = no key
49
-
50
- model_name = model or prov["default_model"]
51
- client_kwargs: dict = {"api_key": api_key}
52
- if prov["base_url"]:
53
- client_kwargs["base_url"] = prov["base_url"]
54
- client = OpenAI(**client_kwargs)
55
-
56
- scores: dict[str, float] = {}
57
- for task_id in ALL_TASKS:
58
- try:
59
- score = run_llm_episode(task_id, client, model_name)
60
- scores[task_id] = round(score, 4)
61
- print(f" [{provider_name}/{model_name}] {task_id}: {score:.4f}", file=sys.stderr)
62
- except Exception as e:
63
- err_str = str(e)[:80]
64
- print(f" [{provider_name}/{model_name}] {task_id}: ERROR — {err_str}", file=sys.stderr)
65
- scores[task_id] = 0.0
66
- return scores
67
-
68
-
69
- def main() -> None:
70
- print("Running all baselines...\n", file=sys.stderr)
71
-
72
- results: dict[str, dict[str, float]] = {}
73
-
74
- # Run heuristic first (fast, deterministic)
75
- print("--- Heuristic baseline ---", file=sys.stderr)
76
- results["Heuristic"] = run_heuristic()
77
- print(f" Done: {json.dumps(results['Heuristic'])}", file=sys.stderr)
78
-
79
- # Run LLM providers sequentially (avoids thread hang issues)
80
- llm_runs = [
81
- ("Cerebras/Llama-3.1-8B", "cerebras", "llama3.1-8b"),
82
- ("Groq/Llama-3.1-8B", "groq", "llama-3.1-8b-instant"),
83
- ]
84
-
85
- for label, provider, model in llm_runs:
86
- print(f"\n--- {label} ---", file=sys.stderr)
87
- try:
88
- results[label] = run_llm_provider(provider, model)
89
- except Exception as e:
90
- print(f" {label}: FAILED — {e}", file=sys.stderr)
91
- results[label] = {t: 0.0 for t in ALL_TASKS}
92
-
93
- # Print comparison table
94
- print("\n" + "=" * 80)
95
- print("BASELINE COMPARISON TABLE")
96
- print("=" * 80)
97
-
98
- headers = list(results.keys())
99
- print(f"\n{'Task':<12}", end="")
100
- for h in headers:
101
- print(f"{h:>25}", end="")
102
- print()
103
- print("-" * (12 + 25 * len(headers)))
104
-
105
- for task_id in ALL_TASKS:
106
- print(f"{task_id:<12}", end="")
107
- for h in headers:
108
- score = results[h].get(task_id, 0.0)
109
- if score < 0:
110
- print(f"{'no key':>25}", end="")
111
- else:
112
- print(f"{score:>25.4f}", end="")
113
- print()
114
-
115
- print("-" * (12 + 25 * len(headers)))
116
-
117
- # Averages
118
- print(f"{'AVERAGE':<12}", end="")
119
- for h in headers:
120
- valid = [v for v in results[h].values() if v >= 0]
121
- avg = sum(valid) / len(valid) if valid else 0
122
- print(f"{avg:>25.4f}", end="")
123
- print()
124
-
125
- # Save JSON
126
- print(json.dumps(results, indent=2))
127
-
128
-
129
- if __name__ == "__main__":
130
- main()