Spaces:

ujjwalpardeshi
/

pytorch-training-debugger

Sleeping

App Files Files Community

omkarrr88 committed on Apr 4

Commit

da20dfa

1 Parent(s): 6be6d8e

minor changes

Browse files

Files changed (9) hide show

.gitignore +7 -0
Dockerfile +0 -1
baseline_inference.py +0 -241
docs/PAPER.md +0 -56
docs/PRD.md +0 -367
docs/PROJECT_GUIDE.md +0 -691
docs/ROADMAP.md +0 -441
inference.py +4 -2
run_all_baselines.py +0 -130

.gitignore CHANGED Viewed

@@ -20,3 +20,10 @@ CLAUDE.md
 uv.lock
 deploy-hf.sh
 deploy.sh

 uv.lock
 deploy-hf.sh
 deploy.sh
+AUDIT_REPORT.md
+baseline_inference.py
+run_all_baselines.py
+docs/PAPER.md
+docs/PRD.md
+docs/ROADMAP.md
+docs/PROJECT_GUIDE.md

Dockerfile CHANGED Viewed

@@ -96,7 +96,6 @@ COPY ml_training_debugger/ ml_training_debugger/
 COPY server/ server/
 COPY openenv.yaml .
 COPY baseline_heuristic.py .
-COPY baseline_inference.py .
 COPY inference.py .
 COPY demo.py .
 COPY README.md .

 COPY server/ server/
 COPY openenv.yaml .
 COPY baseline_heuristic.py .
 COPY inference.py .
 COPY demo.py .
 COPY README.md .

baseline_inference.py DELETED Viewed

@@ -1,241 +0,0 @@
-#!/usr/bin/env python3
-"""LLM baseline agent using Google Gemini (via OpenAI-compatible SDK).
-Requires GEMINI_API_KEY environment variable (or pass via --api-key).
-Uses temperature=0.0 for near-deterministic behavior.
-Spec reference: Section 17.
-Usage:
-    GEMINI_API_KEY=... python baseline_inference.py
-    python baseline_inference.py --api-key YOUR_KEY
-"""
-from __future__ import annotations
-import argparse
-import json
-import os
-import sys
-from pathlib import Path
-# Load .env file if present
-_env_path = Path(__file__).parent / ".env"
-if _env_path.exists():
-    for line in _env_path.read_text().splitlines():
-        line = line.strip()
-        if line and not line.startswith("#") and "=" in line:
-            key, _, value = line.partition("=")
-            os.environ.setdefault(key.strip(), value.strip())
-try:
-    from openai import OpenAI
-except ImportError:
-    print("Error: openai package not installed. Run: pip install openai")
-    sys.exit(1)
-from ml_training_debugger.models import MLTrainingAction
-from server.environment import MLTrainingEnvironment
-ALL_TASKS = [
-    "task_001",
-    "task_002",
-    "task_003",
-    "task_004",
-    "task_005",
-    "task_006",
-    "task_007",
-]
-SYSTEM_PROMPT = """You are an expert ML engineer debugging a PyTorch training run.
-You are interacting with an environment that simulates a broken training job.
-Available actions (respond with JSON only, no explanation):
-- {"action_type": "inspect_gradients"} - View gradient statistics per layer
-- {"action_type": "inspect_data_batch"} - View data batch statistics and confusion matrix
-- {"action_type": "inspect_model_modes"} - View model layer modes (train/eval)
-- {"action_type": "inspect_model_weights"} - View model weight statistics
-- {"action_type": "inspect_code"} - View PyTorch training code
-- {"action_type": "modify_config", "target": "<field>", "value": <val>} - Change a hyperparameter
-- {"action_type": "add_callback"} - Add gradient clipping/scheduler
-- {"action_type": "patch_data_loader"} - Fix data pipeline issues
-- {"action_type": "fix_model_mode"} - Call model.train()
-- {"action_type": "fix_code", "line": <int>, "replacement": "<code>"} - Fix a code line
-- {"action_type": "restart_run"} - Restart training (requires a fix first)
-- {"action_type": "mark_diagnosed", "diagnosis": "<cause>"} - Submit diagnosis
-Valid diagnoses: lr_too_high, vanishing_gradients, data_leakage, overfitting, batchnorm_eval_mode, code_bug, scheduler_misconfigured
-Strategy:
-1. First investigate by inspecting gradients, data, model modes, and code
-2. Form a hypothesis based on the evidence gathered
-3. Apply the correct fix for the identified root cause
-4. Restart training to verify the fix works
-5. Submit your diagnosis
-IMPORTANT: Respond with ONLY a valid JSON action object. No explanation, no markdown, no code blocks."""
-def run_llm_episode(task_id: str, client: OpenAI, model_name: str) -> float:
-    """Run one LLM agent episode."""
-    env = MLTrainingEnvironment()
-    obs = env.reset(seed=42, episode_id=f"llm_{task_id}", task_id=task_id)
-    initial_obs = {
-        "training_loss_history": obs.training_loss_history[:5],
-        "val_accuracy_history": obs.val_accuracy_history[:5],
-        "current_config": obs.current_config.model_dump(),
-        "error_log": obs.error_log,
-        "available_actions": obs.available_actions,
-        "notes": obs.notes,
-        "gpu_memory_used_gb": obs.gpu_memory_used_gb,
-    }
-    messages = [
-        {"role": "system", "content": SYSTEM_PROMPT},
-        {
-            "role": "user",
-            "content": f"New episode started for a broken PyTorch training run.\n\nInitial observation:\n{json.dumps(initial_obs, indent=2, default=str)}",
-        },
-    ]
-    for step in range(25):
-        if obs.done:
-            break
-        try:
-            response = client.chat.completions.create(
-                model=model_name,
-                messages=messages,
-                temperature=0.0,
-                max_tokens=300,
-            )
-            action_text = response.choices[0].message.content.strip()
-        except Exception as e:
-            print(f"    Step {step}: API error — {e}", file=sys.stderr)
-            break
-        # Clean up common LLM formatting issues
-        action_text = action_text.strip("`").strip()
-        if action_text.startswith("json"):
-            action_text = action_text[4:].strip()
-        messages.append({"role": "assistant", "content": action_text})
-        try:
-            action_data = json.loads(action_text)
-            action = MLTrainingAction(**action_data)
-        except (json.JSONDecodeError, Exception) as e:
-            messages.append(
-                {
-                    "role": "user",
-                    "content": f"Invalid action format: {e}. Respond with ONLY valid JSON.",
-                }
-            )
-            continue
-        obs = env.step(action)
-        obs_summary: dict = {
-            "reward": obs.reward,
-            "done": obs.done,
-            "step": obs.episode_state.step_count,
-            "available_actions": obs.available_actions,
-        }
-        if obs.error_log:
-            obs_summary["error_log"] = obs.error_log
-        if obs.gradient_stats:
-            obs_summary["gradient_stats"] = [
-                {
-                    "layer": g.layer_name,
-                    "mean_norm": round(g.mean_norm, 4),
-                    "exploding": g.is_exploding,
-                    "vanishing": g.is_vanishing,
-                }
-                for g in obs.gradient_stats
-            ]
-        if obs.data_batch_stats:
-            obs_summary["data_overlap"] = obs.data_batch_stats.class_overlap_score
-            obs_summary["duplicate_ratio"] = obs.data_batch_stats.duplicate_ratio
-        if obs.model_mode_info:
-            obs_summary["model_modes"] = obs.model_mode_info
-        if obs.code_snippet:
-            obs_summary["code"] = obs.code_snippet.code[:600]
-            obs_summary["hint"] = obs.code_snippet.hint
-        messages.append(
-            {
-                "role": "user",
-                "content": f"Observation after your action:\n{json.dumps(obs_summary, indent=2, default=str)}",
-            }
-        )
-    session = env._get_session()
-    return session.last_score if session and session.last_score is not None else 0.0
-PROVIDERS = {
-    "groq": {
-        "env_key": "GROQ_API_KEY",
-        "base_url": "https://api.groq.com/openai/v1",
-        "default_model": "llama-3.3-70b-versatile",
-    },
-    "cerebras": {
-        "env_key": "CEREBRAS_API_KEY",
-        "base_url": "https://api.cerebras.ai/v1",
-        "default_model": "llama3.1-8b",
-    },
-    "gemini": {
-        "env_key": "GEMINI_API_KEY",
-        "base_url": "https://generativelanguage.googleapis.com/v1beta/openai/",
-        "default_model": "gemini-2.0-flash",
-    },
-    "openai": {
-        "env_key": "OPENAI_API_KEY",
-        "base_url": None,
-        "default_model": "gpt-4o",
-    },
-}
-def main() -> None:
-    parser = argparse.ArgumentParser(description="LLM baseline agent")
-    parser.add_argument("--url", default="http://localhost:7860")
-    parser.add_argument("--api-key", default=None, help="API key")
-    parser.add_argument(
-        "--provider",
-        default="groq",
-        choices=list(PROVIDERS.keys()),
-        help="LLM provider (default: groq)",
-    )
-    parser.add_argument("--model", default=None, help="Model name (auto-detected from provider)")
-    args = parser.parse_args()
-    prov = PROVIDERS[args.provider]
-    api_key = args.api_key or os.environ.get(prov["env_key"])
-    if not api_key:
-        print(f"Error: Set {prov['env_key']} env var or pass --api-key")
-        sys.exit(1)
-    model_name = args.model or prov["default_model"]
-    client_kwargs: dict = {"api_key": api_key}
-    if prov["base_url"]:
-        client_kwargs["base_url"] = prov["base_url"]
-    client = OpenAI(**client_kwargs)
-    scores: dict[str, float] = {}
-    print(f"Running LLM baseline with {args.provider}/{model_name}...", file=sys.stderr)
-    for task_id in ALL_TASKS:
-        try:
-            score = run_llm_episode(task_id, client, model_name)
-            scores[task_id] = round(score, 4)
-            print(f"  {task_id}: {score:.4f}", file=sys.stderr)
-        except Exception as e:
-            print(f"  {task_id}: ERROR — {e}", file=sys.stderr)
-            scores[task_id] = 0.0
-    print(json.dumps(scores, indent=2))
-if __name__ == "__main__":
-    main()

docs/PAPER.md DELETED Viewed

@@ -1,56 +0,0 @@
-# Context-Gated Reward Shaping for Evidence-Based ML Debugging
-## Abstract
-We present a reinforcement learning environment for training AI agents to debug broken PyTorch training runs. The environment introduces **context-gated reward shaping** — a penalty mechanism that distinguishes between reasonable prior actions (no penalty) and actions that ignore evidence the agent has already gathered (penalty). This single mechanic encodes evidence-based reasoning directly into the reward signal, teaching agents to reason about their accumulated knowledge rather than follow fixed playbooks. The environment covers 7 failure scenarios across 3 difficulty tiers, uses real PyTorch model internals (torch.nn.Module, torch.autograd, state_dict()), and includes a code-level debugging task where agents must read and fix actual Python source code.
-## Motivation
-ML teams spend 15-25% of engineer time debugging silent training failures — runs that produce no errors, just mysteriously bad metrics. Each misdiagnosed restart wastes $2-8/hour/GPU. Existing RL environments focus on games, navigation, or text tasks. No environment trains agents for the diagnostic reasoning process that ML engineers perform daily: gathering evidence from gradients, weights, data, and code; forming hypotheses under uncertainty; and making evidence-based decisions about which fix to apply.
-## Method: Context-Gated Reward Shaping
-Standard RL environments use stateless rewards: "did action X happen?" Our environment tracks the agent's information state and conditions penalties on what the agent has already observed.
-**Core mechanic:** An agent that adds gradient clipping *before* inspecting gradients follows a reasonable prior — no penalty. An agent that inspects gradients, sees they are normal, and *then* adds gradient clipping is ignoring counter-evidence — **-0.20 penalty**.
-Formally: the penalty fires when `gradients_inspected == True AND gradients_were_normal == True AND action == add_callback`. This gate requires two conditions to be jointly true, both of which depend on prior agent actions.
-This teaches agents a transferable skill: *don't ignore what you've already learned*. In real MLOps, ignoring gathered evidence leads to wasted GPU hours and delayed incident resolution.
-## Environment Design
-- **7 tasks** across 3 difficulty tiers (easy, medium, hard) with difficulty scaling (1-5)
-- **Real PyTorch models**: SimpleCNN (~50K params) and SimpleMLP (~20K params) with real torch.autograd gradients
-- **Progressive information reveal**: agents must actively choose what to investigate
-- **Code-level debugging** (Task 6): agent reads PyTorch source and submits line-by-line fixes
-- **Red herring injection** (Task 5): misleading gradient spikes, GPU memory warnings, near-vanishing layers
-- **Confusion matrices** in data batch inspection for richer diagnostic signals
-- **7 diagnosis types**: lr_too_high, vanishing_gradients, data_leakage, overfitting, batchnorm_eval_mode, code_bug, scheduler_misconfigured
-## Results
-Three-agent comparison demonstrates the environment differentiates across agent types:
-| Task | Heuristic | Llama 3.3 70B | Llama 3.1 8B |
-|------|-----------|---------------|--------------|
-| task_001 | **1.00** | 1.00 | 0.60 |
-| task_002 | **1.00** | 1.00 | 0.05 |
-| task_003 | **1.00** | 0.40 | 0.40 |
-| task_004 | 0.45 | 0.45 | **0.60** |
-| task_005 | **1.00** | 1.00 | 1.00 |
-| task_006 | **1.00** | — | 0.60 |
-| task_007 | **1.00** | — | 0.60 |
-| **Average** | **0.92** | 0.69* | 0.55 |
-Key findings: (1) Model size matters — the 70B model's average score is about 25% higher than the 8B model's (0.69 vs. 0.55). (2) The domain-specific heuristic (0.92) outperforms the general-purpose LLMs (0.55-0.69), indicating that the environment rewards systematic debugging. (3) Task 4 is the exception, where flexible LLM reasoning (Llama 3.1 8B, 0.60) outperforms the rigid heuristic (0.45) on subtle real training curves; the 70B model ties the heuristic at 0.45.
-## Conclusion
-Context-gated reward shaping is a general technique applicable to any RL environment where agents must reason about accumulated evidence. By conditioning penalties on the agent's information state, we create environments that reward systematic investigation over pattern-matching — a capability with direct transfer value to real-world MLOps debugging.
-The environment is deployed as an OpenEnv-compatible Docker container on Hugging Face Spaces with full API documentation, a live diagnostic dashboard, and bit-exact reproducible baselines.
----
-*Built for the Meta PyTorch OpenEnv Hackathon x Scaler School of Technology, 2026.*

docs/PRD.md DELETED Viewed

@@ -1,367 +0,0 @@
-# PRD — PyTorch Training Run Debugger
-**Product:** OpenEnv RL environment for ML training failure diagnosis
-**Hackathon:** Meta PyTorch OpenEnv Hackathon x Scaler School of Technology, Round 1
-**Deadline:** April 8, 2026 (submission window opens March 28)
-**Runtime:** Python 3.12 · PyTorch CPU-only · openenv-core v0.2.2
-**Source of truth:** `ml-training-debugger-spec.md` for all implementation detail beyond this PRD
----
-## 1. Overview
-### 1.1 What We Are Building
-An OpenEnv-compliant reinforcement learning environment where an AI agent receives a snapshot of a broken PyTorch training run and must investigate, diagnose, fix, and verify the failure through a multi-step interactive process. The environment exposes real PyTorch model internals (gradients from `torch.autograd`, weights from `model.state_dict()`) and covers 6 failure scenarios across 3 difficulty tiers.
-### 1.2 Problem Being Solved
-MLOps teams spend 15-25% of engineer time debugging silent training failures — runs that produce no error, no crash, just bad metrics. Each misdiagnosed restart wastes GPU compute at $2-8/hour/card. The diagnostic process is hard because:
-- Multiple symptoms can point to multiple causes simultaneously
-- Some bugs produce no error — just mysteriously bad performance
-- Fixing the wrong thing wastes hours of compute and restarts
-- Static analysis catches some bugs but cannot reason through ambiguous runtime signals
-No existing OpenEnv environment covers this domain. The OpenEnv Hub currently contains a demo echo environment and a code execution environment. This fills a genuine gap.
-### 1.3 Why This Domain Wins
-1. **Strategic alignment** — PyTorch debugging for a Meta PyTorch hackathon. Judges from Meta and Hugging Face will see their own framework as the core subject matter.
-2. **Novel reward design** — Context-gated penalties that encode evidence-based reasoning into the reward signal. No existing OpenEnv environment attempts this.
-3. **Code-level debugging** — Task 6 requires the agent to read and fix actual PyTorch code. Directly addresses Meta's interest: can an AI agent debug PyTorch?
-4. **Ecosystem gap** — Zero competition in the OpenEnv ecosystem for ML training failure diagnosis.
-### 1.4 Key Differentiators
-| Differentiator | What It Is | Why It Matters |
-|---|---|---|
-| Context-gated reward shaping | Penalty fires only when agent ignores evidence it already gathered; no penalty for reasonable priors | Encodes evidence-based decision making — a capability no other OpenEnv environment has |
-| PyTorch-native internals | Real `torch.nn.Module` models, real `torch.autograd` gradients, real `state_dict()` snapshots | Every model-level observation is grounded in real PyTorch computation, not synthetic data |
-| Code-level debugging (Task 6) | Agent reads PyTorch code, identifies buggy line, submits code fix | Tests code understanding, not just metric interpretation — aligned with Meta's core interest |
----
-## 2. Target Users
-### 2.1 Primary: Hackathon Judges (Meta + Hugging Face Engineers)
-**What they evaluate:**
-- Real-world utility (30%) — Is this a genuine task? Would someone use this to train/evaluate agents?
-- Task & grader quality (25%) — Well-defined tasks, accurate graders, meaningful difficulty progression?
-- Environment design (20%) — Clean state management, sensible action/observation spaces, good reward shaping?
-- Code quality & spec compliance (15%) — OpenEnv spec, clean structure, typed models, working Dockerfile?
-- Creativity & novelty (10%) — Novel domain, interesting mechanics, original approach?
-**What impresses them:**
-- Real `import torch` in core modules (not numpy wrappers)
-- A live dashboard where they can watch an agent investigate in real time
-- Deterministic graders that produce different scores for different agent quality levels
-- The context-gated penalty — nuanced reward design that goes beyond standard practice
-**What disqualifies:**
-- HF Space doesn't deploy or respond to `reset()`
-- Plagiarized or trivially modified existing environments
-- Graders that always return the same score
-- No baseline inference script
-- Dockerfile doesn't build
-### 2.2 Secondary: RL Researchers and Agent Developers
-**What they need:**
-- A challenging benchmark that differentiates heuristic agents from reasoning-capable ones
-- Clear, typed action/observation schemas for agent integration
-- Reproducible baseline scores for comparison
-- Environments that produce meaningful reward signal across the full trajectory (not just sparse terminal reward)
-### 2.3 Tertiary: Auto-Validation System (Phase 1 Gate)
-A non-human "user" that must pass before any human judge sees the submission:
-- Pings HF Space URL — must return 200 and respond to `reset()`
-- Validates `openenv.yaml`, typed models, `step()`/`reset()`/`state()` endpoints
-- Runs `docker build` on submitted repo
-- Runs baseline script twice — scores must be identical
-- Enumerates tasks, runs each grader — scores must be in [0.0, 1.0]
----
-## 3. Success Metrics
-### 3.1 Evaluation Criteria Targets
-| Criterion | Weight | Target Score | How We Hit It |
-|---|---|---|---|
-| Real-world utility | 30% | 26-30 | ML debugging is a $B+ problem; every PyTorch team encounters these failures; fills a genuine OpenEnv gap |
-| Task & grader quality | 25% | 21-25 | 6 tasks (3 MVP), 3 difficulty tiers, deterministic graders, hard tasks challenge frontier models |
-| Environment design | 20% | 17-20 | Progressive reveal, context-gated penalties, dynamic `available_actions`, proper episode boundaries |
-| Code quality & spec compliance | 15% | 13-15 | Full OpenEnv spec, typed Pydantic models, working Dockerfile + HF Space, two baselines |
-| Creativity & novelty | 10% | 9-10 | Context-gated rewards, real PyTorch model internals, code fix task — all new to OpenEnv |
-| **Total** | **100%** | **86-100** | |
-### 3.2 Quantitative Success Criteria
-| Metric | Target | Measurement |
-|---|---|---|
-| Auto-validation | Pass all 5 gates | `openenv validate` + smoke test sequence |
-| Grader score range | Meaningful variance per task | Heuristic baseline ~0.30-0.85 across tasks (not flat) |
-| Heuristic-LLM gap | Measurable difference | LLM scores higher than heuristic on Tasks 5 and 6 |
-| `reset()` latency | <200ms | Model instantiation + 2 forward passes + parametric curves |
-| `step()` latency | <10ms | Action dispatch + reward computation + state update |
-| Baseline reproducibility | Bit-exact across runs | `diff run1.json run2.json` produces no output |
-| Docker image size | <500MB | PyTorch CPU-only + python:3.12-slim |
-| Test coverage | >80% | `pytest --cov` |
-### 3.3 Qualitative Success Criteria
-- A judge can open `/dashboard`, trigger a baseline run, and understand the agent's reasoning at a glance
-- Task 5 (BatchNorm eval mode) visibly differentiates disciplined investigation from red-herring chasing
-- Task 6 (code bug) produces a "wow" moment — an agent reading and fixing PyTorch code in front of Meta judges
-- The context-gated penalty creates a story: "this agent gathered evidence and then ignored it"
----
-## 4. Functional Requirements
-> **Complete typed specifications for all data models, actions, observations, tasks, reward components, and error handling are in `ml-training-debugger-spec.md` Sections 10-16.** This section provides a product-level summary.
-### 4.1 Agent Interaction Loop
-```
-reset(task_id) → initial observation (loss curves, config, error log — no gradients/weights/data/code)
-     ↓
-step(action)   → updated observation + reward + done flag (progressive reveal)
-     ↓
-  ... repeat ...
-     ↓
-step(mark_diagnosed) → terminal observation, done=True, episode scored by grader
-```
-### 4.2 Observation Space Summary
-The `MLTrainingObservation` extends `Observation` from openenv-core. Key design:
-- **Always visible from reset:** loss/accuracy histories, config, error_log, GPU memory, episode state, available actions
-- **Progressively revealed:** gradient stats (real torch.autograd), weight stats (real state_dict), data batch stats, model mode info, code snippets — each populated only after the corresponding `inspect_*` action
-- All fields are typed Pydantic models with explicit types. See spec Section 10 for complete field definitions.
-### 4.3 Action Space Summary
-The `MLTrainingAction` extends `Action` from openenv-core. 14 action types in 3 categories:
-- **Investigation** (5): `inspect_gradients`, `inspect_data_batch`, `inspect_model_modes`, `inspect_model_weights`, `inspect_code`
-- **Fix** (7): `modify_config`, `add_callback`, `replace_optimizer`, `patch_data_loader`, `fix_model_mode`, `fix_code`, `rollback_checkpoint`
-- **Terminal** (2): `restart_run`, `mark_diagnosed`
-Dynamic availability: `restart_run` requires `fix_action_taken`, `fix_code` requires `code_inspected`, `mark_diagnosed` disappears after submission. See spec Section 10 for complete action definitions and required fields.
-### 4.4 Diagnosis Enum (RootCauseDiagnosis)
-Closed set of 6 values. Grader is a single equality check — no fuzzy matching.
-| Value | Description |
-|---|---|
-| `lr_too_high` | Learning rate too large for the architecture |
-| `vanishing_gradients` | LR too low or architecture too deep, gradients decay to near-zero |
-| `data_leakage` | Validation samples appearing in training batches |
-| `overfitting` | Model memorizing training data, failing to generalize |
-| `batchnorm_eval_mode` | Model left in eval mode, BatchNorm using running statistics |
-| `code_bug` | Bug in the PyTorch training code (Task 6 — always this, regardless of bug variant) |
-### 4.5 Reward Function Summary
-Per-step signal. **Separate from the grader** (see 4.6). Range: [-1.0, 1.0] hard cap.
-| Event | Reward | Gate Condition |
-|---|---|---|
-| Any step taken | -0.01 | Unconditional, flat constant (never multiplied by step_count) |
-| First-time inspection (per type) | +0.05 | Not previously inspected for that type |
-| `add_callback` after normal gradients | -0.20 | `gradients_inspected == True AND gradients_were_normal == True` |
-| Invalid action | -0.05 | Action not in current `available_actions` |
-| Wrong code fix | -0.10 | `fix_code` with incorrect line or replacement |
-| Correct diagnosis | +0.50 | `diagnosis == true_root_cause` |
-| Wrong diagnosis | -0.30 | `diagnosis != true_root_cause` |
-| Convergence after fix+restart | +0.40 | `fix_action_taken AND restart_after_fix AND convergence_confirmed` |
-See spec Section 12 for full design rationale.
-### 4.6 Grader Function
-Returns a single normalized 0.0-1.0 score at episode end. Evaluates `EpisodeState` holistically — checks which key actions were taken, whether the correct fix was applied, whether the diagnosis is correct, and efficiency. **Not a sum of step rewards.** One grader function per task. All graders are deterministic.
-Exposed via `POST /grader`. Returns score for the most recently completed episode.
-### 4.7 The Six Tasks
-| Task | ID | Difficulty | Root Cause | Key Signal | Heuristic Score |
-|---|---|---|---|---|---|
-| Exploding Gradients | `task_001` | Easy | `lr_too_high` | All layers `is_exploding: True`, NaN in error_log | ~0.85 |
-| Vanishing Gradients | `task_002` | Easy | `vanishing_gradients` | Deeper layers `is_vanishing: True`, flat loss | ~0.80 |
-| Silent Data Leakage | `task_003` | Medium | `data_leakage` | High val accuracy from epoch 1, `class_overlap_score` 0.68-0.88 | ~0.70 |
-| Overfitting | `task_004` | Medium | `overfitting` | Train-val divergence: train loss→0.01 while val loss climbs | ~0.65 |
-| BatchNorm Eval Mode | `task_005` | Hard | `batchnorm_eval_mode` | Slow val degradation + compound red herrings | ~0.45 |
-| PyTorch Code Bug | `task_006` | Hard | `code_bug` (always) | Anomalous metrics, root cause only visible in code | ~0.30 |
-**MVP tasks:** 1, 3, 5 (satisfies the 3-task minimum with easy→medium→hard range).
-See spec Section 11 for complete task specifications including fault parameters, red herrings, solution paths, and grader breakdowns.
-### 4.8 Baseline Agents
-**Rule-based baseline (submission default, `baseline_heuristic.py`):**
-- Deterministic decision tree: inspect_gradients → check exploding/vanishing → inspect_data → check leakage → check overfitting → inspect_model_modes → inspect_code → fallback
-- No API key required. Bit-exact reproducible.
-- Used for Phase 1 auto-validation reproducibility checks.
-**LLM baseline (optional, `baseline_inference.py`):**
-- GPT-4o at temperature=0.0, seed=42
-- Requires `OPENAI_API_KEY` environment variable
-- Supplementary demonstration of heuristic vs. reasoning score gap
-- Not used for Phase 1 reproducibility — scores reported only after empirical measurement
-### 4.9 Required Endpoints
-| Endpoint | Method | Required By | Response |
-|---|---|---|---|
-| `/ws` | WebSocket | OpenEnv framework | Handles `reset`, `step`, `state` messages |
-| `/tasks` | GET | Hackathon | Task list with IDs, difficulties, MLTrainingAction JSON schema |
-| `/grader` | POST | Hackathon | `{"score": float, "task_id": str, "steps": int}` for last completed episode |
-| `/baseline` | POST | Hackathon | Triggers baseline run, returns `{"scores": {"task_001": float, ...}}` |
-| `/health` | GET | Hackathon | `{"status": "ready", "tasks": N}` — N is active task count |
-| `/dashboard` | GET | Bonus | Live diagnostic dashboard (HTML/JS, Plotly.js via CDN) |
-| `/validation-report` | GET | Bonus | Pre-computed PyTorch fidelity reports |
-Framework auto-provides: `POST /reset`, `POST /step`, `GET /state`, `GET /schema`, `GET /docs`, `/mcp`.
-### 4.10 Error Handling
-`step()` must never raise an unhandled exception. All invalid actions return a valid observation with -0.05 penalty and an error note. See spec Section 16 for the complete error handling matrix covering all edge cases (invalid actions, malformed JSON, step before reset, etc.).
----
-## 5. Non-Functional Requirements
-### 5.1 OpenEnv Spec Compliance
-| Requirement | Implementation |
-|---|---|
-| `openenv.yaml` present | Name, version, description, framework, tags, observation/action space, tasks with IDs+difficulties+max_steps, reward config, endpoints |
-| Typed Pydantic models | `MLTrainingAction` extends `Action`, `MLTrainingObservation` extends `Observation`, all fields explicitly typed |
-| `step()`/`reset()`/`state()` | Implemented in `MLTrainingEnvironment` extending `Environment` from `openenv.core.env_server.interfaces` |
-| `openenv validate` passes | Tested before every submission |
-### 5.2 Framework Integration
-| Requirement | Implementation |
-|---|---|
-| `openenv-core` v0.2.2 | `create_app()` returns standard FastAPI instance — **verified** |
-| Custom routes compose | `/tasks`, `/grader`, `/baseline`, `/health` added via `@app.get()`/`@app.post()` on the returned FastAPI app |
-| Framework-provided routes | `/reset`, `/step`, `/state`, `/ws`, `/schema`, `/docs`, `/mcp` — do not reimplement |
-| Factory pattern | `create_app(MLTrainingEnvironment, ...)` takes the class, not an instance |
-| Concurrent sessions | `SUPPORTS_CONCURRENT_SESSIONS = True`, session state keyed by session ID |
-| Typed client | `client.py` extends `EnvClient` with typed action/observation — used by baseline scripts |
-### 5.3 Docker & Deployment
-| Requirement | Target |
-|---|---|
-| Base image | `python:3.12-slim` |
-| PyTorch | CPU-only wheel (`--index-url https://download.pytorch.org/whl/cpu`), ~150MB |
-| Total image size | <500MB |
-| Build time | <5 min (no real training during build; validation reports pre-computed) |
-| HF Spaces | Tagged with `openenv`, port 7860 |
-| Health check | `/health` returns `{"status": "ready", "tasks": N}` within 60s of container start |
-### 5.4 Reproducibility
-| Requirement | Implementation |
-|---|---|
-| Deterministic episodes | `torch.manual_seed(seed)` at every `reset()`, seed derived deterministically from task ID |
-| Baseline bit-exact | Rule-based baseline produces identical scores on two consecutive runs |
-| Exploit resistance | Parameters randomized per `reset()` from defined ranges; opaque task IDs |
-| Grader determinism | Same `EpisodeState` always produces same score |
-### 5.5 Performance
-| Requirement | Target |
-|---|---|
-| `reset()` latency | <200ms (model instantiation + 2 forward passes + parametric curves) |
-| `step()` latency | <10ms (action dispatch + reward + state update) |
-| Memory | <512MB RSS (small CNN ~50K params, no GPU, no large datasets) |
-### 5.6 Code Quality
-| Requirement | Standard |
-|---|---|
-| Formatting | black (line length 88) |
-| Linting | ruff |
-| Import ordering | isort (profile=black) |
-| Type hints | Every function signature and return type |
-| Tests | pytest, >80% coverage, every module has corresponding test file |
-| PyTorch-native | All core computation uses `torch.Tensor`, zero numpy in core modules |
----
-## 6. Prioritized Scope
-### Tier 1: MVP (Must Ship First)
-**Deadline within deadline:** Deploy to HF Spaces by Day 6 (April 2). Everything after is additive.
-| Deliverable | Description | DQ Risk if Missing |
-|---|---|---|
-| Task 1 (`task_001`) | Exploding gradients — easy | Yes (need 3+ tasks) |
-| Task 3 (`task_003`) | Silent data leakage — medium | Yes (need 3+ tasks) |
-| Task 5 (`task_005`) | BatchNorm eval mode — hard | Yes (need easy→hard range) |
-| Context-gated penalty | -0.20 for `add_callback` after `gradients_were_normal` | No (but kills differentiation) |
-| Rule-based baseline | `baseline_heuristic.py`, deterministic, no API key | Yes (baseline required) |
-| Reward engine | All 7 reward components implemented exactly | Yes (reward logic required) |
-| Graders (3) | One per MVP task, 0.0-1.0, deterministic | Yes (graders required) |
-| `openenv.yaml` | Full metadata, 3+ tasks listed | Yes (spec compliance) |
-| Required endpoints | `/tasks`, `/grader`, `/baseline`, `/health` | Yes (auto-validator checks) |
-| Dockerfile | Builds and runs, port 7860 | Yes (auto-validator checks) |
-| HF Space | Deployed, tagged `openenv`, responds to `reset()` | Yes (auto-validator pings) |
-| README | Environment description, action/observation spaces, task descriptions, setup instructions, baseline scores | Yes (submission requirement) |
-### Tier 2: Strongest Differentiator (Add Immediately After MVP)
-| Deliverable | Description | Why This Order |
-|---|---|---|
-| Task 6 (`task_006`) | PyTorch code bug — hard, code-level debugging | Single highest-impact feature for Meta judges |
-| Code fix validation | Multi-strategy pipeline (tokenize, AST, semantic patterns) | Required for Task 6 to work with LLM agents |
-| Grader for Task 6 | `code_bug` diagnosis, code fix scoring | Completes Task 6 |
-### Tier 3: Full Task Coverage (Time Permitting)
-| Deliverable | Description |
-|---|---|
-| Task 2 (`task_002`) | Vanishing gradients — easy (similar to Task 1, fast to implement) |
-| Task 4 (`task_004`) | Overfitting — medium (train-val divergence, regularization fix) |
-| Graders for Tasks 2 & 4 | Same pattern as existing graders |
-### Tier 4: Polish & Extras (Only After Tiers 1-3 Complete)
-| Deliverable | Description | Priority Within Tier |
-|---|---|---|
-| Live dashboard | HTML/JS at `/dashboard`, Plotly.js via CDN, 4-panel layout | 1st — transforms judging experience |
-| PyTorch validation suite | 6 scripts proving parametric curves match real training, R² > 0.85 | 2nd — answers "how realistic?" |
-| Validation report endpoint | `GET /validation-report` serving pre-computed fidelity plots | With validation suite |
-| LLM baseline | `baseline_inference.py`, GPT-4o, measures heuristic-LLM gap | 3rd — supplementary demonstration |
-### Implementation Timeline (11 days: March 28 - April 8)
-| Days | Focus | Exit Criteria |
-|---|---|---|
-| 1-2 | Skeleton server + Task 1 end-to-end | `reset()` → `step()` → `grader` works for one task, Docker builds |
-| 3-5 | Tasks 3 & 5 + reward engine + baseline | All 3 MVP tasks pass grader, `baseline_heuristic.py` reproduces |
-| 6 | **Deploy MVP to HF Spaces** | Auto-validation passes. This is the insurance policy. |
-| 7-8 | Task 6 (code debugging) | Code fix validation works for all 4 bug variants |
-| 9-10 | Tasks 2 & 4 + dashboard | Full 6-task environment, dashboard shows agent behavior |
-| 11 | Polish, README, final smoke test | Submission-ready |
-### What We Will NOT Build (Explicit Exclusions)
-- No game or toy environments
-- No numpy in core modules (torch.Tensor only)
-- No free-text diagnosis (closed enum only)
-- No grader that sums step rewards (holistic evaluation only)
-- No cumulative step penalty (flat -0.01 only, never -0.01 * step_count)
-- No accommodation support or non-RL features
-- No multi-GPU or CUDA dependencies (CPU-only PyTorch)

docs/PROJECT_GUIDE.md DELETED Viewed

@@ -1,691 +0,0 @@
-# PyTorch Training Run Debugger — Complete Project Guide
-## What Is This?
-A game where an AI agent plays detective to fix broken PyTorch training runs. The agent sees a failing training run, investigates clues (gradients, data, code), applies a fix, and submits a diagnosis. Built as an [OpenEnv](https://github.com/openenv) RL environment for the **Meta PyTorch OpenEnv Hackathon**.
----
-## How a Game Works
-```
-1. Agent receives a broken training run (loss curves, config, error log)
-2. Agent investigates (inspect gradients, data, weights, model modes, code)
-3. Agent applies a fix (reduce LR, patch data, fix code, etc.)
-4. Agent restarts training and confirms recovery
-5. Agent submits diagnosis ("the problem was lr_too_high")
-6. Grader scores the agent 0.0 to 1.0
-```
----
-## The 7 Tasks
-| Task | Problem | Difficulty | Root Cause | Key Clue |
-|------|---------|-----------|------------|----------|
-| `task_001` | Gradients explode | Easy | `lr_too_high` | All layers `is_exploding: true` |
-| `task_002` | Gradients vanish | Easy | `vanishing_gradients` | Deep layers `is_vanishing: true` |
-| `task_003` | Test data leaked into training | Medium | `data_leakage` | `class_overlap_score > 0.5` |
-| `task_004` | Model memorizes, doesn't learn | Medium | `overfitting` | Train loss drops, val loss rises |
-| `task_005` | BatchNorm stuck in eval mode | Hard | `batchnorm_eval_mode` | Model modes show "eval" + red herrings |
-| `task_006` | Bug in Python training code | Hard | `code_bug` | Bug visible in code snippet |
-| `task_007` | LR scheduler decays too fast | Medium-Hard | `scheduler_misconfigured` | Early progress then stagnation |
----
-## Reward System
-Every action earns or costs points (capped at -1.0 to 1.0):
-| Event | Reward | When |
-|-------|--------|------|
-| Any step taken | **-0.01** | Always (encourages efficiency) |
-| First-time inspection | **+0.05** | Once per inspection type |
-| Correct diagnosis | **+0.50** | Diagnosis matches root cause |
-| Wrong diagnosis | **-0.30** | Diagnosis doesn't match |
-| Fix works + training recovers | **+0.40** | After fix + restart + convergence |
-| Invalid action | **-0.05** | Action not available |
-| Wrong code fix | **-0.10** | `fix_code` with wrong line/replacement |
-| **Context-gated penalty** | **-0.20** | Inspected gradients, saw they're normal, then added gradient clipping anyway |
-### The Context-Gated Penalty (Core Innovation)
-- Agent checks gradients -> finds them **normal** -> adds gradient clipping = **-0.20 penalty** (ignoring evidence)
-- Agent adds gradient clipping **before** checking gradients = **no penalty** (reasonable prior)
-This teaches: *don't ignore what you've already learned*.
----
-## Architecture
-```
-ml_training_debugger/          # Core logic
-  models.py                    # All data types (Pydantic)
-  scenarios.py                 # Creates the 7 tasks with random params
-  pytorch_engine.py            # Real PyTorch model + fault injection
-  simulation.py                # Loss/accuracy curve generation
-  reward_engine.py             # Per-step reward calculation
-  graders.py                   # Final 0.0-1.0 scoring per task
-  code_templates.py            # Buggy code for Task 6
-  client.py                    # Client for connecting to the environment
-server/                        # Web server
-  app.py                       # FastAPI + all endpoints
-  environment.py               # Game logic (reset, step, state)
-tests/                         # 183 tests, 97% coverage
-baseline_heuristic.py          # Rule-based agent (deterministic)
-baseline_inference.py          # LLM agent (Llama/GPT-4o)
-```
----
-## API Endpoints
-### GET /health
-Server status check.
-**Response:**
-```json
-{
-  "status": "ready",
-  "tasks": 7
-}
-```
----
-### GET /tasks
-List all available tasks with action schema.
-**Response:**
-```json
-[
-  {
-    "id": "task_001",
-    "difficulty": "easy",
-    "max_steps": 20,
-    "action_schema": {
-      "title": "MLTrainingAction",
-      "type": "object",
-      "properties": {
-        "action_type": { "type": "string" },
-        "target": { "type": ["string", "null"] },
-        "value": { "type": ["number", "integer", "string", "null"] },
-        "diagnosis": { "type": ["string", "null"] },
-        "line": { "type": ["integer", "null"] },
-        "replacement": { "type": ["string", "null"] }
-      },
-      "required": ["action_type"]
-    }
-  }
-]
-```
----
-### POST /baseline
-Run the heuristic baseline agent on all 7 tasks.
-**Response:**
-```json
-{
-  "scores": {
-    "task_001": 1.00,
-    "task_002": 1.00,
-    "task_003": 1.00,
-    "task_004": 0.45,
-    "task_005": 1.00,
-    "task_006": 1.00,
-    "task_007": 1.00
-  }
-}
-```
-Returns `409` if baseline is already running.
----
-### POST /grader
-Get the grader score for the last completed episode.
-**Query params:** `session_id` (optional)
-**Response:**
-```json
-{
-  "score": 0.85,
-  "task_id": "task_001",
-  "steps": 5
-}
-```
-If no episode completed:
-```json
-{
-  "score": null,
-  "error": "no_completed_episode"
-}
-```
----
-### GET /dashboard
-Live diagnostic dashboard (HTML page with Plotly.js charts). Open in a browser.
-**Panels:**
-1. Training metrics (loss/accuracy curves)
-2. Gradient & weight heatmap
-3. Action timeline with rewards
-4. Episode summary with state flags
----
-### GET /validation-report
-Pre-computed fidelity report comparing parametric curves to real PyTorch training runs.
----
-### GET /curriculum
-Recommended task order for progressive training (easy to hard, 3 difficulty levels each).
-**Response:**
-```json
-{
-  "curriculum": [
-    { "task_id": "task_001", "difficulty": "easy", "difficulty_level": 1, "max_steps": 20 },
-    { "task_id": "task_001", "difficulty": "easy", "difficulty_level": 3, "max_steps": 20 },
-    { "task_id": "task_001", "difficulty": "easy", "difficulty_level": 5, "max_steps": 20 }
-  ],
-  "total_episodes": 21
-}
-```
----
-### GET /leaderboard
-Sorted episode scores from baseline runs.
-**Response:**
-```json
-{
-  "entries": [
-    { "score": 1.00, "task_id": "task_001", "steps": 5, "episode_id": "baseline_task_001" }
-  ],
-  "total": 7
-}
-```
----
-### GET /replay/{episode_id}
-Full action/observation trace for a completed episode.
-**Response:**
-```json
-{
-  "episode_id": "baseline_task_001",
-  "score": 1.00,
-  "task_id": "task_001",
-  "steps": 5
-}
-```
----
-## WebSocket Interface (Primary Agent Interface)
-**Endpoint:** `ws://localhost:7860/ws`
-This is the main way agents interact with the environment. HTTP endpoints are stateless — WebSocket maintains session state across a full episode.
-### Reset (Start New Episode)
-**Send:**
-```json
-{
-  "type": "reset",
-  "seed": 42,
-  "kwargs": {
-    "task_id": "task_003",
-    "difficulty_level": 3
-  }
-}
-```
-Without `kwargs`, defaults to `task_001`.
-**Receive:**
-```json
-{
-  "type": "observation",
-  "observation": {
-    "run_id": "ep_12345",
-    "framework": "pytorch",
-    "epoch": 20,
-    "training_loss_history": [2.3, 2.1, 1.9, ...],
-    "val_loss_history": [2.4, 2.2, 2.0, ...],
-    "val_accuracy_history": [0.3, 0.35, 0.4, ...],
-    "gradient_stats": [],
-    "model_weight_stats": null,
-    "data_batch_stats": null,
-    "model_mode_info": null,
-    "code_snippet": null,
-    "current_config": {
-      "learning_rate": 0.001,
-      "weight_decay": 0.0001,
-      "batch_size": 64,
-      "hidden_dim": 64,
-      "num_layers": 3,
-      "optimizer": "adam",
-      "dropout_rate": 0.0,
-      "gradient_clip_norm": null
-    },
-    "error_log": null,
-    "gpu_memory_used_gb": 6.2,
-    "gpu_memory_total_gb": 16.0,
-    "available_actions": [
-      "inspect_gradients",
-      "inspect_data_batch",
-      "inspect_model_modes",
-      "inspect_model_weights",
-      "inspect_code",
-      "modify_config",
-      "add_callback",
-      "replace_optimizer",
-      "patch_data_loader",
-      "fix_model_mode",
-      "mark_diagnosed"
-    ],
-    "episode_state": {
-      "step_count": 0,
-      "gradients_inspected": false,
-      "gradients_were_normal": false,
-      "data_inspected": false,
-      "model_modes_inspected": false,
-      "model_weights_inspected": false,
-      "code_inspected": false,
-      "fix_action_taken": false,
-      "restart_after_fix": false,
-      "diagnosis_submitted": false,
-      "actions_taken": []
-    },
-    "notes": null,
-    "done": false,
-    "reward": null,
-    "metadata": {}
-  }
-}
-```
-### Step (Take an Action)
-**Investigation actions** (no extra fields needed):
-```json
-{"type": "step", "action": {"action_type": "inspect_gradients"}}
-{"type": "step", "action": {"action_type": "inspect_data_batch"}}
-{"type": "step", "action": {"action_type": "inspect_model_modes"}}
-{"type": "step", "action": {"action_type": "inspect_model_weights"}}
-{"type": "step", "action": {"action_type": "inspect_code"}}
-```
-**Fix actions:**
-```json
-{"type": "step", "action": {"action_type": "modify_config", "target": "learning_rate", "value": 0.001}}
-{"type": "step", "action": {"action_type": "add_callback"}}
-{"type": "step", "action": {"action_type": "replace_optimizer"}}
-{"type": "step", "action": {"action_type": "patch_data_loader"}}
-{"type": "step", "action": {"action_type": "fix_model_mode"}}
-{"type": "step", "action": {"action_type": "fix_code", "line": 5, "replacement": "model.train()"}}
-```
-**Terminal actions:**
-```json
-{"type": "step", "action": {"action_type": "restart_run"}}
-{"type": "step", "action": {"action_type": "mark_diagnosed", "diagnosis": "lr_too_high"}}
-```
-**Receive (after each step):**
-```json
-{
-  "type": "observation",
-  "observation": {
-    "...same structure as reset response...",
-    "gradient_stats": [
-      {
-        "layer_name": "conv1",
-        "norm_history": [0.5, 0.6, 0.7],
-        "mean_norm": 51.1,
-        "max_norm": 98.3,
-        "is_exploding": true,
-        "is_vanishing": false
-      }
-    ],
-    "episode_state": {
-      "step_count": 1,
-      "gradients_inspected": true,
-      "actions_taken": ["inspect_gradients"]
-    },
-    "done": false,
-    "reward": 0.04
-  }
-}
-```
-When `done: true`, the episode is over.
----
-## All 14 Action Types
-| Action | Required Fields | Description |
-|--------|----------------|-------------|
-| `inspect_gradients` | none | View per-layer gradient stats |
-| `inspect_data_batch` | none | View data batch statistics |
-| `inspect_model_modes` | none | View train/eval mode per layer |
-| `inspect_model_weights` | none | View per-layer weight stats |
-| `inspect_code` | none | View source code (Task 6) |
-| `modify_config` | `target`, `value` | Change a hyperparameter |
-| `add_callback` | none | Add gradient clipping callback |
-| `replace_optimizer` | none | Switch optimizer |
-| `patch_data_loader` | none | Fix data pipeline |
-| `fix_model_mode` | none | Switch model to train mode |
-| `fix_code` | `line`, `replacement` | Fix a line of code |
-| `restart_run` | none | Restart training (requires fix first) |
-| `mark_diagnosed` | `diagnosis` | Submit final diagnosis |
-| `rollback_checkpoint` | none | Rollback to checkpoint |
-### Valid `target` values for modify_config
-`learning_rate`, `weight_decay`, `batch_size`, `hidden_dim`, `num_layers`, `optimizer`, `dropout_rate`, `gradient_clip_norm`
-### Valid `diagnosis` values for mark_diagnosed
-`lr_too_high`, `vanishing_gradients`, `data_leakage`, `overfitting`, `batchnorm_eval_mode`, `code_bug`, `scheduler_misconfigured`
----
-## Dynamic Action Availability
-Actions appear/disappear based on episode state:
-| Action | Available When |
-|--------|---------------|
-| `fix_code` | Only after `inspect_code` (code_inspected = true) |
-| `restart_run` | Only after a fix action (fix_action_taken = true) |
-| `rollback_checkpoint` | Only after restart (restart_after_fix = true) |
-| `mark_diagnosed` | Only while diagnosis_submitted = false |
----
-## Observation Fields — Progressive Reveal
-On reset, the agent sees loss curves, config, and error log. Every other field is empty (`null`, or `[]` for `gradient_stats`) until the matching inspect action is taken:
-| Field | Starts As | Populated After |
-|-------|-----------|----------------|
-| `training_loss_history` | 20 floats | Always visible |
-| `val_accuracy_history` | 20 floats | Always visible |
-| `val_loss_history` | 20 floats | Always visible |
-| `current_config` | Full config | Always visible |
-| `error_log` | String or null | Always visible |
-| `gradient_stats` | `[]` | `inspect_gradients` |
-| `model_weight_stats` | `null` | `inspect_model_weights` |
-| `data_batch_stats` | `null` | `inspect_data_batch` |
-| `model_mode_info` | `null` | `inspect_model_modes` |
-| `code_snippet` | `null` | `inspect_code` |
----
-## Data Types
-### GradientStats (per layer)
-```json
-{
-  "layer_name": "conv1",
-  "norm_history": [0.5, 0.6, 0.7],
-  "mean_norm": 12.5,
-  "max_norm": 25.3,
-  "is_exploding": true,
-  "is_vanishing": false
-}
-```
-- Exploding: `mean_norm > 10.0`
-- Vanishing: `mean_norm < 0.000001`
-### ModelWeightStats (per layer)
-```json
-{
-  "layer_name": "conv1",
-  "weight_norm": 1.234,
-  "weight_mean": 0.001,
-  "weight_std": 0.05,
-  "weight_min": -0.15,
-  "weight_max": 0.16,
-  "dead_neuron_pct": 0.0,
-  "has_nan": false,
-  "has_inf": false
-}
-```
-### DataBatchStats
-```json
-{
-  "label_distribution": {"0": 0.25, "1": 0.25, "2": 0.25, "3": 0.25},
-  "feature_mean": 0.5,
-  "feature_std": 0.2,
-  "null_count": 0,
-  "class_overlap_score": 0.15,
-  "batch_size": 64,
-  "duplicate_ratio": 0.0,
-  "confusion_matrix": [[10, 2, 1], [1, 9, 3], [2, 1, 11]]
-}
-```
-### CodeSnippet (Task 6 only)
-```json
-{
-  "code": "import torch\nimport torch.nn as nn\n...",
-  "filename": "train.py",
-  "line_count": 50,
-  "imports": ["torch", "torch.nn", "torch.optim"],
-  "hint": "Look for .detach() preventing gradient flow"
-}
-```
-### EpisodeState
-```json
-{
-  "step_count": 0,
-  "gradients_inspected": false,
-  "gradients_were_normal": false,
-  "data_inspected": false,
-  "model_modes_inspected": false,
-  "model_weights_inspected": false,
-  "code_inspected": false,
-  "fix_action_taken": false,
-  "restart_after_fix": false,
-  "diagnosis_submitted": false,
-  "actions_taken": []
-}
-```
----
-## Grading Breakdown (per task)
-Each task has its own grader that scores 0.0 to 1.0 based on what the agent did:
-### Task 1 — Exploding Gradients
-| Component | Points |
-|-----------|--------|
-| Inspected gradients | +0.05 |
-| Applied config fix | +0.20 |
-| Restarted training | +0.35 |
-| Correct diagnosis (`lr_too_high`) | +0.40 |
-### Task 2 — Vanishing Gradients
-| Component | Points |
-|-----------|--------|
-| Inspected gradients | +0.05 |
-| Applied config fix | +0.20 |
-| Restarted training | +0.35 |
-| Correct diagnosis (`vanishing_gradients`) | +0.40 |
-### Task 3 — Data Leakage
-| Component | Points |
-|-----------|--------|
-| Inspected data | +0.05 |
-| Patched data loader | +0.30 |
-| Restarted training | +0.30 |
-| Correct diagnosis (`data_leakage`) | +0.35 |
-### Task 4 — Overfitting
-| Component | Points |
-|-----------|--------|
-| Inspected data | +0.05 |
-| Applied fix (config or callback) | +0.25 |
-| Restarted training | +0.30 |
-| Correct diagnosis (`overfitting`) | +0.40 |
-### Task 5 — BatchNorm Eval Mode (with red herrings)
-| Component | Points |
-|-----------|--------|
-| Inspected gradients | +0.05 |
-| Inspected model modes | +0.05 |
-| **Fell for red herring** (add_callback after normal gradients) | **-0.20** |
-| Fixed model mode | +0.25 |
-| Restarted training | +0.30 |
-| Correct diagnosis (`batchnorm_eval_mode`) | +0.40 |
-### Task 6 — Code Bug
-| Component | Points |
-|-----------|--------|
-| Inspected code | +0.05 |
-| Fixed code correctly | +0.30 |
-| Restarted training | +0.25 |
-| Correct diagnosis (`code_bug`) | +0.40 |
-### Task 7 — Scheduler Misconfigured
-| Component | Points |
-|-----------|--------|
-| Inspected gradients | +0.05 |
-| Inspected data | +0.05 |
-| Applied config fix | +0.25 |
-| Restarted training | +0.25 |
-| Correct diagnosis (`scheduler_misconfigured`) | +0.40 |
----
-## Baseline Scores
-| Task | Heuristic | Llama 3.3 70B | Llama 3.1 8B |
-|------|-----------|---------------|--------------|
-| task_001 | **1.00** | 1.00 | 0.60 |
-| task_002 | **1.00** | 1.00 | 0.05 |
-| task_003 | **1.00** | 0.40 | 0.40 |
-| task_004 | 0.45 | 0.45 | **0.60** |
-| task_005 | **1.00** | 1.00 | 1.00 |
-| task_006 | **1.00** | — | 0.60-1.00 |
-| task_007 | **1.00** | — | 0.60 |
-| **Average** | **0.92** | ~0.69 | 0.55 |
----
-## Walkthrough: Solving Task 1 (Exploding Gradients)
-```
-Step 1: Reset
-  Send:    {"type": "reset", "kwargs": {"task_id": "task_001"}}
-  See:     Loss history going to infinity, error_log says "NaN at epoch 12"
-Step 2: Inspect gradients
-  Send:    {"type": "step", "action": {"action_type": "inspect_gradients"}}
-  See:     All layers is_exploding: true, mean_norm > 10.0
-  Reward:  +0.04 (-0.01 step + 0.05 investigation)
-Step 3: Reduce learning rate
-  Send:    {"type": "step", "action": {"action_type": "modify_config", "target": "learning_rate", "value": 0.001}}
-  Reward:  -0.01 (step penalty)
-Step 4: Restart training
-  Send:    {"type": "step", "action": {"action_type": "restart_run"}}
-  See:     Convergence detected!
-  Reward:  +0.39 (-0.01 step + 0.40 convergence)
-Step 5: Submit diagnosis
-  Send:    {"type": "step", "action": {"action_type": "mark_diagnosed", "diagnosis": "lr_too_high"}}
-  See:     done: true
-  Reward:  +0.49 (-0.01 step + 0.50 correct diagnosis)
-Grader score: 1.0 (perfect)
-```
----
-## Walkthrough: Task 5 Trap (Red Herring)
-```
-Step 1: Reset task_005
-Step 2: Inspect gradients
-  -> FC layer has a spike (mean_norm=4.2, but is_exploding: false)
-  -> gradients_were_normal is set to TRUE (nothing actually exploding)
-Step 3 (BAD): Add gradient clipping
-  -> Reward: -0.21 (-0.01 step - 0.20 context-gated penalty!)
-  -> Agent IGNORED the evidence that gradients were normal
-Step 3 (GOOD): Inspect model modes instead
-  -> Sees all layers in "eval" mode — that's the real problem!
-Step 4: Fix model mode
-Step 5: Restart training
-Step 6: Diagnose batchnorm_eval_mode -> correct!
-```
----
-## Quick Start
-```bash
-# Setup
-python3 -m venv .venv && source .venv/bin/activate
-pip install torch --index-url https://download.pytorch.org/whl/cpu
-pip install -r requirements.txt
-pip install pytest pytest-cov
-# Run server
-uvicorn server.app:app --host 0.0.0.0 --port 7860
-# Test
-pytest tests/ -v --cov=ml_training_debugger
-curl http://localhost:7860/health
-curl http://localhost:7860/tasks | python3 -m json.tool
-curl -X POST http://localhost:7860/baseline | python3 -m json.tool
-# Docker
-docker build -t pytorch-debugger .
-docker run -p 7860:7860 pytorch-debugger
-```
----
-## Tech Stack
-| Component | Purpose |
-|-----------|---------|
-| Python 3.12 | Runtime |
-| PyTorch (CPU-only) | Real neural networks, real gradients |
-| FastAPI | Web server |
-| OpenEnv | RL environment framework (step/reset/state API) |
-| Pydantic v2 | Typed data models |
-| Plotly.js | Dashboard charts |
-| Docker | Containerized deployment |

docs/ROADMAP.md DELETED Viewed

@@ -1,441 +0,0 @@
-# ROADMAP — PyTorch Training Run Debugger
-**Timeline:** March 28 - April 8, 2026 (11 days)
-**Runtime:** Python 3.12 · PyTorch CPU-only · openenv-core v0.2.2
-**Governing documents:** `ml-training-debugger-spec.md` (source of truth), `PRD.md` (requirements), `CLAUDE.md` (coding rules)
-**Iron rule:** No phase begins until the previous phase's acceptance criteria are met. The single exception: Phase 0 and Phase 1 file creation can overlap on Day 1.
----
-## Phase 0: Setup & Validation (Days 1-2)
-**Goal:** A running skeleton server that proves the toolchain works end-to-end. Zero business logic — just plumbing.
-### 0.1 Files to Create
-| File | Purpose | Lines (est.) |
-|---|---|---|
-| `ML Debugger/` (this directory) | Project root directory (git init here) | — |
-| `pyproject.toml` | Project metadata, dependencies (torch CPU, openenv-core, pydantic>=2.0, fastapi, uvicorn, pytest, black, ruff, isort) | ~40 |
-| `requirements.txt` | Flat dependency list mirroring pyproject.toml (Docker uses this). **Exclude openai** — deferred to Phase 3. | ~10 |
-| `.python-version` | `3.12` | 1 |
-| `openenv.yaml` | Full metadata — start with 3 MVP tasks (task_001, task_003, task_005), expand later | ~50 |
-| `Dockerfile` | `python:3.12-slim`, torch CPU-only, openenv-core, app deps, port 7860 | ~15 |
-| `.dockerignore` | Exclude `.venv/`, `__pycache__/`, `.git/`, `validation/reports/*.png` | ~10 |
-| `.gitignore` | `.venv/`, `__pycache__/`, `*.pyc`, `.env`, `run*.json` | ~15 |
-| `ml_training_debugger/__init__.py` | Package init, version string | ~3 |
-| `ml_training_debugger/models.py` | **Stub only:** `RootCauseDiagnosis` enum, `EpisodeState`, `TrainingConfig`, `GradientStats`, `DataBatchStats`, `ModelWeightStats`, `CodeSnippet`, `MLTrainingObservation` (extends `Observation`), `MLTrainingAction` (extends `Action`). All fields typed, all values defaulted. | ~200 |
-| `ml_training_debugger/client.py` | **Stub:** `MLTrainingEnvClient` extending `EnvClient` with `action_type = MLTrainingAction` and `observation_type = MLTrainingObservation`. Used by baseline scripts. | ~20 |
-| `server/__init__.py` | Empty | 0 |
-| `server/environment.py` | **Stub:** `MLTrainingEnvironment(Environment)` with `reset()` returning a hardcoded observation, `step()` echoing back, `state` property | ~50 |
-| `server/app.py` | `create_app(MLTrainingEnvironment, MLTrainingAction, MLTrainingObservation)` + stub routes for `/tasks`, `/grader`, `/baseline`, `/health` | ~60 |
-| `tests/__init__.py` | Empty | 0 |
-| `tests/test_models.py` | Validate all Pydantic models instantiate, serialize to JSON, and round-trip | ~60 |
-| `tests/conftest.py` | Shared fixtures: sample `EpisodeState`, sample `ScenarioParams`, sample observation | ~40 |
-### 0.2 Dependencies to Install
-```bash
-# Create venv inside ML Debugger/ project root
-python3 -m venv .venv && source .venv/bin/activate
-# Core runtime
-pip install torch --index-url https://download.pytorch.org/whl/cpu
-pip install openenv-core "pydantic>=2.0" fastapi uvicorn
-# Dev tools
-pip install pytest pytest-cov pytest-asyncio black ruff isort httpx websockets
-# NOTE: openai is deferred to Phase 3 (LLM baseline). Do NOT install now.
-```
-### 0.3 Validation Steps (Must All Pass)
-| # | Command | Expected Result |
-|---|---|---|
-| 1 | `python -c "import torch; print(torch.__version__)"` | Version string, no CUDA |
-| 2 | `python -c "from openenv.core.env_server.http_server import create_app"` | No import error |
-| 3 | `python -c "from ml_training_debugger.models import MLTrainingAction, MLTrainingObservation"` | No import error |
-| 4 | `python -c "from ml_training_debugger.client import MLTrainingEnvClient"` | No import error |
-| 5 | `uvicorn server.app:app --host 0.0.0.0 --port 7860` | Server starts, no crash |
-| 6 | `curl http://localhost:7860/health` | `{"status": "ready", "tasks": 3}` |
-| 7 | `curl http://localhost:7860/tasks` | JSON with task list |
-| 8 | `curl http://localhost:7860/docs` | Swagger UI loads |
-| 9 | `pytest tests/test_models.py -v` | All pass |
-| 10 | `docker build -t pytorch-debugger .` | Builds in <5min, image <500MB |
-| 11 | `docker run -p 7860:7860 pytorch-debugger` then `curl /health` | Returns `{"status": "ready", "tasks": 3}` |
-| 12 | `openenv validate` | Passes (or identify what needs fixing) |
-| 13 | `black --check . && ruff check . && isort --check .` | Clean |
-### 0.4 Acceptance Criteria
-- [ ] Skeleton server starts on port 7860 and responds to `/health`, `/tasks`, `/docs`, `/ws`
-- [ ] `/health` returns `{"status": "ready", "tasks": 3}` (task count matches active tasks)
-- [ ] All Pydantic models instantiate without error and serialize to valid JSON
-- [ ] `client.py` imports without error
-- [ ] Docker image builds under 500MB and container starts cleanly
-- [ ] `openenv validate` passes or all failures are documented with a fix plan
-- [ ] `pytest` runs with zero failures
-- [ ] Git repo initialized, first commit made
----
-## Phase 1: MVP — Tasks 1, 3, 5 + Core Engine (Days 2-6)
-**Goal:** A fully functional 3-task environment that passes all auto-validation gates, deployed to HF Spaces. This is the survival milestone — everything after this is differentiation.
-### 1.1 Files to Create
-| File | Purpose | Lines (est.) | Depends On |
-|---|---|---|---|
-| `ml_training_debugger/scenarios.py` | `ScenarioParams` dataclass, `sample_scenario(task_id, seed)` for tasks 001/003/005. Parameter ranges from spec Section 11. | ~120 | `models.py` |
-| `ml_training_debugger/pytorch_engine.py` | `SimpleCNN(torch.nn.Module)`, `inject_fault(model, scenario)`, `extract_gradient_stats(model)`, `extract_weight_stats(model)`. Real torch.autograd. | ~250 | `scenarios.py` |
-| `ml_training_debugger/simulation.py` | `gen_loss_history(scenario)`, `gen_val_accuracy_history(scenario)`, `gen_val_loss_history(scenario)`. All `torch.Tensor` ops. Parametric curves per spec Section 6. | ~180 | `scenarios.py` |
-| `ml_training_debugger/reward_engine.py` | `compute_reward(action, episode_state, scenario) -> float`. All 7 reward components per spec Section 12. Context-gated penalty logic. | ~100 | `models.py` |
-| `ml_training_debugger/graders.py` | `grade_task_001(state, scenario)`, `grade_task_003(...)`, `grade_task_005(...)`. Each returns float in [0.0, 1.0]. Per spec Section 11 grader breakdowns. | ~150 | `models.py` |
-| `baseline_heuristic.py` | Deterministic decision tree agent using `MLTrainingEnvClient`. Runs all MVP tasks, prints JSON scores. | ~150 | `client.py`, server running |
-| `README.md` | Environment description, action/observation spaces, task descriptions with difficulty, setup instructions, baseline scores table | ~200 | Everything |
-### 1.2 Files to Edit
-| File | Changes | Why |
-|---|---|---|
-| `ml_training_debugger/models.py` | Finalize all field types, add `available_actions` computation logic to `EpisodeState`, add red herring fields (notes, gpu_memory) | Stubs from Phase 0 become real |
-| `ml_training_debugger/client.py` | Wire typed client to connect via WebSocket or HTTP as needed by baseline | Stub becomes functional |
-| `server/environment.py` | Full `reset()` and `step()` implementations. See spec Sections 9, 13 for lifecycle. | Stubs become real |
-| `server/app.py` | Wire `/tasks`, `/grader`, `/baseline`, `/health` to return real data. `/health` returns `{"status": "ready", "tasks": 3}`. | Stubs become real |
-| `openenv.yaml` | Finalize observation_space, action_space, reward section. Verify task IDs and max_steps per spec Section 14. | Was skeletal in Phase 0 |
-| `Dockerfile` | Add `COPY` for all new source files. Verify build still works. | New files added |
-### 1.3 Tests to Create
-| Test File | What It Covers | Critical Assertions |
-|---|---|---|
-| `tests/test_scenarios.py` | `sample_scenario()` for each MVP task | Returns correct root cause enum; params within defined ranges; different seeds produce different params |
-| `tests/test_pytorch_engine.py` | Model instantiation, fault injection, gradient/weight extraction | `SimpleCNN` is a real `torch.nn.Module`; `extract_gradient_stats` returns `GradientStats` with real float norms; exploding fault produces `is_exploding=True`; batchnorm eval fault produces `model.training==False` |
-| `tests/test_simulation.py` | Parametric curve generators | All outputs are `list[float]` of length 20; exploding LR produces diverging loss; leakage produces inflated val_acc; batchnorm produces slow val_acc degradation |
-| `tests/test_reward_engine.py` | All 7 reward components | **Critical:** context-gated penalty fires when `gradients_inspected=True AND gradients_were_normal=True` then `add_callback`; does NOT fire when `add_callback` without prior inspection; step penalty is flat -0.01; investigation bonus is +0.05 first-time only |
-| `tests/test_graders.py` | Graders for tasks 001, 003, 005 | Each returns float in [0.0, 1.0]; correct diagnosis + fix + restart = 1.0; wrong diagnosis < 0.5; partial completion scores between 0 and 1 |
-| `tests/test_episode_lifecycle.py` | Full reset→inspect→fix→restart→diagnose flow | State transitions match spec Section 13; `available_actions` updates correctly; `done=True` after `mark_diagnosed`; step limit triggers `done=True` |
-### 1.4 Task-Specific Implementation
-See spec Section 11 for complete task specifications. Key implementation notes per task:
-**Task 1 (`task_001`, easy):** Unambiguous signal. LR from spec ranges → real gradients explode → `is_exploding=True` on all layers. Straightforward grader.
-**Task 3 (`task_003`, medium):** Red herring note about architecture upgrade. Data leakage confirmed via `class_overlap_score`. Normal model (no gradient/weight anomaly). Mild gradient elevation on one layer (`is_exploding=False`).
-**Task 5 (`task_005`, hard):** The differentiator task. `gradients_were_normal=True` set inside `inspect_gradients` handler because `is_exploding=False` on ALL layers (FC spike mean_norm < 10.0). Context-gated penalty fires when agent then calls `add_callback`. Red herrings: FC spike, GPU 91%, conv1 near-vanishing, error_log warning.
-### 1.5 Endpoint Responses
-**`GET /health`:** `{"status": "ready", "tasks": 3}` (200) — or `{"status": "initializing"}` (503) during startup.
-**`GET /tasks`:** Task list with IDs, difficulties, max_steps, and MLTrainingAction JSON schema.
-**`POST /grader`:** `{"score": float, "task_id": str, "steps": int}` (200) — or `{"score": null, "error": "no_completed_episode"}` (200) if no episode. See spec Section 14 for edge cases.
-**`POST /baseline`:** Runs baseline logic internally, returns `{"scores": {"task_001": float, "task_003": float, "task_005": float}}`. Returns 409 if already running.
-### 1.6 Baseline Heuristic Decision Tree
-See spec Section 17 for the complete decision tree. Summary:
-```
-1. reset(task_id)
-2. inspect_gradients
-3. IF any layer is_exploding → fix LR → restart → diagnose lr_too_high
-4. IF any layer is_vanishing → fix LR → restart → diagnose vanishing_gradients
-5. inspect_data_batch
-6. IF class_overlap_score > 0.5 → patch_data_loader → restart → diagnose data_leakage
-7. IF val_loss diverging → modify weight_decay → restart → diagnose overfitting
-8. inspect_model_modes
-9. IF any layer in "eval" → fix_model_mode → restart → diagnose batchnorm_eval_mode
-10. inspect_code → attempt fix → restart → diagnose code_bug
-11. FALLBACK: diagnose overfitting
-```
-### 1.7 Deploy to HF Spaces
-| Step | Action | Verification |
-|---|---|---|
-| 1 | Create HF Space (Docker type), tag with `openenv` | Space page shows openenv tag |
-| 2 | Push Dockerfile + source to Space repo | Build triggers automatically |
-| 3 | Wait for build to complete | Build log shows success |
-| 4 | Test health endpoint | `curl https://<space-url>/health` returns `{"status": "ready", "tasks": 3}` |
-| 5 | Test reset via WebSocket | `wscat -c wss://<space-url>/ws` then send `{"type": "reset", "kwargs": {"task_id": "task_001"}}` |
-| 6 | Run `openenv validate` against deployed space | All checks pass |
-### 1.8 Acceptance Criteria
-- [ ] `reset(task_id)` for tasks 001, 003, 005 returns valid `MLTrainingObservation` with correct initial state
-- [ ] `step()` dispatches all 14 action types correctly (investigation, fix, terminal)
-- [ ] `inspect_gradients` on Task 1 → `is_exploding=True` on all layers (real torch.autograd)
-- [ ] `inspect_gradients` on Task 5 → `is_exploding=False` on all layers, `gradients_were_normal=True`
-- [ ] `inspect_data_batch` on Task 3 → `class_overlap_score > 0.5`
-- [ ] `inspect_model_modes` on Task 5 → all layers in "eval" mode
-- [ ] Context-gated penalty: `inspect_gradients`(normal) then `add_callback` → reward includes -0.20
-- [ ] Context-gated penalty: `add_callback` without prior inspection → NO -0.20 penalty
-- [ ] Grader for Task 1: correct path scores 1.0, wrong diagnosis scores < 0.5
-- [ ] Grader for Task 5: agent that chases red herring scores 0.80-0.85 (penalty applied)
-- [ ] `baseline_heuristic.py` runs twice → `diff run1.json run2.json` is empty
-- [ ] `POST /baseline` returns scores for all 3 tasks, all in [0.0, 1.0]
-- [ ] `POST /grader` returns score after completed episode
-- [ ] `GET /tasks` returns 3 tasks with action schema
-- [ ] `GET /health` returns `{"status": "ready", "tasks": 3}`
-- [ ] Docker builds <500MB, starts <60s, serves on port 7860
-- [ ] HF Space deployed, responds to `reset()`, tagged `openenv`
-- [ ] `openenv validate` passes
-- [ ] `pytest --cov` shows >80% coverage on all Phase 1 modules
-- [ ] `import torch` in every core module; zero `import numpy` in core
-- [ ] README has: description, action/observation spaces, 3 task descriptions, setup instructions, baseline scores
----
-## Phase 2: Stretch — Tasks 2, 4, 6 + Code Debugging (Days 7-9)
-**Goal:** Full 6-task environment with code-level debugging. Task 6 is the single highest-impact differentiator for Meta judges.
-**Prerequisites:** Phase 1 acceptance criteria ALL met. HF Space deployed and passing auto-validation.
-### 2.1 Priority Order (Strict)
-1. **Task 6** first — it is the strongest differentiator and the hardest to implement
-2. **Task 2** second — structurally identical to Task 1 (vanishing vs. exploding), fastest to add
-3. **Task 4** third — medium difficulty overfitting, similar pattern to existing tasks
-### 2.2 Files to Create
-| File | Purpose | Lines (est.) | Depends On |
-|---|---|---|---|
-| `ml_training_debugger/code_templates.py` | 4 bug variant templates, `generate_code_snippet(bug_type, seed)`, `validate_fix(bug_type, line, replacement)` with multi-strategy pipeline per spec Section 22 | ~250 | `models.py` |
-| `tests/test_code_templates.py` | All 4 variants generate valid code; fix validation accepts correct fixes; rejects wrong fixes; handles whitespace/comment variations | ~150 | `code_templates.py` |
-### 2.3 Files to Edit
-| File | Changes | Complexity |
-|---|---|---|
-| `ml_training_debugger/scenarios.py` | Add `sample_scenario` cases for task_002, task_004, task_006. Task 006 includes `bug_type` field. | Low |
-| `ml_training_debugger/pytorch_engine.py` | Add fault injection for vanishing gradients, overfitting, code bug variants. | Medium |
-| `ml_training_debugger/simulation.py` | Add curve generators for vanishing (flat loss), overfitting (train-val divergence), code bug variants. | Medium |
-| `ml_training_debugger/reward_engine.py` | Add wrong code fix penalty (-0.10). No other changes. | Low |
-| `ml_training_debugger/graders.py` | Add `grade_task_002`, `grade_task_004`, `grade_task_006`. Task 006: diagnosis must be `code_bug` always. | Medium |
-| `server/environment.py` | `step()` handlers for `inspect_code` and `fix_code`. Update `available_actions`. | Medium |
-| `server/app.py` | Update `/tasks` to return 6 tasks. Update `/health` to return `"tasks": 6`. | Low |
-| `openenv.yaml` | Add task_002, task_004, task_006. | Low |
-| `baseline_heuristic.py` | Extend decision tree for vanishing, overfitting, code bug. | Medium |
-| `README.md` | Add descriptions for Tasks 2, 4, 6. Update baseline scores. | Low |
-### 2.4 Task 6 Code Fix Validation
-The `validate_fix()` pipeline is defined in spec Section 22 (Known Risks). Key layers:
-1. **Normalize:** strip whitespace + inline comments → compare against known correct strings
-2. **Tokenize:** Python `tokenize` module, filter noise tokens, compare streams
-3. **Semantic patterns:** 2-3 per variant (e.g. `"criterion("` present AND `".detach()"` absent)
-4. **AST fallback:** `ast.parse()` full code with replacement, verify buggy pattern absent
-Test cases that MUST pass: correct fix, trailing whitespace, inline comments, different indentation.
-Test cases that MUST fail: bug still present, `pass`, wrong line number.
-### 2.5 Tests to Create/Extend
-| Test File | New Coverage |
-|---|---|
-| `tests/test_code_templates.py` | **New file.** All 4 variants, validate_fix accepts/rejects correctly, 5+ whitespace/comment variations per variant |
-| `tests/test_scenarios.py` | Extend: sample_scenario for task_002, 004, 006 |
-| `tests/test_simulation.py` | Extend: vanishing flat loss, overfitting divergence, code bug symptoms |
-| `tests/test_graders.py` | Extend: graders 002, 004, 006. Task 006: `code_bug` required; `batchnorm_eval_mode` on eval_mode variant = wrong |
-| `tests/test_reward_engine.py` | Extend: wrong code fix penalty (-0.10) |
-| `tests/test_episode_lifecycle.py` | Extend: `inspect_code` → `fix_code` available; `fix_code` before `inspect_code` → invalid |
-### 2.6 Acceptance Criteria
-- [ ] All 6 tasks return valid observations from `reset()` and process all action types in `step()`
-- [ ] Task 6: `inspect_code` returns `CodeSnippet` with real PyTorch code containing the sampled bug
-- [ ] Task 6: `fix_code` correct → `fix_action_taken=True`, no penalty
-- [ ] Task 6: `fix_code` wrong → -0.10 penalty
-- [ ] Task 6: `mark_diagnosed(code_bug)` → correct (+0.50)
-- [ ] Task 6: `mark_diagnosed(batchnorm_eval_mode)` on eval_mode variant → wrong (-0.30)
-- [ ] `validate_fix` accepts 5+ whitespace/comment variations per variant
-- [ ] `validate_fix` rejects all invalid fixes
-- [ ] Graders for all 6 tasks return [0.0, 1.0] with meaningful variance
-- [ ] `baseline_heuristic.py` handles all 6 tasks, still bit-exact reproducible
-- [ ] `POST /baseline` returns scores for all 6 tasks
-- [ ] `GET /tasks` returns 6 tasks
-- [ ] `GET /health` returns `{"status": "ready", "tasks": 6}`
-- [ ] All new tests pass; overall coverage >80%
-- [ ] Updated openenv.yaml lists all 6 tasks
-- [ ] HF Space redeployed with 6 tasks, auto-validation still passes
----
-## Phase 3: Polish — Dashboard, Validation Suite, LLM Baseline (Days 10-11)
-**Goal:** Transform a technically correct submission into a visually impressive, deeply validated, winning submission.
-**Prerequisites:** Phase 2 acceptance criteria ALL met. 6-task environment deployed.
-### 3.1 Priority Order Within Phase 3
-1. **Dashboard** — transforms judging experience (highest ROI for judges)
-2. **Full test suite + README polish** — ensures no auto-validation failure
-3. **Validation suite** — answers "how realistic are your curves?"
-4. **LLM baseline** — demonstrates heuristic-reasoning gap (lowest priority)
-### 3.2 Files to Create
-| File | Purpose | Lines (est.) | Priority |
-|---|---|---|---|
-| `server/dashboard.html` | Single-file SPA. 4 panels per spec Section 19. Plotly.js via CDN. | ~400 | 1st |
-| `validation/requirements.txt` | `torch`, `matplotlib`, `scipy` | ~3 | 3rd |
-| `validation/conftest.py` | Shared fixtures: CIFAR-10 subset loader, model definitions | ~50 | 3rd |
-| `validation/validate_exploding_gradients.py` | Real training, compare to parametric curve, R² > 0.85 | ~80 | 3rd |
-| `validation/validate_data_leakage.py` | Real training with leakage, compare | ~80 | 3rd |
-| `validation/validate_batchnorm_eval.py` | Real training with `model.eval()`, compare | ~80 | 3rd |
-| `validation/validate_vanishing_gradients.py` | Real gradient decay, compare | ~80 | 3rd |
-| `validation/validate_overfitting.py` | Real train-val divergence, compare | ~80 | 3rd |
-| `validation/validate_code_bugs.py` | Run 4 bug variants, confirm symptoms | ~80 | 3rd |
-| `validation/reports/` | Pre-computed fidelity scores + comparison plots | — | 3rd |
-| `baseline_inference.py` | LLM agent (GPT-4o, temp=0.0, seed=42). Runs all 6 tasks. **Install `openai` only at this stage.** | ~200 | 4th |
-### 3.3 Files to Edit
-| File | Changes | Priority |
-|---|---|---|
-| `server/app.py` | Add `GET /dashboard` and `GET /validation-report` routes | 1st/3rd |
-| `requirements.txt` | Add `openai` (only now, for LLM baseline) | 4th |
-| `Dockerfile` | `COPY validation/reports/` and `COPY server/dashboard.html` | 1st |
-| `README.md` | Final polish: dashboard description, validation suite, measured baseline scores | 2nd |
-| `openenv.yaml` | Add dashboard and validation-report to endpoints | 1st |
-### 3.4 Dashboard Panels
-See spec Section 19 for full specification. Summary:
-1. **Training Metrics** — Plotly.js line charts for loss/accuracy with restart markers
-2. **Gradient & Weight Heatmap** — color-coded per-layer grid (green/yellow/red/blue)
-3. **Action Timeline** — horizontal bars per step, color-coded by type, reward bars
-4. **Episode Summary** — task ID, state flags, available actions, grader score
-Tech: single HTML file, Plotly.js CDN, native WebSocket, CSS Grid. Zero Docker bloat.
-### 3.5 Validation Suite
-Run locally (NOT in Docker build). Each script runs real training → captures metrics → compares them to the parametric curve → asserts R² > 0.85 → saves plots. Pre-computed reports are committed to git and served via `/validation-report`. See spec Section 18.
-### 3.6 Tests to Create/Extend
-| Test File | Coverage |
-|---|---|
-| `tests/test_dashboard.py` | `GET /dashboard` returns 200 with HTML containing "Plotly" and "WebSocket" |
-| `tests/test_endpoints.py` | Integration: full episode via HTTP (reset→step→grader), verify response schemas |
-| `tests/test_baseline_reproducibility.py` | Run baseline twice, assert identical JSON |
-| Existing test files | Fill coverage gaps to >80% on every module |
-### 3.7 Acceptance Criteria
-- [ ] `GET /dashboard` serves HTML that renders in a browser with 4 panels
-- [ ] Dashboard connects to WebSocket and updates in real time during a baseline run
-- [ ] Validation suite passes all scripts with R² > 0.85 (run locally)
-- [ ] Pre-computed validation reports exist in `validation/reports/`
-- [ ] `GET /validation-report` serves fidelity data
-- [ ] If the LLM baseline is implemented: it runs and scores higher than the heuristic on Tasks 5 and 6
-- [ ] README is complete: all 6 tasks, both baselines, dashboard description, setup instructions
-- [ ] `pytest --cov` shows >80% coverage across all modules
-- [ ] Final `openenv validate` passes
-- [ ] Final Docker build <500MB, starts <60s
-- [ ] HF Space redeployed with dashboard + all features
----
-## Pre-Submission Gate Checklist
-**Every item must be checked before submitting. Failure on any starred (*) item = disqualification.**
-### Auto-Validation Gates (*)
-- [ ] * **HF Space deploys** — `curl https://<space-url>/health` returns `{"status": "ready", "tasks": N}` with HTTP 200
-- [ ] * **HF Space responds to reset** — WebSocket connection to `/ws`, send reset message, receive valid observation
-- [ ] * **OpenEnv spec compliance** — `openenv validate` passes (openenv.yaml present, typed models, step/reset/state work)
-- [ ] * **Dockerfile builds** — `docker build -t pytorch-debugger .` succeeds
-- [ ] * **Docker runs** — `docker run -p 7860:7860 pytorch-debugger` starts and serves on port 7860
-- [ ] * **Baseline reproduces** — `python baseline_heuristic.py > run1.json && python baseline_heuristic.py > run2.json && diff run1.json run2.json` produces no output
-- [ ] * **3+ tasks with graders** — `GET /tasks` returns ≥3 tasks; `POST /grader` returns score in [0.0, 1.0] after each task completes
-- [ ] * **Graders produce varying scores** — different agent behaviors produce different scores (not always same value)
-### Required Endpoint Gates (*)
-- [ ] * **`GET /tasks`** — returns JSON with task IDs, difficulties, action schema
-- [ ] * **`POST /grader`** — returns `{"score": float}` after a completed episode
-- [ ] * **`POST /baseline`** — triggers baseline, returns scores for all tasks
-- [ ] * **`GET /health`** — returns `{"status": "ready", "tasks": N}`
-### Submission Artifacts (*)
-- [ ] * **Public GitHub repo** — contains all code, README, requirements, openenv.yaml
-- [ ] * **HF Spaces demo link** — deployed, tagged `openenv`, accessible
-- [ ] * **README complete** — environment description, action/observation space definitions, task descriptions with difficulty, setup instructions, baseline scores
-### Quality Gates (Not DQ, but impact scoring)
-- [ ] All typed Pydantic models — no `Dict[str, Any]`
-- [ ] `import torch` in every core module — zero `import numpy` in core
-- [ ] Context-gated penalty fires correctly (manually tested both paths)
-- [ ] Task 5 red herrings present: FC spike, GPU 91%, conv1 near-vanishing, error_log warning
-- [ ] Task 6 code fix validation handles whitespace and comment variations
-- [ ] Task 6 diagnosis is always `code_bug` regardless of bug variant
-- [ ] Grader and reward function are separate modules
-- [ ] Step penalty is flat -0.01 (not multiplied by step_count)
-- [ ] Episode state is isolated per WebSocket session
-- [ ] Test suite passes with >80% coverage
-- [ ] Code formatted with black, linted with ruff, imports sorted with isort
-### Final Smoke Test Sequence
-Run this entire sequence the night before submission:
-```bash
-# 1. Clean build
-docker build --no-cache -t pytorch-debugger .
-docker run -d -p 7860:7860 --name smoke-test pytorch-debugger
-# 2. Wait for startup
-sleep 10
-curl -f http://localhost:7860/health || echo "FAIL: health"
-# 3. Tasks endpoint
-curl -f http://localhost:7860/tasks | python -m json.tool || echo "FAIL: tasks"
-# 4. Baseline reproducibility
-python baseline_heuristic.py > run1.json 2>/dev/null
-python baseline_heuristic.py > run2.json 2>/dev/null
-diff run1.json run2.json && echo "PASS: reproducible" || echo "FAIL: non-reproducible"
-# 5. Baseline via endpoint
-curl -f -X POST http://localhost:7860/baseline | python -m json.tool || echo "FAIL: baseline endpoint"
-# 6. Grader via endpoint (after baseline has completed episodes)
-curl -f -X POST http://localhost:7860/grader | python -m json.tool || echo "FAIL: grader endpoint"
-# 7. OpenEnv validation
-openenv validate || echo "FAIL: openenv validate"
-# 8. Test suite
-pytest tests/ -v --cov=ml_training_debugger --cov-report=term-missing
-# 9. Cleanup
-docker stop smoke-test && docker rm smoke-test
-echo "=== Smoke test complete ==="
-```
-### If Something Fails at Submission Time
-| Failure | Triage |
-|---|---|
-| HF Space won't deploy | Check Dockerfile CMD, port 7860, build logs. Redeploy. |
-| Baseline non-reproducible | Check `torch.manual_seed()` in `reset()`. Check for `random` module usage. |
-| Grader returns same score | Check that `sample_scenario` uses different seeds. Check grader logic has branching. |
-| `openenv validate` fails | Read error message. Usually missing field in openenv.yaml or wrong model base class. |
-| Docker image >500MB | Check `docker images` size. Remove unused deps. Ensure torch is CPU-only. |
-| Test coverage <80% | Run `pytest --cov` with `--cov-report=html`. Find uncovered branches. Add targeted tests. |

inference.py CHANGED Viewed

@@ -46,8 +46,10 @@ TASK_NAME = os.environ.get("TASK_NAME", "task_001")
 BENCHMARK = "pytorch-training-debugger"
 MAX_STEPS = 25
-MAX_TOTAL_REWARD = 1.0
-SUCCESS_SCORE_THRESHOLD = 0.6
 TEMPERATURE = 0.0
 MAX_TOKENS = 300
 FALLBACK_ACTION = '{"action_type": "inspect_gradients"}'

 BENCHMARK = "pytorch-training-debugger"
 MAX_STEPS = 25
+# Reward ceiling: 0.50 (diagnosis) + 0.40 (convergence) + 5 * 0.05 (investigations) = 1.15.
+# Step penalties only reduce the total, so 1.15 is a safe upper bound for normalization.
+MAX_TOTAL_REWARD = 1.15
+SUCCESS_SCORE_THRESHOLD = 0.5
 TEMPERATURE = 0.0
 MAX_TOKENS = 300
 FALLBACK_ACTION = '{"action_type": "inspect_gradients"}'

run_all_baselines.py DELETED Viewed

@@ -1,130 +0,0 @@
-#!/usr/bin/env python3
-"""Run heuristic + multiple LLM baselines and show comparison table.
-Usage:
-    python3 run_all_baselines.py
-"""
-from __future__ import annotations
-import json
-import os
-import sys
-import time
-from concurrent.futures import ThreadPoolExecutor, as_completed
-from pathlib import Path
-# Load .env
-_env_path = Path(__file__).parent / ".env"
-if _env_path.exists():
-    for line in _env_path.read_text().splitlines():
-        line = line.strip()
-        if line and not line.startswith("#") and "=" in line:
-            key, _, value = line.partition("=")
-            os.environ.setdefault(key.strip(), value.strip())
-from baseline_heuristic import ALL_TASKS
-from baseline_heuristic import run_heuristic_episode
-from baseline_inference import PROVIDERS, run_llm_episode
-try:
-    from openai import OpenAI
-except ImportError:
-    print("Error: pip install openai")
-    sys.exit(1)
-def run_heuristic() -> dict[str, float]:
-    scores = {}
-    for task_id in ALL_TASKS:
-        scores[task_id] = round(run_heuristic_episode(task_id), 4)
-    return scores
-def run_llm_provider(provider_name: str, model: str | None = None) -> dict[str, float]:
-    prov = PROVIDERS[provider_name]
-    api_key = os.environ.get(prov["env_key"])
-    if not api_key:
-        return {t: -1.0 for t in ALL_TASKS}  # -1 = no key
-    model_name = model or prov["default_model"]
-    client_kwargs: dict = {"api_key": api_key}
-    if prov["base_url"]:
-        client_kwargs["base_url"] = prov["base_url"]
-    client = OpenAI(**client_kwargs)
-    scores: dict[str, float] = {}
-    for task_id in ALL_TASKS:
-        try:
-            score = run_llm_episode(task_id, client, model_name)
-            scores[task_id] = round(score, 4)
-            print(f"  [{provider_name}/{model_name}] {task_id}: {score:.4f}", file=sys.stderr)
-        except Exception as e:
-            err_str = str(e)[:80]
-            print(f"  [{provider_name}/{model_name}] {task_id}: ERROR — {err_str}", file=sys.stderr)
-            scores[task_id] = 0.0
-    return scores
-def main() -> None:
-    print("Running all baselines...\n", file=sys.stderr)
-    results: dict[str, dict[str, float]] = {}
-    # Run heuristic first (fast, deterministic)
-    print("--- Heuristic baseline ---", file=sys.stderr)
-    results["Heuristic"] = run_heuristic()
-    print(f"  Done: {json.dumps(results['Heuristic'])}", file=sys.stderr)
-    # Run LLM providers sequentially (avoids thread hang issues)
-    llm_runs = [
-        ("Cerebras/Llama-3.1-8B", "cerebras", "llama3.1-8b"),
-        ("Groq/Llama-3.1-8B", "groq", "llama-3.1-8b-instant"),
-    ]
-    for label, provider, model in llm_runs:
-        print(f"\n--- {label} ---", file=sys.stderr)
-        try:
-            results[label] = run_llm_provider(provider, model)
-        except Exception as e:
-            print(f"  {label}: FAILED — {e}", file=sys.stderr)
-            results[label] = {t: 0.0 for t in ALL_TASKS}
-    # Print comparison table
-    print("\n" + "=" * 80)
-    print("BASELINE COMPARISON TABLE")
-    print("=" * 80)
-    headers = list(results.keys())
-    print(f"\n{'Task':<12}", end="")
-    for h in headers:
-        print(f"{h:>25}", end="")
-    print()
-    print("-" * (12 + 25 * len(headers)))
-    for task_id in ALL_TASKS:
-        print(f"{task_id:<12}", end="")
-        for h in headers:
-            score = results[h].get(task_id, 0.0)
-            if score < 0:
-                print(f"{'no key':>25}", end="")
-            else:
-                print(f"{score:>25.4f}", end="")
-        print()
-    print("-" * (12 + 25 * len(headers)))
-    # Averages
-    print(f"{'AVERAGE':<12}", end="")
-    for h in headers:
-        valid = [v for v in results[h].values() if v >= 0]
-        avg = sum(valid) / len(valid) if valid else 0
-        print(f"{avg:>25.4f}", end="")
-    print()
-    # Save JSON
-    print(json.dumps(results, indent=2))
-if __name__ == "__main__":
-    main()