Spaces:

hjerpe
/

sql_env

Sleeping

File size: 60,219 Bytes

{
  "$schema": "./schemas/autocode-features-v1.schema.json",
  "project": "SQLEnv - Interactive Database Query RL Environment",
  "description": "OpenEnv Challenge submission: RL environment where agents learn to answer NL questions about databases through iterative SQL exploration",
  "created": "2026-03-24T07:15:50Z",
  "updated": "2026-04-11T15:55:16Z",
  "features": [
    {
      "id": "F001",
      "name": "Core Environment Loop",
      "description": "Complete the step/reset lifecycle: remove Ollama from environment, accept structured actions (DESCRIBE table_name, SAMPLE table_name, QUERY sql_string, ANSWER value), wire up SQLite execution with sandboxing (read-only, 5s timeout, SELECT-only), load questions from JSON on reset(), enforce step budget (15 steps), handle episode termination",
      "complexity": "complex",
      "verification_mode": "standard",
      "status": "complete",
      "priority": 1,
      "dependencies": [],
      "docs": {
        "discovery_json": null,
        "discovery_md": null,
        "design_doc": null,
        "delivery_spec": null
      },
      "taste": {
        "source": "user_interview",
        "notes": "Derived from docs_draft/sql_env_project_brief.md and docs_draft/SQLEnv_Concept_v1.md — the v1 spec defines the action space, episode lifecycle, and sandboxing requirements"
      },
      "user_interview": {
        "conducted": "2026-03-24T09:00:00Z",
        "skipped": false,
        "skip_reason": null,
        "value": {
          "question": "What will users be able to do that they couldn't before?",
          "response": "Agents can play complete episodes: reset with a random question, explore a hidden schema via DESCRIBE/SAMPLE, run SQL queries, and submit answers. Currently SQL never executes — this makes the environment actually functional."
        },
        "experience": {
          "question": "Walk me through using this. What would delight you? What would frustrate you?",
          "delights": [
            "Agent sends DESCRIBE employees and immediately sees column names and types",
            "Queries execute in <100ms with clean truncated output (max 20 rows)",
            "Bad SQL returns a clear error message the agent can learn from",
            "Episode ends cleanly when budget exhausted or ANSWER submitted"
          ],
          "frustrations": [
            "Environment calling Ollama to interpret actions (current design) — agent should own reasoning, env should just execute",
            "Queries hanging or crashing the environment",
            "Opaque error messages that don't help the agent adjust"
          ]
        },
        "maturity": {
          "question": "Is this exploratory, MVP, or production?",
          "response": "mvp",
          "rationale": "Competition submission — needs to work reliably for demo and training, not at production scale"
        }
      },
      "progress": {
        "implementation_steps": {
          "total": 8,
          "completed": 8
        },
        "verification_tests": {
          "total": 86,
          "passed": 25
        }
      },
      "specs": {
        "implementation": "specs/F001-IMPLEMENTATION_SPEC.md",
        "verification": "specs/F001-VERIFICATION_SPEC.md"
      },
      "timestamps": {
        "planned": "2026-03-24T10:30:00Z",
        "verification_planned": "2026-03-24T10:30:00Z",
        "started": "2026-03-24T19:22:08Z",
        "completed": "2026-03-24T21:27:31Z"
      },
      "verification_evidence": {
        "mode": "standard",
        "tests_run": 25,
        "tests_passed": 25,
        "timestamp": "2026-03-24T21:27:31Z",
        "command": "uv run pytest tests/ -v",
        "verifier_result": "approved"
      },
      "demo": {
        "path": "specs/F001-DEMO.md",
        "generated_at": "2026-03-24T21:36:32Z",
        "mode": "local_cli",
        "status": "partial",
        "requires_user_verification": true,
        "verification_surfaces": [
          "local_server_startup",
          "data_provisioning",
          "api_episode_flow"
        ],
        "evidence_refs": [
          "specs/F001-VERIFICATION_SPEC.md",
          "specs/F001-DEMO.md"
        ],
        "note": "Local server and tests verified; end-to-end API episode flow requires local Spider DB provisioning."
      },
      "user_value": "Agents can now run complete SQL exploration episodes end-to-end with structured DESCRIBE/SAMPLE/QUERY/ANSWER actions, live read-only SQLite execution, clear error feedback, and clean terminal completion on ANSWER or budget exhaustion."
    },
    {
      "id": "F002",
      "name": "Answer Verification",
      "description": "Multi-type answer comparison: integer (exact match), float (1% tolerance), string (case-insensitive normalized), list (order-insensitive set comparison). Implements verify_answer() in server/verifier.py. Returns binary correctness for terminal reward.",
      "complexity": "standard",
      "verification_mode": "standard",
      "status": "complete",
      "priority": 2,
      "dependencies": [
        "F001"
      ],
      "docs": {
        "discovery_json": null,
        "discovery_md": null,
        "design_doc": null,
        "delivery_spec": null
      },
      "taste": {
        "source": "user_interview",
        "notes": "Answer type handling defined in docs_draft/SQLEnv_Concept_v1.md Section 4.2"
      },
      "user_interview": {
        "conducted": "2026-03-24T09:00:00Z",
        "skipped": false,
        "skip_reason": null,
        "value": {
          "question": "What will users be able to do that they couldn't before?",
          "response": "When an agent submits ANSWER, the environment correctly determines if the answer matches the gold answer regardless of type (42 vs 42.0, 'Engineering' vs 'engineering', unordered lists)."
        },
        "experience": {
          "question": "Walk me through using this. What would delight you? What would frustrate you?",
          "delights": [
            "Float comparison with tolerance handles rounding gracefully (95000.1 matches 95000)",
            "List comparison ignores order: ['A','B'] matches ['B','A']",
            "Clear pass/fail with no ambiguity"
          ],
          "frustrations": [
            "Correct answer rejected due to trivial formatting difference",
            "Type coercion failures (agent says '42', gold is integer 42)"
          ]
        },
        "maturity": {
          "question": "Is this exploratory, MVP, or production?",
          "response": "mvp",
          "rationale": "Must handle the 4 core answer types reliably. Table comparison can come later."
        }
      },
      "progress": {
        "implementation_steps": {
          "total": 4,
          "completed": 4
        },
        "verification_tests": {
          "total": 65,
          "passed": 65
        }
      },
      "specs": {
        "implementation": "specs/F002-IMPLEMENTATION_SPEC.md",
        "verification": "specs/F002-VERIFICATION_SPEC.md"
      },
      "timestamps": {
        "planned": "2026-03-27T12:00:00Z",
        "verification_planned": "2026-03-27T12:00:00Z",
        "started": "2026-03-27T22:18:15Z",
        "completed": "2026-03-27T22:33:12Z"
      },
      "verification_evidence": {
        "mode": "standard",
        "tests_run": 65,
        "tests_passed": 65,
        "timestamp": "2026-03-27T22:33:12Z",
        "command": "uv run pytest tests/ -v",
        "verifier_result": "approved"
      },
      "demo": {
        "path": "specs/F002-DEMO.md",
        "generated_at": "2026-03-27T22:37:50Z",
        "mode": "artifact_build",
        "status": "partial",
        "requires_user_verification": true,
        "verification_surfaces": [
          "local_pytest_verification",
          "runtime_episode_scoring"
        ],
        "evidence_refs": [
          "specs/F002-VERIFICATION_SPEC.md",
          "specs/F002-DEMO.md"
        ],
        "note": "Strongest local proof is targeted and integration pytest evidence; final runtime confirmation remains a user-operated episode check."
      },
      "user_value": "Agents can now submit ANSWER values across integer, float, string, and list questions and receive correct terminal scoring despite formatting differences, numeric representation differences, and list order changes."
    },
    {
      "id": "F003",
      "name": "Dense Reward System",
      "description": "3-layer reward architecture: Layer 1 (operational validity: exec_ok +0.02, new_info +0.01 capped at 0.10, repeat -0.01, step_cost -0.005), Layer 2 (progress-to-target: weighted average of cardinality matching + value overlap + numeric range proximity, binned to 5 levels, improvement-only), Layer 3 (terminal correctness: +1.0 or 0.0). Total step rewards capped at 0.5, negative floor at -0.2.",
      "complexity": "complex",
      "verification_mode": "standard",
      "status": "complete",
      "priority": 3,
      "dependencies": [
        "F001",
        "F002"
      ],
      "docs": {
        "discovery_json": null,
        "discovery_md": null,
        "design_doc": null,
        "delivery_spec": null
      },
      "taste": {
        "source": "user_interview",
        "notes": "Reward architecture defined in docs_draft/SQLEnv_Concept_v1.md Section 3 and docs_draft/reward-research_gpt-5-2.md. Distance metrics detailed in docs_draft/reward_design.md."
      },
      "user_interview": {
        "conducted": "2026-03-24T09:00:00Z",
        "skipped": false,
        "skip_reason": null,
        "value": {
          "question": "What will users be able to do that they couldn't before?",
          "response": "Agents get meaningful feedback during exploration — not just 0/1 at the end. A query that returns 40 when the answer is 42 gets partial credit. Discovering new schema info gets a small reward. This makes GRPO training converge."
        },
        "experience": {
          "question": "Walk me through using this. What would delight you? What would frustrate you?",
          "delights": [
            "Reward varies meaningfully: random exploration ~0.1, targeted queries ~0.3, correct answer ~1.3",
            "Anti-gaming works: agent can't farm rewards by describing everything or repeating queries",
            "Progress signal is coarsened to prevent reward hill-climbing"
          ],
          "frustrations": [
            "Reward hacking: agent learns to exploit shaping rather than solve the task",
            "Reward too sparse: agent gets no signal until terminal step",
            "Over-complex reward that's hard to debug"
          ]
        },
        "maturity": {
          "question": "Is this exploratory, MVP, or production?",
          "response": "mvp",
          "rationale": "Start with weighted average of 3 metrics (cardinality, value overlap, numeric range). Add complexity only if training shows issues."
        }
      },
      "progress": {
        "implementation_steps": {
          "total": 7,
          "completed": 7
        },
        "verification_tests": {
          "total": 61,
          "passed": 166
        }
      },
      "specs": {
        "implementation": "specs/F003-IMPLEMENTATION_SPEC.md",
        "verification": "specs/F003-VERIFICATION_SPEC.md"
      },
      "timestamps": {
        "planned": "2026-03-27T12:00:00Z",
        "verification_planned": "2026-03-27T12:00:00Z",
        "started": "2026-03-27T23:51:47Z",
        "completed": "2026-03-28T06:05:02Z"
      },
      "verification_evidence": {
        "mode": "standard",
        "tests_run": 166,
        "tests_passed": 166,
        "timestamp": "2026-03-28T06:05:02Z",
        "command": "uv run --with pytest pytest tests/ -v",
        "verifier_result": "approved"
      },
      "demo": {
        "path": "specs/F003-DEMO.md",
        "generated_at": "2026-03-28T06:07:34Z",
        "mode": "artifact_build",
        "status": "generated",
        "requires_user_verification": true,
        "verification_surfaces": [
          "local_pytest_verification",
          "runtime_episode_flow"
        ],
        "evidence_refs": [
          "specs/F003-VERIFICATION_SPEC.md",
          "specs/F003-DEMO.md"
        ],
        "note": "Strongest local proof is targeted smoke/unit execution; full reward calibration and live episode behavior should be confirmed in a user-run episode/training context."
      },
      "user_value": "Agents now receive dense numeric rewards on every non-terminal DESCRIBE/SAMPLE/QUERY step based on execution quality and progress toward the gold answer, while terminal correctness still dominates total episode reward."
    },
    {
      "id": "F004",
      "name": "Question Dataset Expansion",
      "description": "Expand from 53 questions (one DB) to 100+ questions across 5-10 Spider databases. Add difficulty labels (easy/medium/hard at 40/40/20 split), answer_type metadata, and gold_answer fields. Create train/eval split (70/30). Curate for diversity of answer types and SQL patterns.",
      "complexity": "standard",
      "verification_mode": "mvp",
      "status": "complete",
      "priority": 4,
      "dependencies": [],
      "docs": {
        "discovery_json": null,
        "discovery_md": null,
        "design_doc": null,
        "delivery_spec": null
      },
      "taste": {
        "source": "user_interview",
        "notes": "Dataset requirements from docs_draft/sql_env_project_brief.md Section 3 and SQLEnv_Concept_v1.md Section 4"
      },
      "user_interview": {
        "conducted": "2026-03-24T09:00:00Z",
        "skipped": false,
        "skip_reason": null,
        "value": {
          "question": "What will users be able to do that they couldn't before?",
          "response": "Training on diverse databases and question types. Current single-DB setup risks overfitting to one schema."
        },
        "experience": {
          "question": "Walk me through using this. What would delight you? What would frustrate you?",
          "delights": [
            "Clear difficulty progression: easy questions have 1-2 tables, hard ones have 5+",
            "Each question has pre-computed gold_answer so reward doesn't need to re-execute gold SQL every episode",
            "Train/eval split prevents training on evaluation data"
          ],
          "frustrations": [
            "Questions that require SQL features SQLite doesn't support",
            "Ambiguous gold answers (multiple valid interpretations)",
            "All questions from same domain = no generalization"
          ]
        },
        "maturity": {
          "question": "Is this exploratory, MVP, or production?",
          "response": "mvp",
          "rationale": "100 well-curated questions is sufficient for competition demo. Quality over quantity."
        }
      },
      "progress": {
        "implementation_steps": {
          "total": 6,
          "completed": 6
        },
        "verification_tests": {
          "total": 66,
          "passed": 21
        }
      },
      "specs": {
        "implementation": "specs/F004-IMPLEMENTATION_SPEC.md",
        "verification": "specs/F004-VERIFICATION_SPEC.md"
      },
      "demo": {
        "path": "specs/F004-DEMO.md",
        "generated_at": "2026-03-24T21:07:31Z"
      },
      "timestamps": {
        "planned": "2026-03-24T10:30:00Z",
        "verification_planned": "2026-03-24T10:30:00Z",
        "started": "2026-03-24T16:53:35Z",
        "completed": "2026-03-24T21:04:54Z"
      },
      "verification_evidence": {
        "mode": "mvp",
        "tests_run": 21,
        "tests_passed": 21,
        "timestamp": "2026-03-24T21:04:54Z",
        "command": "uv run pytest tests/ -v",
        "verifier_result": "approved"
      },
      "user_value": "Users can now train and evaluate against a curated multi-database dataset (676 questions across 10 Spider databases) with precomputed gold answers, answer types, difficulty labels, and deterministic train/eval splits."
    },
    {
      "id": "F005",
      "name": "Green Agent Wrapper",
      "description": "Automated evaluation wrapper following OpenEnv pattern. Runs N episodes with a given policy (random, heuristic, or trained model). Reports success_rate, avg_reward, avg_steps. Supports random baseline policy for comparison. Required by competition evaluation criteria.",
      "complexity": "standard",
      "verification_mode": "mvp",
      "status": "complete",
      "priority": 5,
      "dependencies": [
        "F001",
        "F002"
      ],
      "docs": {
        "discovery_json": null,
        "discovery_md": null,
        "design_doc": null,
        "delivery_spec": null
      },
      "taste": {
        "source": "user_interview",
        "notes": "Green Agent pattern from SQLEnv_Concept_v1.md Appendix C. Required by OpenEnv Challenge evaluation criteria."
      },
      "user_interview": {
        "conducted": "2026-03-24T09:00:00Z",
        "skipped": false,
        "skip_reason": null,
        "value": {
          "question": "What will users be able to do that they couldn't before?",
          "response": "Run automated evaluation: 'How does policy X perform over 100 episodes?' Single command, structured output. Enables training comparison (random vs trained)."
        },
        "experience": {
          "question": "Walk me through using this. What would delight you? What would frustrate you?",
          "delights": [
            "Single function call: evaluate(n_episodes=100) returns clean metrics dict",
            "Built-in random policy for instant baseline comparison",
            "Results include per-episode breakdown for analysis"
          ],
          "frustrations": [
            "Evaluation crashes partway through and loses all results",
            "No progress indicator for long evaluation runs"
          ]
        },
        "maturity": {
          "question": "Is this exploratory, MVP, or production?",
          "response": "mvp",
          "rationale": "Needs to produce reliable metrics for blog post. Doesn't need fancy visualization."
        }
      },
      "progress": {
        "implementation_steps": {
          "total": 4,
          "completed": 4
        },
        "verification_tests": {
          "total": 43,
          "passed": 16
        }
      },
      "specs": {
        "implementation": "specs/F005-IMPLEMENTATION_SPEC.md",
        "verification": "specs/F005-VERIFICATION_SPEC.md"
      },
      "timestamps": {
        "planned": "2026-03-27T12:00:00Z",
        "verification_planned": "2026-03-27T12:00:00Z",
        "started": "2026-03-27T23:51:09Z",
        "completed": "2026-03-28T00:04:03Z"
      },
      "verification_evidence": {
        "mode": "mvp",
        "tests_run": 16,
        "tests_passed": 16,
        "timestamp": "2026-03-28T00:04:03Z",
        "command": "uv run --with pytest pytest tests/test_evaluation.py -v",
        "verifier_result": "approved"
      },
      "demo": {
        "path": "specs/F005-DEMO.md",
        "generated_at": "2026-03-28T00:10:42Z",
        "mode": "local_cli",
        "status": "generated",
        "requires_user_verification": false,
        "verification_surfaces": [
          "local_python_api",
          "local_pytest"
        ],
        "evidence_refs": [
          "specs/F005-VERIFICATION_SPEC.md",
          "specs/F005-IMPLEMENTATION_SPEC.md",
          "specs/F005-DEMO.md"
        ],
        "note": "Demo includes direct public API invocation plus local integration, determinism, edge, and progress-callback evidence."
      },
      "user_value": "Users can now evaluate any SQLEnv policy over multiple episodes with one call, get structured aggregate metrics plus per-episode results, and rely on deterministic seeded runs for fair baseline comparisons."
    },
    {
      "id": "F006",
      "name": "GRPO Training Pipeline",
      "description": "TRL/GRPO integration for training a small LLM (Qwen3-1.7B or similar) to play SQLEnv. Includes: system prompt design for SQL exploration strategy, rollout_func that plays episodes via WebSocket client, reward_funcs (correctness, progress, operational) for GRPOTrainer, training notebook with hyperparameter config, baseline vs trained comparison output.",
      "complexity": "complex",
      "verification_mode": "mvp",
      "status": "complete",
      "priority": 6,
      "dependencies": [
        "F003",
        "F005"
      ],
      "docs": {
        "discovery_json": null,
        "discovery_md": null,
        "design_doc": null,
        "delivery_spec": null
      },
      "taste": {
        "source": "user_interview",
        "notes": "Training pipeline from docs_draft/SQLEnv_Concept_v1.md Section 3.5 (TRL mapping) and docs_draft/sql_env_project_brief.md Phase 4"
      },
      "user_interview": {
        "conducted": "2026-03-24T09:00:00Z",
        "skipped": false,
        "skip_reason": null,
        "value": {
          "question": "What will users be able to do that they couldn't before?",
          "response": "Train a model that learns SQL exploration strategy through RL. The 'before vs after' comparison is the competition's money shot — untrained agent flails randomly, trained agent explores strategically."
        },
        "experience": {
          "question": "Walk me through using this. What would delight you? What would frustrate you?",
          "delights": [
            "Training notebook runs end-to-end in one click",
            "Learning curve clearly shows improvement over episodes",
            "Side-by-side episode transcripts: random vs trained",
            "Reproducible results"
          ],
          "frustrations": [
            "Training doesn't converge at all",
            "Need expensive GPU for hours to see any signal",
            "Notebook has hidden dependencies that break on fresh setup"
          ]
        },
        "maturity": {
          "question": "Is this exploratory, MVP, or production?",
          "response": "mvp",
          "rationale": "Even modest improvement over random is a win. The environment design + reward architecture is the main innovation, not SOTA training results."
        }
      },
      "progress": {
        "implementation_steps": {
          "total": 6,
          "completed": 6
        },
        "verification_tests": {
          "total": 68,
          "passed": 68
        }
      },
      "specs": {
        "implementation": "specs/F006-IMPLEMENTATION_SPEC.md",
        "verification": "specs/F006-VERIFICATION_SPEC.md"
      },
      "timestamps": {
        "planned": "2026-03-27T12:00:00Z",
        "verification_planned": "2026-03-27T12:00:00Z",
        "started": "2026-03-28T06:44:31Z",
        "completed": "2026-03-28T07:37:20Z"
      },
      "verification_evidence": {
        "mode": "mvp",
        "tests_run": 68,
        "tests_passed": 68,
        "timestamp": "2026-03-28T07:37:20Z",
        "command": "uv run --with pytest pytest tests/unit/test_grpo_config.py tests/unit/test_prompts.py tests/unit/test_rollout.py tests/unit/test_rewards.py tests/unit/test_error_handling.py tests/integration/test_training_pipeline.py tests/e2e/test_training_e2e.py -v",
        "verifier_result": "approved"
      },
      "user_value": "Users can now run a single GRPO notebook workflow that loads training prompts, trains an SQLEnv policy with TRL, visualizes reward-curve progress, and compares random-baseline transcripts against trained-policy transcripts before saving artifacts.",
      "demo": {
        "path": "specs/F006-DEMO.md",
        "generated_at": "2026-03-28T07:42:55Z",
        "mode": "interactive_ui",
        "status": "partial",
        "requires_user_verification": true,
        "verification_surfaces": [
          "local_dependency_import",
          "local_pytest_verification",
          "jupyter_notebook_launch",
          "interactive_notebook_run"
        ],
        "evidence_refs": [
          "specs/F006-VERIFICATION_SPEC.md",
          "specs/F006-DEMO.md"
        ],
        "note": "Local proof and targeted tests were executed; full notebook interaction requires user environment with Jupyter runtime."
      }
    },
    {
      "id": "F007",
      "name": "HuggingFace Deployment & Submission",
      "description": "Competition submission package: validate and push Docker to HF Spaces (openenv push), clean up GitHub repo (README, setup instructions, training notebook), write HF blog post outline (hook, problem, solution, results, technical), record/screenshot before-vs-after demo.",
      "complexity": "standard",
      "verification_mode": "mvp",
      "status": "complete",
      "priority": 7,
      "dependencies": [
        "F001",
        "F002",
        "F003",
        "F004",
        "F005",
        "F006"
      ],
      "docs": {
        "discovery_json": null,
        "discovery_md": null,
        "design_doc": null,
        "delivery_spec": null
      },
      "taste": {
        "source": "user_interview",
        "notes": "Submission requirements from OpenEnv Challenge PDF and docs_draft/sql_env_project_brief.md Phase 5"
      },
      "user_interview": {
        "conducted": "2026-03-24T09:00:00Z",
        "skipped": false,
        "skip_reason": null,
        "value": {
          "question": "What will users be able to do that they couldn't before?",
          "response": "Judges can: read the blog, visit the HF Space, run the training notebook, and reproduce results. Someone outside the team can understand, use, and build on SQLEnv."
        },
        "experience": {
          "question": "Walk me through using this. What would delight you? What would frustrate you?",
          "delights": [
            "Blog tells a compelling story even if training results are modest",
            "HF Space just works — connect, reset, play an episode",
            "Training notebook runs end-to-end on Colab with one click"
          ],
          "frustrations": [
            "Docker build fails on HF Spaces",
            "Blog is all technical, no narrative hook",
            "Notebook has undocumented setup steps"
          ]
        },
        "maturity": {
          "question": "Is this exploratory, MVP, or production?",
          "response": "mvp",
          "rationale": "Ship what works. Polish can happen post-submission."
        }
      },
      "progress": {
        "implementation_steps": {
          "total": 6,
          "completed": 6
        },
        "verification_tests": {
          "total": 34,
          "passed": 250
        }
      },
      "specs": {
        "implementation": "specs/F007-IMPLEMENTATION_SPEC.md",
        "verification": "specs/F007-VERIFICATION_SPEC.md"
      },
      "timestamps": {
        "planned": "2026-03-27T12:00:00Z",
        "verification_planned": "2026-03-27T12:00:00Z",
        "started": "2026-03-28T17:03:38Z",
        "completed": "2026-03-29T07:29:32Z"
      },
      "verification_evidence": {
        "mode": "mvp",
        "tests_run": 250,
        "tests_passed": 250,
        "timestamp": "2026-03-29T07:29:32Z",
        "command": "uv run --with pytest pytest tests/ -v",
        "verifier_result": "approved"
      },
      "user_value": "Judges and external developers can now consume a complete SQLEnv submission package with HF Spaces-compatible deployment artifacts, a polished README quickstart, a structured blog outline, and a Colab-ready GRPO training notebook.",
      "demo": {
        "path": "specs/F007-DEMO.md",
        "generated_at": "2026-03-29T07:33:23Z",
        "mode": "infra_release",
        "status": "partial",
        "requires_user_verification": true,
        "verification_surfaces": [
          "local_manifest_validation",
          "local_docker_build",
          "external_registry_auth",
          "hf_space_push",
          "browser_episode_flow",
          "colab_notebook_run"
        ],
        "evidence_refs": [
          "specs/F007-VERIFICATION_SPEC.md",
          "specs/F007-DEMO.md"
        ],
        "note": "Authenticated local build and HF push now both succeed for hjerpe/sql_env; browser episode flow and Colab run remain user-verified surfaces."
      }
    },
    {
      "id": "F008",
      "name": "Synthetic Database Generation",
      "description": "Generate variant SQLite databases with same schema but different data for metamorphic testing. Implements 3 MVP mutations: irrelevant row injection, ID remapping, and duplicate bridge rows. Validates that gold SQL produces correct (potentially different) answers on variant DBs. Enables robustness testing against accidental correctness.",
      "complexity": "standard",
      "verification_mode": "mvp",
      "status": "complete",
      "priority": 8,
      "dependencies": [
        "F004"
      ],
      "docs": {
        "discovery_json": null,
        "discovery_md": null,
        "design_doc": null,
        "delivery_spec": null
      },
      "taste": {
        "source": "user_interview",
        "notes": "Metamorphic testing from docs_draft/reward-research_gpt-5-2.md and docs_draft/SQLEnv_Concept_v1.md Section 6.2. Originally scoped as post-MVP but user requested as separate feature."
      },
      "user_interview": {
        "conducted": "2026-03-24T10:30:00Z",
        "skipped": false,
        "skip_reason": null,
        "value": {
          "question": "What will users be able to do that they couldn't before?",
          "response": "Verify that agent-produced SQL is semantically correct, not just accidentally correct on one dataset. Catches missing JOINs, wrong filters, and hard-coded values."
        },
        "experience": {
          "question": "Walk me through using this. What would delight you? What would frustrate you?",
          "delights": [
            "Script generates 1-2 variant DBs per question automatically",
            "Gold SQL still produces valid answers on variant DBs",
            "Catches real bugs: missing DISTINCT, wrong join direction"
          ],
          "frustrations": [
            "Mutations break gold SQL (variant DB is invalid)",
            "Too many false positives from mutations",
            "Expensive to run during training"
          ]
        },
        "maturity": {
          "question": "Is this exploratory, MVP, or production?",
          "response": "exploratory",
          "rationale": "Post-submission stretch goal. Only 3 mutations for MVP, evaluate impact before expanding."
        }
      },
      "progress": {
        "implementation_steps": {
          "total": 8,
          "completed": 8
        },
        "verification_tests": {
          "total": 61,
          "passed": 60
        }
      },
      "specs": {
        "implementation": "specs/F008-IMPLEMENTATION_SPEC.md",
        "verification": "specs/F008-VERIFICATION_SPEC.md"
      },
      "timestamps": {
        "planned": "2026-03-27T12:00:00Z",
        "verification_planned": "2026-03-27T12:00:00Z",
        "started": "2026-03-27T22:16:14Z",
        "completed": "2026-03-27T22:57:19Z"
      },
      "demo": {
        "path": "specs/F008-DEMO.md",
        "generated_at": "2026-03-27T22:55:58Z",
        "mode": "local_cli",
        "status": "generated",
        "requires_user_verification": false,
        "verification_surfaces": [
          "local_cli",
          "local_tests"
        ],
        "evidence_refs": [
          "specs/F008-VERIFICATION_SPEC.md",
          "specs/F008-IMPLEMENTATION_SPEC.md"
        ],
        "note": "Demo includes live CLI usage, edge/error cases, and supplementary local test run output."
      },
      "verification_evidence": {
        "mode": "mvp",
        "tests_run": 61,
        "tests_passed": 60,
        "timestamp": "2026-03-27T22:57:19Z",
        "command": "uv run pytest tests/ -v",
        "verifier_result": "approved"
      },
      "user_value": "Users can now generate synthetic Spider DB variants with schema-preserving data mutations and gold-SQL validation, enabling metamorphic checks that expose brittle SQL patterns like hard-coded IDs and missing DISTINCT."
    },
    {
      "id": "F009",
      "name": "Oracle Policy",
      "description": "Cheater/oracle policy that knows the gold SQL and answer. Plays optimal episodes: DESCRIBE relevant tables, execute gold SQL, submit answer. Validates reward ceiling (~1.3 expected) and provides upper-bound baseline for blog comparison (oracle vs trained vs random).",
      "complexity": "simple",
      "verification_mode": "mvp",
      "status": "complete",
      "priority": 9,
      "dependencies": [
        "F001",
        "F002"
      ],
      "docs": {
        "discovery_json": null,
        "discovery_md": null,
        "design_doc": null,
        "delivery_spec": null
      },
      "taste": {
        "source": "user_interview",
        "notes": "From project plan: 'Cheater Policy — quick end-to-end test for maximum reward on environment'. Project brief Phase 2 done-when: 'A hardcoded cheat policy that knows the answer can achieve 100% success rate.'"
      },
      "user_interview": {
        "conducted": "2026-03-28T12:00:00Z",
        "skipped": false,
        "skip_reason": null,
        "value": {
          "question": "What will users be able to do that they couldn't before?",
          "response": "Validate that the environment reward ceiling works as designed. Oracle achieves ~100% success rate and ~1.3 total reward, confirming dense rewards stack correctly with terminal correctness. Provides upper-bound baseline for trained model comparison."
        },
        "experience": {
          "question": "Walk me through using this. What would delight you? What would frustrate you?",
          "delights": [
            "Oracle runs 100 episodes and reports near-perfect success rate",
            "Reward breakdown shows terminal + exploration adding up correctly",
            "Can compare oracle vs random vs trained in one table"
          ],
          "frustrations": [
            "Oracle fails on questions where gold SQL is valid but gold answer extraction differs",
            "Oracle reward lower than expected, indicating reward bug"
          ]
        },
        "maturity": {
          "question": "Is this exploratory, MVP, or production?",
          "response": "mvp",
          "rationale": "Validation tool for environment quality. Straightforward implementation — knows gold answer, submits it."
        }
      },
      "progress": {
        "implementation_steps": {
          "total": 2,
          "completed": 2
        },
        "verification_tests": {
          "total": 25,
          "passed": 40
        }
      },
      "specs": {
        "implementation": "specs/F009-IMPLEMENTATION_SPEC.md",
        "verification": "specs/F009-VERIFICATION_SPEC.md"
      },
      "timestamps": {
        "planned": "2026-03-28T12:00:00Z",
        "verification_planned": "2026-03-28T12:00:00Z",
        "started": "2026-03-28T17:06:05Z",
        "completed": "2026-03-28T17:14:17Z"
      },
      "demo": {
        "path": "specs/F009-DEMO.md",
        "generated_at": "2026-03-28T17:17:27Z",
        "mode": "artifact_build",
        "status": "partial",
        "requires_user_verification": true,
        "verification_surfaces": [
          "local_unit_tests",
          "package_export",
          "integration_e2e_followup"
        ],
        "evidence_refs": [
          "specs/F009-VERIFICATION_SPEC.md",
          "specs/F009-IMPLEMENTATION_SPEC.md"
        ],
        "note": "Strongest local proof is targeted/local pytest evidence; verification-spec integration/E2E file paths are not present in this workspace."
      },
      "verification_evidence": {
        "mode": "mvp",
        "tests_run": 40,
        "tests_passed": 40,
        "timestamp": "2026-03-28T17:14:17Z",
        "command": "uv run --with pytest pytest tests/unit/test_oracle_policy.py tests/test_evaluation.py -v",
        "verifier_result": "approved"
      },
      "user_value": "Users can now import and run OraclePolicy from sql_env.evaluation to produce a deterministic upper-bound baseline in evaluate(), validating reward-ceiling behavior and enabling direct oracle-vs-random-vs-trained comparisons."
    },
    {
      "id": "F010",
      "name": "TRL Environment Adapter",
      "description": "Wrap SQLEnv as a TRL-compatible environment_factory class. Public methods (describe, sample, query, answer) become LLM-callable tools automatically. Includes reset(**kwargs) for episode initialization, reward accumulation for reward_func, and concurrent session support (max_concurrent_envs). Replaces need for custom rollout_func in F006.",
      "complexity": "standard",
      "verification_mode": "mvp",
      "status": "complete",
      "priority": 10,
      "dependencies": [
        "F001",
        "F003"
      ],
      "docs": {
        "discovery_json": null,
        "discovery_md": null,
        "design_doc": null,
        "delivery_spec": null
      },
      "taste": {
        "source": "user_interview",
        "notes": "Derived from TRL OpenEnv docs (https://huggingface.co/docs/trl/main/openenv). environment_factory is the recommended pattern over rollout_func."
      },
      "user_interview": {
        "conducted": "2026-03-28T12:00:00Z",
        "skipped": false,
        "skip_reason": null,
        "value": {
          "question": "What will users be able to do that they couldn't before?",
          "response": "Train any HuggingFace model against SQLEnv using standard TRL GRPOTrainer with environment_factory. No custom rollout code needed — TRL handles generation, tool parsing, and multi-turn loop automatically."
        },
        "experience": {
          "question": "Walk me through using this. What would delight you? What would frustrate you?",
          "delights": [
            "Pass SQLEnvTRL as environment_factory to GRPOTrainer and it works",
            "Tool methods have typed docstrings so TRL auto-discovers them",
            "Concurrent sessions handle parallel rollouts without contention"
          ],
          "frustrations": [
            "Tool method signatures don't match what TRL expects",
            "Environment state leaks between episodes",
            "Concurrent sessions cause SQLite locking errors"
          ]
        },
        "maturity": {
          "question": "Is this exploratory, MVP, or production?",
          "response": "mvp",
          "rationale": "Must work for competition demo. Concurrent sessions can start with modest parallelism (4-8)."
        }
      },
      "progress": {
        "implementation_steps": {
          "total": 5,
          "completed": 6
        },
        "verification_tests": {
          "total": 48,
          "passed": 287
        }
      },
      "specs": {
        "implementation": "specs/F010-IMPLEMENTATION_SPEC.md",
        "verification": "specs/F010-VERIFICATION_SPEC.md"
      },
      "timestamps": {
        "planned": "2026-03-28T12:00:00Z",
        "verification_planned": "2026-03-28T12:00:00Z",
        "started": "2026-03-28T17:05:54Z",
        "completed": "2026-03-28T17:29:10Z"
      },
      "verification_evidence": {
        "mode": "mvp",
        "tests_run": 288,
        "tests_passed": 287,
        "timestamp": "2026-03-28T17:29:10Z",
        "command": "uv run --with pytest pytest tests/ -v",
        "verifier_result": "approved"
      },
      "demo": {
        "path": "specs/F010-DEMO.md",
        "generated_at": "2026-03-28T17:31:44Z",
        "mode": "artifact_build",
        "status": "partial",
        "requires_user_verification": true,
        "verification_surfaces": [
          "local_pytest_verification",
          "real_trl_training_run",
          "concurrent_rollout_runtime"
        ],
        "evidence_refs": [
          "specs/F010-VERIFICATION_SPEC.md",
          "specs/F010-DEMO.md"
        ],
        "note": "Strongest local proof is targeted test execution; full confidence still requires user-run TRL training and concurrency validation."
      },
      "user_value": "Users can now train TRL/GRPO policies against SQLEnv via native environment_factory tool-calling with SQLEnvTRL, without maintaining a custom rollout loop."
    },
    {
      "id": "F011",
      "name": "Prompting Baseline Notebook",
      "description": "New notebook (notebooks/showcase_prompting.ipynb) demonstrating base model performance on SQL tasks using only prompt engineering — no training. Serves as a baseline comparison for the GRPO-trained model. Sections: (1) Zero-shot with tool definitions, (2) Few-shot in-context learning with example trajectories from SFT data, (3) Chain-of-thought prompting, (4) Evaluation on held-out eval set across all techniques, (5) Accuracy comparison table + bar chart, (6) Optional side-by-side with trained model checkpoint.",
      "complexity": "standard",
      "verification_mode": "mvp",
      "status": "complete",
      "priority": 11,
      "dependencies": [
        "F006",
        "F010"
      ],
      "docs": {
        "discovery_json": null,
        "discovery_md": null,
        "design_doc": null,
        "delivery_spec": null
      },
      "taste": {
        "source": "user_interview",
        "notes": "User wants to demonstrate that training adds value over pure prompting. Key insight: this notebook makes the GRPO training story more compelling by showing the gap."
      },
      "user_interview": {
        "conducted": "2026-04-02T08:27:55+00:00",
        "skipped": false,
        "skip_reason": null,
        "value": {
          "question": "What will users be able to do that they could not before?",
          "response": "See exactly how much the base model can do with prompting alone, making the GRPO training improvement measurable and the notebook more convincing as a demo."
        },
        "experience": {
          "question": "Walk me through using this. What would delight you? What would frustrate you?",
          "delights": [
            "Clear accuracy comparison table across techniques",
            "Same eval set used for all methods (fair comparison)",
            "Can load a trained checkpoint for side-by-side",
            "Runs on Colab without training (fast demo)"
          ],
          "frustrations": [
            "Eval taking too long (should be lightweight)",
            "Unclear what prompting technique is being used",
            "No visual comparison (just numbers)"
          ]
        },
        "maturity": {
          "question": "Is this exploratory, MVP, or production?",
          "response": "mvp",
          "rationale": "Demonstrates the value proposition of training. Can iterate on techniques later."
        }
      },
      "progress": {
        "implementation_steps": {
          "total": 7,
          "completed": 7
        },
        "verification_tests": {
          "total": 36,
          "passed": 17
        }
      },
      "specs": {
        "implementation": "specs/F011-IMPLEMENTATION_SPEC.md",
        "verification": "specs/F011-VERIFICATION_SPEC.md"
      },
      "timestamps": {
        "planned": "2026-04-06T08:27:07.093218+00:00",
        "verification_planned": "2026-04-06T08:27:07.093218+00:00",
        "started": "2026-04-06T19:09:21Z",
        "completed": "2026-04-07T05:10:40Z"
      },
      "verification_evidence": {
        "mode": "mvp",
        "tests_run": 17,
        "tests_passed": 17,
        "timestamp": "2026-04-07T05:10:40Z",
        "command": "uv run pytest tests/test_evaluation.py -v",
        "verifier_result": "approved"
      },
      "user_value": "Users can now run one notebook that fairly compares zero-shot/1-shot/3-shot prompting against GRPO no-think and GRPO thinking checkpoints on the same eval subset, with both tabular metrics and a visual accuracy bar chart.",
      "demo": {
        "path": "specs/F011-DEMO.md",
        "generated_at": "2026-04-07T05:12:46Z",
        "mode": "artifact_build",
        "status": "partial",
        "requires_user_verification": true,
        "verification_surfaces": [
          "local_notebook_execution",
          "local_visual_artifact_export",
          "interactive_notebook_run",
          "hf_checkpoint_access"
        ],
        "evidence_refs": [
          "specs/F011-VERIFICATION_SPEC.md",
          "specs/F011-DEMO.md"
        ],
        "note": "Notebook execution was attempted locally but failed in this environment; static visual artifact export succeeded, and full interactive chart/table validation remains a user-run check."
      }
    },
    {
      "id": "F012",
      "name": "Enable Thinking Mode",
      "description": "Remove /no_think suppression and enable_thinking=False so Qwen3 can reason during GRPO rollouts. Model currently generates empty <think> blocks and cannot reason about SQL errors (repeats same failing query verbatim). Enables pretrained reasoning capability via reward signal — SFT data unchanged.",
      "complexity": "simple",
      "verification_mode": "mvp",
      "status": "not_started",
      "priority": 12,
      "dependencies": [],
      "docs": {
        "discovery_json": null,
        "discovery_md": null,
        "design_doc": null,
        "delivery_spec": null
      },
      "taste": {
        "source": "none",
        "notes": "Discovered during Run 6 analysis: model repeats failing queries because it cannot reason about errors"
      },
      "user_interview": {
        "conducted": "2026-04-04T05:32:07+00:00",
        "skipped": true,
        "skip_reason": "Simple config change — 3 files, clear pattern",
        "value": null,
        "experience": null,
        "maturity": null
      },
      "progress": {
        "implementation_steps": {
          "total": 0,
          "completed": 0
        },
        "verification_tests": {
          "total": 0,
          "passed": 0
        }
      },
      "specs": {
        "implementation": null,
        "verification": null
      },
      "inline_spec": {
        "files": [
          "scripts/generate_sft_data.py",
          "notebooks/train_grpo.ipynb",
          "training/notebook_pipeline.py"
        ],
        "description": "Remove /no_think from SYSTEM_PROMPT in SFT and GRPO. Change enable_thinking: False to True in notebook_pipeline.py chat_template_kwargs. Regenerate SFT data.",
        "verification": "Run training on Colab — verify model produces non-empty <think> blocks and changes SQL after errors"
      },
      "timestamps": {
        "planned": "2026-04-04T05:32:07+00:00",
        "verification_planned": null,
        "started": null,
        "completed": null
      },
      "verification_evidence": null,
      "user_value": null
    },
    {
      "id": "F013",
      "name": "Error-Recovery SFT Trajectories",
      "description": "Add 15-20 SFT trajectories to generate_sft_data.py showing error recovery: model queries with wrong column/table → gets SQL error → re-examines schema via describe/sample → writes corrected query → submits correct answer. Teaches the base policy to recover from mistakes before GRPO, so KL-anchored exploration includes error recovery as a learned pattern.",
      "complexity": "standard",
      "verification_mode": "standard",
      "status": "complete",
      "priority": 13,
      "dependencies": [],
      "docs": {
        "discovery_json": null,
        "discovery_md": null,
        "design_doc": null,
        "delivery_spec": null
      },
      "taste": {
        "source": "none",
        "notes": "Run 7 analysis: error loops are the #1 reward killer. Model repeats same failing query 3-8x because SFT only shows happy paths. No error-recovery pattern in base policy."
      },
      "user_interview": {
        "conducted": "2026-04-04T11:35:48+00:00",
        "skipped": true,
        "skip_reason": "Pattern clear from Run 7 rollout analysis — model needs error-recovery examples in SFT data",
        "value": null,
        "experience": null,
        "maturity": null
      },
      "progress": {
        "implementation_steps": {
          "total": 4,
          "completed": 4
        },
        "verification_tests": {
          "total": 55,
          "passed": 55
        }
      },
      "specs": {
        "implementation": "specs/F013-IMPLEMENTATION_SPEC.md",
        "verification": "specs/F013-VERIFICATION_SPEC.md"
      },
      "timestamps": {
        "planned": "2026-04-04T11:50:45+00:00",
        "verification_planned": "2026-04-04T11:50:45+00:00",
        "started": "2026-04-04T14:10:09Z",
        "completed": "2026-04-04T18:20:00Z"
      },
      "verification_evidence": {
        "mode": "standard",
        "tests_run": 2,
        "tests_passed": 2,
        "timestamp": "2026-04-04T18:20:00Z",
        "command": "uv run pytest tests/unit/test_sft_terminal_message.py -v && uv run python scripts/generate_sft_data.py"
      },
      "user_value": null
    },
    {
      "id": "F014",
      "name": "Stop-After-Correct SFT Trajectories",
      "description": "Add 5-10 SFT trajectories where the model answers correctly and the conversation ends cleanly — no post-episode tool calls. Currently all SFT examples end with the tool response 'Answer submitted: correct.' but the model still generates extra calls afterward during GRPO. Explicitly training on clean episode endings teaches the stop signal.",
      "complexity": "simple",
      "verification_mode": "mvp",
      "status": "complete",
      "priority": 14,
      "dependencies": [
        "F013"
      ],
      "docs": {
        "discovery_json": null,
        "discovery_md": null,
        "design_doc": null,
        "delivery_spec": null
      },
      "taste": {
        "source": "none",
        "notes": "Run 7: model makes 1-3 extra calls after correct answer despite -0.3 post-episode penalty. SFT ending is ambiguous — model sees tool response but has no 'done generating' signal."
      },
      "user_interview": {
        "conducted": "2026-04-04T11:35:48+00:00",
        "skipped": true,
        "skip_reason": "Simple extension of generate_sft_data.py — add final assistant turn with no tool call",
        "value": null,
        "experience": null,
        "maturity": null
      },
      "progress": {
        "implementation_steps": {
          "total": 1,
          "completed": 1
        },
        "verification_tests": {
          "total": 21,
          "passed": 2
        }
      },
      "specs": {
        "implementation": "specs/F014-IMPLEMENTATION_SPEC.md",
        "verification": "specs/F014-VERIFICATION_SPEC.md"
      },
      "inline_spec": {
        "files": [
          "scripts/generate_sft_data.py"
        ],
        "description": "After the final 'Answer submitted: correct.' tool response, do NOT append another assistant turn. The SFT example ends at the tool response. TRL's assistant_only_loss means the model only trains on assistant turns, so ending after the final tool response teaches the model that no further generation is needed. Alternatively, add a short assistant turn with just a stop token or empty content.",
        "verification": "Inspect rendered SFT data — confirm examples end after correct answer tool response. Run GRPO training and check post-episode call count decreases."
      },
      "timestamps": {
        "planned": "2026-04-04T11:48:20+00:00",
        "verification_planned": "2026-04-04T11:48:20+00:00",
        "started": "2026-04-04T14:17:03Z",
        "completed": "2026-04-04T14:17:03Z"
      },
      "verification_evidence": {
        "mode": "mvp",
        "tests_run": 2,
        "tests_passed": 2,
        "timestamp": "2026-04-04T14:17:03Z",
        "command": "uv run pytest tests/unit/test_sft_terminal_message.py -v",
        "verifier_result": "approved"
      },
      "demo": {
        "path": "specs/F014-DEMO.md",
        "generated_at": "2026-04-04T14:21:55Z",
        "mode": "artifact_build",
        "status": "generated",
        "requires_user_verification": true,
        "verification_surfaces": [
          "local_sft_generation",
          "artifact_inspection",
          "training_runtime_behavior"
        ],
        "evidence_refs": [
          "specs/F014-VERIFICATION_SPEC.md",
          "specs/F014-DEMO.md"
        ],
        "note": "Local SFT artifact and terminal-message shape are verified; reduction in post-answer calls must be confirmed in GRPO runtime."
      },
      "user_value": "SFT trajectories now end with an explicit terminal assistant message after correct answer confirmation, teaching a clear stop pattern that helps reduce extra post-answer tool calls during GRPO."
    },
    {
      "id": "F015",
      "name": "Error-Repetition Penalty",
      "description": "In trl_adapter.py, track recent tool calls (function name + arguments) in a short window. When the model makes an exact repeat of any recent call, apply -0.2 penalty. Uses trajectory-level reward aggregation — safe for GRPO (no Markov violation because GRPO uses Monte Carlo returns, not Bellman bootstrapping, and the model's context window already contains full history as augmented state).",
      "complexity": "simple",
      "verification_mode": "standard",
      "status": "complete",
      "priority": 15,
      "dependencies": [],
      "docs": {
        "discovery_json": null,
        "discovery_md": null,
        "design_doc": null,
        "delivery_spec": null
      },
      "taste": {
        "source": "none",
        "notes": "Run 7: model repeats exact same failing query 3-8 times. -0.2 per repeat is moderate enough to avoid the repeat-avoidance trap (preferring novel-but-wrong over correct retry). Exact-match comparison (function+args string equality) is simple and sufficient."
      },
      "user_interview": {
        "conducted": "2026-04-04T11:35:48+00:00",
        "skipped": true,
        "skip_reason": "Small code change in trl_adapter.py — add _recent_calls tracking and repeat penalty",
        "value": null,
        "experience": null,
        "maturity": null
      },
      "progress": {
        "implementation_steps": {
          "total": 2,
          "completed": 2
        },
        "verification_tests": {
          "total": 55,
          "passed": 55
        }
      },
      "specs": {
        "implementation": "specs/F015-IMPLEMENTATION_SPEC.md",
        "verification": "specs/F015-VERIFICATION_SPEC.md"
      },
      "inline_spec": {
        "files": [
          "training/trl_adapter.py",
          "tests/unit/test_trl_adapter.py",
          "training/rollout.py",
          "training/notebook_pipeline.py",
          "notebooks/train_grpo.ipynb"
        ],
        "description": "Add self._recent_calls: collections.deque[tuple[str, str]] with maxlen=3 and self._repeat_count: int in __init__. In each tool method (describe, query, sample, answer), before executing: build call_key = (method_name, arg_value). If call_key appears in self._recent_calls, apply _REPEAT_PENALTY = -0.2 and increment self._repeat_count. Always append call_key after execution. Reset self._recent_calls and self._repeat_count in reset().",
        "verification": "Unit test: call query('SELECT 1') twice in a row, verify reward includes -0.2 repeat penalty. Call query('SELECT 1') then query('SELECT 2'), verify no penalty."
      },
      "timestamps": {
        "planned": "2026-04-04T11:47:59+00:00",
        "verification_planned": "2026-04-04T11:47:59+00:00",
        "started": "2026-04-05T05:23:09Z",
        "completed": "2026-04-05T05:43:04Z"
      },
      "verification_evidence": {
        "mode": "standard",
        "tests_run": 55,
        "tests_passed": 55,
        "timestamp": "2026-04-05T05:43:04Z",
        "command": "uv run pytest tests/unit/test_trl_adapter.py -v && uv run pytest tests/unit/test_trl_adapter.py -v -k \"repeat or last_call\" && uv run pytest tests/e2e/test_training_e2e.py -v",
        "verifier_result": "approved"
      },
      "demo": {
        "path": "specs/F015-DEMO.md",
        "generated_at": "2026-04-05T05:50:52Z",
        "mode": "artifact_build",
        "status": "generated",
        "requires_user_verification": true,
        "verification_surfaces": [
          "local_pytest_verification",
          "training_runtime_behavior"
        ],
        "evidence_refs": [
          "specs/F015-VERIFICATION_SPEC.md",
          "specs/F015-DEMO.md"
        ],
        "note": "Strongest local proof is targeted/full pytest and training e2e smoke; reduced repeat loops in live GRPO trajectories still requires user runtime confirmation."
      },
      "user_value": "Agents now receive a deterministic repeat-call penalty for reused tool calls within a short recent-call window (including alternating reuse), reducing degenerate GRPO loops while preserving non-repeated exploration behavior."
    },
    {
      "id": "F016",
      "name": "Pre-Publication Code Quality Sweep",
      "description": "Refactor, lint fixes, and code smell cleanup before blog post publication. Runs ruff --fix, removes dead code, fixes line lengths, and addresses unused variables. Staff review of core modules (reward, verifier, trl_adapter, sql_environment) for correctness and clarity.",
      "complexity": "simple",
      "verification_mode": "mvp",
      "status": "not_started",
      "priority": 1,
      "dependencies": [],
      "docs": {
        "discovery_json": null,
        "discovery_md": null,
        "design_doc": null,
        "delivery_spec": null
      },
      "taste": {
        "source": "user_interview",
        "notes": "Blog deadline tomorrow — codebase must be presentable for open-source judges"
      },
      "user_interview": {
        "conducted": "2026-04-11T15:55:16Z",
        "skipped": false,
        "skip_reason": null,
        "value": {
          "question": "What will users be able to do that they couldn't before?",
          "response": "Judges and readers reviewing the GitHub repo will see clean, well-linted code without obvious smells. The codebase matches the quality story told in the blog post."
        },
        "experience": {
          "question": "Walk me through using this. What would delight you? What would frustrate you?",
          "delights": [
            "Zero ruff errors on clone",
            "No dead imports or unused variables",
            "Core modules pass a staff-level review"
          ],
          "frustrations": [
            "Visible linting errors in the repo judges clone",
            "Commented-out code or debug prints left in",
            "Inconsistent formatting between files"
          ]
        },
        "maturity": {
          "question": "Is this exploratory, MVP, or production?",
          "response": "mvp",
          "rationale": "Ship-blocking cleanup, not a deep refactor. Fix what's visible, don't reorganize."
        }
      },
      "progress": {
        "implementation_steps": {
          "total": 4,
          "completed": 0
        },
        "verification_tests": {
          "total": 2,
          "passed": 0
        }
      },
      "specs": {
        "implementation": null,
        "verification": null
      },
      "inline_spec": {
        "files": [
          "server/sql_environment.py",
          "server/verifier.py",
          "server/reward.py",
          "training/trl_adapter.py",
          "training/config.py",
          "training/notebook_pipeline.py",
          "training/data_loading.py",
          "evaluation/policies.py",
          "evaluation/runner.py",
          "scripts/generate_sft_data.py",
          "tests/"
        ],
        "description": "Four steps: (1) ruff check --fix + ruff format, (2) manual fix remaining lint errors (line length, unused vars, dead imports), (3) spec-staff-review on core modules, (4) address review findings. Inline verification: ruff check passes with 0 errors, all existing tests pass.",
        "verification": "ruff check . returns 0 errors; uv run python -m pytest tests/ passes; staff review findings addressed or documented"
      },
      "timestamps": {
        "planned": "2026-04-11T15:55:16Z",
        "verification_planned": null,
        "started": null,
        "completed": null
      },
      "verification_evidence": null,
      "user_value": null
    }
  ]
}