# Manifest for the code-debug-env environment: metadata, runtime/app settings,
# task catalogue, action/observation schemas, and HTTP API routes.
#
# NOTE(review): the previous revision of this file had its indentation and line
# breaks collapsed, so the nesting below is reconstructed. `app`, `config`, and
# the standalone `reward_range` are assumed to be top-level keys -- confirm
# against the consumer's schema.
spec_version: 1
name: code-debug-env
type: typed
description: >
  A real-world RL environment where an LLM agent diagnoses and fixes buggy
  Python code across three difficulty levels (easy, medium, hard). Tasks cover
  real-world domains: data processing, string algorithms, API validation,
  sorting, dynamic programming, and graph algorithms. Rewards are partial and
  proportional to test cases passed, with bonuses for correct explanations on
  hard tasks.
# Quoted so the version is always read as a string, never as a number.
version: "1.0.0"
author: Souravdanyal

tags:
  - code-debugging
  - python
  - reinforcement-learning
  - openenv
  - llm-agent
  - software-engineering
  - real-world

runtime:
  type: docker
  port: 7860

app:
  entry: server/app.py
  # Quoted so the address is always read as a string.
  host: "0.0.0.0"
  # Same port number as runtime.port above.
  port: 7860

config:
  episode_timeout: 300
  max_steps: 5

# One entry per difficulty level; every entry uses the same keys.
tasks:
  - id: easy
    description: "Fix a single off-by-one, operator, or return bug in a Python function"
    difficulty: easy
    max_steps: 5
    reward_range: [0.0, 1.0]
    grader: deterministic
    num_tasks: 15
  - id: medium
    description: "Fix two bugs (logic bug + edge case) so all test cases pass"
    difficulty: medium
    max_steps: 5
    reward_range: [0.0, 1.0]
    grader: deterministic
    num_tasks: 15
  - id: hard
    description: "Fix an algorithmic bug AND provide a correct explanation of root cause"
    difficulty: hard
    max_steps: 5
    reward_range: [0.0, 1.0]
    grader: deterministic
    num_tasks: 15

# Environment-wide reward bounds; identical to each task's reward_range.
reward_range: [0.0, 1.0]

action_space:
  type: dict
  description: "Agent submits fixed Python code and optional explanation"
  fields:
    fixed_code:
      type: string
      required: true
      description: "Complete corrected Python function. Must be valid Python including imports."
    explanation:
      type: string
      required: false
      description: "Required for hard tasks. Explain the bug, root cause, and fix."

observation_space:
  type: dict
  description: "Returned after reset() and step()"
  fields:
    task_id:
      type: string
      description: "Unique task identifier e.g. easy_003"
    difficulty:
      type: enum
      values: [easy, medium, hard]
    buggy_code:
      type: string
      description: "The buggy Python function to fix"
    instructions:
      type: string
      description: "Natural language description of what is wrong"
    test_cases_description:
      type: string
      description: "What the test cases check"
    reward:
      type: float
      description: "Score 0.0-1.0 (null on reset)"
    passed_tests:
      type: integer
      description: "Test cases passed (null on reset)"
    total_tests:
      type: integer
      description: "Total test cases (always 3)"
    feedback:
      type: string
      description: "Per-test feedback showing Input, Expected, Got"
    done:
      type: boolean
      description: "True when episode complete"

# HTTP routes, keyed by operation name.
api:
  reset: /reset
  step: /step
  state: /state
  health: /health
  tasks: /tasks