# Spaces: Running
---
# Environment manifest for `code-debug-env`: metadata, runtime wiring, task
# tiers, the action/observation schemas, and the HTTP routes.
#
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been flattened. Confirm against the consumer's schema that
# `runtime`, `app`, `config`, and the final `reward_range` are top-level keys.

spec_version: 1
name: code-debug-env
type: typed
description: >
  A real-world RL environment where an LLM agent diagnoses and fixes buggy Python
  code across three difficulty levels (easy, medium, hard). Tasks cover real-world
  domains: data processing, string algorithms, API validation, sorting, dynamic
  programming, and graph algorithms. Rewards are partial and proportional to test
  cases passed, with bonuses for correct explanations on hard tasks.
# Quoted so the version can never be re-typed as a number by tooling.
version: "1.0.0"
author: Souravdanyal
tags:
  - code-debugging
  - python
  - reinforcement-learning
  - openenv
  - llm-agent
  - software-engineering
  - real-world

# Container runtime and the port it exposes.
runtime:
  type: docker
  port: 7860

# Server entry point and bind address; `port` repeats `runtime.port`.
app:
  entry: server/app.py
  host: "0.0.0.0"
  port: 7860

# Episode-level limits.
# NOTE(review): `episode_timeout` is presumably in seconds — confirm.
config:
  episode_timeout: 300
  max_steps: 5

# One entry per difficulty tier. Every tier uses the same step budget,
# reward range, grader, and task count; only the bug profile differs.
tasks:
  - id: easy
    description: "Fix a single off-by-one, operator, or return bug in a Python function"
    difficulty: easy
    max_steps: 5
    reward_range: [0.0, 1.0]
    grader: deterministic
    num_tasks: 15
  - id: medium
    description: "Fix two bugs (logic bug + edge case) so all test cases pass"
    difficulty: medium
    max_steps: 5
    reward_range: [0.0, 1.0]
    grader: deterministic
    num_tasks: 15
  - id: hard
    description: "Fix an algorithmic bug AND provide a correct explanation of root cause"
    difficulty: hard
    max_steps: 5
    reward_range: [0.0, 1.0]
    grader: deterministic
    num_tasks: 15

# Environment-wide reward bounds; identical to each tier's `reward_range`.
reward_range: [0.0, 1.0]

# Payload the agent sends on each step.
action_space:
  type: dict
  description: "Agent submits fixed Python code and optional explanation"
  fields:
    fixed_code:
      type: string
      required: true
      description: "Complete corrected Python function. Must be valid Python including imports."
    # Optional at the schema level, although the description calls it
    # required for hard tasks.
    explanation:
      type: string
      required: false
      description: "Required for hard tasks. Explain the bug, root cause, and fix."

# Payload the environment returns to the agent.
observation_space:
  type: dict
  description: "Returned after reset() and step()"
  fields:
    task_id:
      type: string
      description: "Unique task identifier e.g. easy_003"
    difficulty:
      type: enum
      values: [easy, medium, hard]
    buggy_code:
      type: string
      description: "The buggy Python function to fix"
    instructions:
      type: string
      description: "Natural language description of what is wrong"
    test_cases_description:
      type: string
      description: "What the test cases check"
    # `reward` and `passed_tests` are described as null on reset, so
    # consumers must treat them as nullable despite the declared types.
    reward:
      type: float
      description: "Score 0.0-1.0 (null on reset)"
    passed_tests:
      type: integer
      description: "Test cases passed (null on reset)"
    total_tests:
      type: integer
      description: "Total test cases (always 3)"
    feedback:
      type: string
      description: "Per-test feedback showing Input, Expected, Got"
    done:
      type: boolean
      description: "True when episode complete"

# HTTP routes, keyed by operation name.
api:
  reset: /reset
  step: /step
  state: /state
  health: /health
  tasks: /tasks