# code-debug-env / openenv.yaml
# Uploaded by Souravdanyal; last commit d510c1d ("error fixing"); file size 2.91 kB.
# Top-level identity of the code-debug-env environment manifest.
spec_version: 1
name: code-debug-env
type: typed
# Folded block scalar: the indented lines are joined with spaces into a single
# paragraph. The body must be indented deeper than the `description` key.
description: >
  A real-world RL environment where an LLM agent diagnoses and fixes buggy Python
  code across three difficulty levels (easy, medium, hard). Tasks cover real-world
  domains: data processing, string algorithms, API validation, sorting, dynamic
  programming, and graph algorithms. Rewards are partial and proportional to test
  cases passed, with bonuses for correct explanations on hard tasks.
# Quoted so the version is unambiguously a string for every parser.
version: "1.0.0"
author: Souravdanyal
# Free-form labels attached to this environment.
tags:
  - code-debugging
  - python
  - reinforcement-learning
  - openenv
  - llm-agent
  - software-engineering
  - real-world
# Runtime the environment is packaged for. These keys are nested under
# `runtime` so they do not collide with the top-level `type` or with `app.port`.
runtime:
  type: docker
  # Same port number as `app.port`.
  port: 7860
# Server application settings.
# NOTE(review): `app` is placed at top level here; confirm against the consumer
# whether it should instead be nested under `runtime`.
app:
  # Path of the server entry module.
  entry: server/app.py
  host: 0.0.0.0
  # Same port number as `runtime.port`.
  port: 7860
# Environment-wide episode limits.
config:
  # NOTE(review): unit is not stated in this file; presumably seconds — confirm.
  episode_timeout: 300
  # Matches the per-task `max_steps` value declared under `tasks`.
  max_steps: 5
# Task groups, one entry per difficulty level. Every key of an entry must be
# indented under its `- id:` line so that it belongs to that list item.
tasks:
  - id: easy
    description: "Fix a single off-by-one, operator, or return bug in a Python function"
    difficulty: easy
    max_steps: 5
    reward_range: [0.0, 1.0]
    grader: deterministic
    num_tasks: 15
  - id: medium
    description: "Fix two bugs (logic bug + edge case) so all test cases pass"
    difficulty: medium
    max_steps: 5
    reward_range: [0.0, 1.0]
    grader: deterministic
    num_tasks: 15
  - id: hard
    description: "Fix an algorithmic bug AND provide a correct explanation of root cause"
    difficulty: hard
    max_steps: 5
    reward_range: [0.0, 1.0]
    grader: deterministic
    num_tasks: 15
# Environment-level reward bounds, written as [minimum, maximum].
reward_range:
  - 0.0
  - 1.0
# Shape of the action the agent submits.
action_space:
  type: dict
  description: "Agent submits fixed Python code and optional explanation"
  fields:
    fixed_code:
      type: string
      required: true
      description: "Complete corrected Python function. Must be valid Python including imports."
    # NOTE(review): declared optional here, yet the description says hard tasks
    # require it — confirm that the requirement is enforced per difficulty.
    explanation:
      type: string
      required: false
      description: "Required for hard tasks. Explain the bug, root cause, and fix."
# Shape of the observation returned to the agent.
observation_space:
  type: dict
  description: "Returned after reset() and step()"
  fields:
    task_id:
      type: string
      description: "Unique task identifier e.g. easy_003"
    difficulty:
      type: enum
      values: [easy, medium, hard]
    buggy_code:
      type: string
      description: "The buggy Python function to fix"
    instructions:
      type: string
      description: "Natural language description of what is wrong"
    test_cases_description:
      type: string
      description: "What the test cases check"
    # NOTE(review): `reward` and `passed_tests` are described as null on reset,
    # but their declared types are not marked nullable — confirm that consumers
    # accept null for these fields.
    reward:
      type: float
      description: "Score 0.0-1.0 (null on reset)"
    passed_tests:
      type: integer
      description: "Test cases passed (null on reset)"
    total_tests:
      type: integer
      description: "Total test cases (always 3)"
    feedback:
      type: string
      description: "Per-test feedback showing Input, Expected, Got"
    done:
      type: boolean
      description: "True when episode complete"
# URL paths for each API operation, keyed by operation name.
api:
  reset: /reset
  step: /step
  state: /state
  health: /health
  tasks: /tasks