# File-viewer residue kept as comments: Spaces status "Running"; file size 2,911 bytes.
# Blame-gutter commits: 2ce1061, d510c1d, c01667e (line-number gutter 1-117 omitted).
spec_version: 1
# Identity and catalog metadata for the environment.
name: code-debug-env
type: typed
# Folded scalar (`>`): the indented lines below are joined with spaces into a
# single paragraph. The body must be indented deeper than the key to parse.
description: >
  A real-world RL environment where an LLM agent diagnoses and fixes buggy Python
  code across three difficulty levels (easy, medium, hard). Tasks cover real-world
  domains: data processing, string algorithms, API validation, sorting, dynamic
  programming, and graph algorithms. Rewards are partial and proportional to test
  cases passed, with bonuses for correct explanations on hard tasks.
# Quoted so every loader treats the version as a string.
version: "1.0.0"
author: Souravdanyal
# Free-form discovery tags.
tags:
  - code-debugging
  - python
  - reinforcement-learning
  - openenv
  - llm-agent
  - software-engineering
  - real-world
# Runtime settings. `type` and `port` are nested here so they do not collide
# with the top-level `type` key or with `app.port`.
runtime:
  type: docker
  port: 7860
# Application server settings. The port is the same value as `runtime.port`.
app:
  entry: server/app.py
  # Quoted so the dotted address is always read as a string.
  host: "0.0.0.0"
  port: 7860
# Episode-level limits.
config:
  # NOTE(review): unit is not stated in this file; presumably seconds, confirm.
  episode_timeout: 300
  # Same value as the per-task `max_steps` entries under `tasks`.
  max_steps: 5
# Task tiers, one sequence item per difficulty level. Each item carries its own
# step budget, reward bounds, grader name, and task count (3 tiers x 15 tasks).
tasks:
  - id: easy
    description: "Fix a single off-by-one, operator, or return bug in a Python function"
    difficulty: easy
    max_steps: 5
    reward_range: [0.0, 1.0]
    grader: deterministic
    num_tasks: 15
  - id: medium
    description: "Fix two bugs (logic bug + edge case) so all test cases pass"
    difficulty: medium
    max_steps: 5
    reward_range: [0.0, 1.0]
    grader: deterministic
    num_tasks: 15
  - id: hard
    description: "Fix an algorithmic bug AND provide a correct explanation of root cause"
    difficulty: hard
    max_steps: 5
    reward_range: [0.0, 1.0]
    grader: deterministic
    num_tasks: 15
# Environment-wide reward bounds as [min, max]; every entry under `tasks`
# declares the same range. NOTE(review): assumed to be a top-level key rather
# than part of the last task, which already has its own `reward_range`.
reward_range: [0.0, 1.0]
# Schema of the action the agent submits on each step.
action_space:
  type: dict
  description: "Agent submits fixed Python code and optional explanation"
  fields:
    fixed_code:
      type: string
      required: true
      description: "Complete corrected Python function. Must be valid Python including imports."
    explanation:
      type: string
      # Marked optional in the schema although the description says hard tasks
      # require it. NOTE(review): presumably enforced outside this file; confirm.
      required: false
      description: "Required for hard tasks. Explain the bug, root cause, and fix."
# Schema of the observation returned to the agent.
observation_space:
  type: dict
  description: "Returned after reset() and step()"
  fields:
    task_id:
      type: string
      description: "Unique task identifier e.g. easy_003"
    difficulty:
      type: enum
      values: [easy, medium, hard]
    buggy_code:
      type: string
      description: "The buggy Python function to fix"
    instructions:
      type: string
      description: "Natural language description of what is wrong"
    test_cases_description:
      type: string
      description: "What the test cases check"
    # `reward` and `passed_tests` are described as null on reset, so consumers
    # should expect a missing value there despite the declared scalar types.
    reward:
      type: float
      description: "Score 0.0-1.0 (null on reset)"
    passed_tests:
      type: integer
      description: "Test cases passed (null on reset)"
    total_tests:
      type: integer
      description: "Total test cases (always 3)"
    feedback:
      type: string
      description: "Per-test feedback showing Input, Expected, Got"
    done:
      type: boolean
      description: "True when episode complete"
# Route paths exposed by the server, keyed by operation name.
# NOTE(review): HTTP methods are not specified in this file.
api:
  reset: /reset
  step: /step
  state: /state
  health: /health
  tasks: /tasks