File size: 2,911 Bytes
2ce1061
 
 
 
d510c1d
 
 
 
 
2ce1061
 
c01667e
 
 
 
 
 
 
 
 
d510c1d
2ce1061
 
 
 
 
 
 
 
 
 
c01667e
 
 
 
2ce1061
 
c01667e
2ce1061
c01667e
2ce1061
c01667e
 
2ce1061
 
c01667e
2ce1061
c01667e
2ce1061
c01667e
 
2ce1061
 
d510c1d
2ce1061
c01667e
2ce1061
c01667e
 
2ce1061
 
 
c01667e
 
 
 
 
 
 
d510c1d
c01667e
 
 
d510c1d
c01667e
 
 
d510c1d
c01667e
 
 
d510c1d
c01667e
 
 
 
 
d510c1d
c01667e
 
d510c1d
c01667e
 
d510c1d
c01667e
 
d510c1d
c01667e
 
d510c1d
c01667e
 
d510c1d
c01667e
 
d510c1d
c01667e
 
d510c1d
c01667e
2ce1061
 
 
 
 
d510c1d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# Manifest for the code-debug-env environment: identity, summary, and labels.
spec_version: 1
name: code-debug-env
type: typed
# Folded scalar: the lines below are joined with single spaces into one
# paragraph that ends with a single newline.
description: >
  A real-world RL environment where an LLM agent diagnoses and fixes buggy
  Python code across three difficulty levels (easy, medium, hard). Tasks
  cover real-world domains: data processing, string algorithms, API
  validation, sorting, dynamic programming, and graph algorithms. Rewards
  are partial and proportional to test cases passed, with bonuses for
  correct explanations on hard tasks.

# Quoted so the release number is always read as a string.
version: "1.0.0"
author: Souravdanyal

# Free-form keyword labels attached to this environment.
tags:
  - code-debugging
  - python
  - reinforcement-learning
  - openenv
  - llm-agent
  - software-engineering
  - real-world

# Runtime the environment is packaged for, and the port it uses.
runtime:
  type: docker
  port: 7860

# Server entry point and bind address. `port` carries the same value as
# `runtime.port`; keep the two in sync when changing either.
app:
  entry: server/app.py
  # Quoted so the dotted address is unambiguously a string.
  host: "0.0.0.0"
  port: 7860

# Episode-level limits.
config:
  # Unit is not stated in this file -- presumably seconds; confirm.
  episode_timeout: 300
  # Same value as the per-task `max_steps` entries under `tasks`.
  max_steps: 5

# One entry per difficulty tier. Every tier shares the same step budget,
# reward range, grader kind, and task count; only the id, difficulty, and
# description differ.
tasks:
  - id: easy
    description: >-
      Fix a single off-by-one, operator, or return bug in a Python function
    difficulty: easy
    max_steps: 5
    reward_range: [0.0, 1.0]
    grader: deterministic
    num_tasks: 15

  - id: medium
    description: >-
      Fix two bugs (logic bug + edge case) so all test cases pass
    difficulty: medium
    max_steps: 5
    reward_range: [0.0, 1.0]
    grader: deterministic
    num_tasks: 15

  - id: hard
    description: >-
      Fix an algorithmic bug AND provide a correct explanation of root cause
    difficulty: hard
    max_steps: 5
    reward_range: [0.0, 1.0]
    grader: deterministic
    num_tasks: 15

# Environment-wide reward bounds; identical to each task's own range.
reward_range: [0.0, 1.0]

# Shape of the action the agent submits.
action_space:
  type: dict
  description: "Agent submits fixed Python code and optional explanation"
  fields:
    # The corrected source code; always required.
    fixed_code:
      type: string
      required: true
      description: >-
        Complete corrected Python function. Must be valid Python including
        imports.
    # Marked `required: false` here, although its own description says
    # hard tasks need it.
    explanation:
      type: string
      required: false
      description: >-
        Required for hard tasks. Explain the bug, root cause, and fix.

# Shape of the observation returned to the agent. Every field carries a
# `type` and a human-readable `description`.
observation_space:
  type: dict
  description: "Returned after reset() and step()"
  fields:
    # --- Task definition ---
    task_id:
      type: string
      description: "Unique task identifier e.g. easy_003"
    difficulty:
      type: enum
      values: [easy, medium, hard]
      description: "Difficulty tier of the current task"
    buggy_code:
      type: string
      description: "The buggy Python function to fix"
    instructions:
      type: string
      description: "Natural language description of what is wrong"
    test_cases_description:
      type: string
      description: "What the test cases check"
    # --- Grading results ---
    # NOTE(review): `reward` and `passed_tests` are described as null on
    # reset although their declared types are float / integer; consumers
    # should treat both as nullable.
    reward:
      type: float
      description: "Score 0.0-1.0 (null on reset)"
    passed_tests:
      type: integer
      description: "Test cases passed (null on reset)"
    total_tests:
      type: integer
      description: "Total test cases (always 3)"
    feedback:
      type: string
      description: "Per-test feedback showing Input, Expected, Got"
    done:
      type: boolean
      description: "True when episode complete"

# Route path for each API operation. Quoted so the leading slash is always
# read as part of a plain string.
api:
  reset: "/reset"
  step: "/step"
  state: "/state"
  health: "/health"
  tasks: "/tasks"