# Identity fields for this environment definition.
spec_version: 1
name: code-debug-env
type: typed
# Folded block scalar (`>`): the indented lines below are joined with spaces
# into one paragraph, and a single trailing newline is kept in the loaded value.
description: >
  A real-world RL environment where an LLM agent diagnoses and fixes buggy Python
  code across three difficulty levels (easy, medium, hard). Tasks cover real-world
  domains: data processing, string algorithms, API validation, sorting, dynamic
  programming, and graph algorithms. Rewards are partial and proportional to test
  cases passed, with bonuses for correct explanations on hard tasks.

# Release metadata. `1.0.0` loads as a string (two dots, so it is not a float).
version: 1.0.0
author: Souravdanyal

# Free-form labels describing the environment.
# Presumably used for catalog search/discovery -- confirm with the consumer.
tags:
  - code-debugging
  - python
  - reinforcement-learning
  - openenv
  - llm-agent
  - software-engineering
  - real-world

# How the environment is packaged and run: as a Docker container.
runtime:
  type: docker
  # Same value as app.port (7860). Presumably the container must expose the
  # port the app listens on, so change both together -- TODO confirm.
  port: 7860

# Server application settings.
app:
  # Path to the server entry module; presumably relative to the repository
  # root -- confirm against the Docker build context.
  entry: server/app.py
  # 0.0.0.0 is the IPv4 wildcard address (all interfaces).
  host: 0.0.0.0
  # Same value as runtime.port (7860).
  port: 7860

# Environment-wide episode limits.
config:
  # NOTE(review): unit is not stated in this file; presumably seconds -- confirm.
  episode_timeout: 300
  # Same value as the per-task max_steps declared for every entry under `tasks`.
  max_steps: 5

# Task tiers, one entry per difficulty level. All three entries declare the
# same limits: max_steps 5, reward_range [0.0, 1.0], a `deterministic` grader,
# and num_tasks 15 (45 across the three tiers).
# NOTE(review): each entry repeats config.max_steps; which one takes precedence
# is not visible in this file -- confirm with the loader.
tasks:
  # Easy: a single bug per function.
  - id: easy
    description: "Fix a single off-by-one, operator, or return bug in a Python function"
    difficulty: easy
    max_steps: 5
    reward_range: [0.0, 1.0]
    grader: deterministic
    # Presumably the number of distinct problems in this tier -- TODO confirm.
    num_tasks: 15

  # Medium: two bugs per function (one logic bug plus one edge case).
  - id: medium
    description: "Fix two bugs (logic bug + edge case) so all test cases pass"
    difficulty: medium
    max_steps: 5
    reward_range: [0.0, 1.0]
    grader: deterministic
    num_tasks: 15

  # Hard: an algorithmic bug, and the agent is also asked to explain the root
  # cause (see the `explanation` field under action_space).
  - id: hard
    description: "Fix an algorithmic bug AND provide a correct explanation of root cause"
    difficulty: hard
    max_steps: 5
    reward_range: [0.0, 1.0]
    grader: deterministic
    num_tasks: 15

# Environment-level reward bounds; identical to every per-task reward_range.
# NOTE(review): the top-level description mentions "bonuses" for explanations
# on hard tasks -- confirm the bonus is already included within the 1.0 cap.
reward_range: [0.0, 1.0]

# Shape of the action an agent submits: a dict with one mandatory field
# (fixed_code) and one optional field (explanation).
action_space:
  type: dict
  description: "Agent submits fixed Python code and optional explanation"
  fields:
    # The agent's corrected code, sent as a single string.
    fixed_code:
      type: string
      required: true
      description: "Complete corrected Python function. Must be valid Python including imports."
    # NOTE(review): declared `required: false`, yet the description says it is
    # required for hard tasks. Presumably the schema keeps it optional so that
    # easy/medium actions validate without it -- confirm how the server treats
    # a hard-task action that omits this field.
    explanation:
      type: string
      required: false
      description: "Required for hard tasks. Explain the bug, root cause, and fix."

# Shape of the observation dict the environment returns from reset() and step().
observation_space:
  type: dict
  description: "Returned after reset() and step()"
  fields:
    task_id:
      type: string
      description: "Unique task identifier e.g. easy_003"
    # Allowed values match the three ids/difficulties listed under `tasks`.
    # This is the only field here without a `description` key.
    difficulty:
      type: enum
      values: [easy, medium, hard]
    buggy_code:
      type: string
      description: "The buggy Python function to fix"
    instructions:
      type: string
      description: "Natural language description of what is wrong"
    test_cases_description:
      type: string
      description: "What the test cases check"
    # NOTE(review): described as null on reset, but the declared type is a
    # plain float with no nullable marker -- confirm the consumer's schema
    # accepts null here.
    reward:
      type: float
      description: "Score 0.0-1.0 (null on reset)"
    # NOTE(review): same nullable-vs-type mismatch as `reward`.
    passed_tests:
      type: integer
      description: "Test cases passed (null on reset)"
    total_tests:
      type: integer
      description: "Total test cases (always 3)"
    feedback:
      type: string
      description: "Per-test feedback showing Input, Expected, Got"
    done:
      type: boolean
      description: "True when episode complete"

# Route paths for the environment's operations. `reset` and `step` correspond
# to the reset()/step() calls named in observation_space.
# Presumably HTTP endpoints served by app.entry on port 7860 -- confirm the
# methods and payloads in server/app.py.
api:
  reset: /reset
  step: /step
  state: /state
  health: /health
  tasks: /tasks