# Manifest metadata for the CodeArena RL benchmark environment.
name: codearena-rl-benchmark
# Folded scalar: the lines below are joined with single spaces and the
# trailing newline is stripped, yielding one single-line string.
description: >-
  RL Benchmark for Autonomous Code Repair — iterative debugging
  with execution feedback
# Quoted so the version is always read as a string.
version: '1.0.0'
# Written as "dotted.path:Name"; presumably the object the host imports to
# serve this environment — confirm against the loader.
entrypoint: server.app:CodeArenaEnv

# Runtime declared for the environment.
runtime:
  language: python
  # Must stay a quoted string: a bare 3.11 would be parsed as a float
  # (and a version such as 3.10 would silently become 3.1).
  python_version: '3.11'

# Maps each environment operation to the path it is exposed on.
# NOTE(review): presumably HTTP routes served by the entrypoint — confirm
# against the server implementation.
api:
  reset: /reset
  step: /step
  state: /state

# Observations are JSON objects; `schema` lists each field and its declared
# type name. The type names are plain strings whose interpretation is left
# to whatever consumes this manifest.
observation_space:
  type: json
  schema:
    buggy_code: string
    error_log: string
    test_results: string
    # Quoted because the value contains flow indicators ([ ]); it is still
    # the literal string list[string].
    previous_attempts: 'list[string]'

# Actions are JSON objects with a single declared field, `proposed_fix`.
action_space:
  type: json
  schema:
    proposed_fix: string

# Task registry. Each entry has:
#   id     - task identifier
#   path   - JSON task definition file
#            (presumably relative to the project root — TODO confirm)
#   grader - "dotted.path:name" reference to the grading callable
# Every task currently uses the same grader, server.grader:grade.
tasks:
  - id: easy
    path: tasks/easy.json
    grader: server.grader:grade
  - id: medium
    path: tasks/medium.json
    grader: server.grader:grade
  - id: hard
    path: tasks/hard.json
    grader: server.grader:grade
  # The two category tasks below each point at a single file inside a
  # category directory rather than at a top-level tasks/*.json file.
  - id: type_errors
    path: tasks/type_errors/type_error_1.json
    grader: server.grader:grade
  - id: security_bugs
    path: tasks/security_bugs/security_bug_1.json
    grader: server.grader:grade

# Execution limits; the unit of each value is given by its key suffix.
limits:
  # Seconds. NOTE(review): presumably the time budget for a single /step
  # call — confirm where this is enforced.
  step_timeout_seconds: 2
  # Minutes. NOTE(review): presumably the cap on a whole benchmark run —
  # confirm against the runner.
  max_runtime_minutes: 20