---
# Environment manifest for the CodeArena RL benchmark.
# NOTE(review): this layout was reconstructed from a source in which the whole
# document sat on one line; confirm the nesting against the consumer's schema.

name: codearena-rl-benchmark
description: "RL Benchmark for Autonomous Code Repair — iterative debugging with execution feedback"
# Quoted so the version stays a string.
version: "1.0.0"
# "module.path:Attribute" reference — presumably resolved by the loader; verify.
entrypoint: server.app:CodeArenaEnv

runtime:
  language: python
  # Quoted so the value is not parsed as the float 3.11.
  python_version: "3.11"

# Paths for the reset / step / state operations.
api:
  reset: /reset
  step: /step
  state: /state

# Fields of the observation payload, each mapped to its declared type name.
observation_space:
  type: json
  schema:
    buggy_code: string
    error_log: string
    test_results: string
    previous_attempts: list[string]

# Fields of the action payload, each mapped to its declared type name.
action_space:
  type: json
  schema:
    proposed_fix: string

# Task list: each entry pairs an id with a task file and a grader reference.
tasks:
  - id: easy
    path: tasks/easy.json
    grader: server.grader:grade
  - id: medium
    path: tasks/medium.json
    grader: server.grader:grade
  - id: hard
    path: tasks/hard.json
    grader: server.grader:grade
  - id: type_errors
    path: tasks/type_errors/type_error_1.json
    grader: server.grader:grade
  - id: security_bugs
    path: tasks/security_bugs/security_bug_1.json
    grader: server.grader:grade

# Units are given by the key suffixes (seconds / minutes).
limits:
  step_timeout_seconds: 2
  max_runtime_minutes: 20