---
# Environment manifest for the TeamForge benchmark: package metadata, the
# callable interface (reset/step/state), the action and observation schemas,
# the reward contract, the task catalogue, and runtime/deployment settings.
name: teamforge
version: "1.1.0"
description: >
  A structured multi-phase benchmark for autonomous software engineering
  agents. The agent simulates a full software development team: planning,
  coding, testing, reviewing, and reflecting — inside a real isolated Git
  repository.
author: TeamForge
license: MIT

# ── OpenEnv Interface ──────────────────────────────────────────────────────────
entry_point: environment.TeamForgeEnv

methods:
  reset:
    description: "Start a new episode for the given task_id. Returns initial Observation."
    parameters:
      task_id:
        type: string
        # Kept in sync with the `id` of every entry under `tasks` below, so
        # that each declared task can actually be selected.
        enum:
          - easy_bugfix_chunk_list
          - medium_refactor_stats
          - hard_lru_cache_performance
          - bonus_task
        description: "Which task to run this episode."
  step:
    description: "Execute one typed action. Returns updated Observation with reward."
    parameters:
      action:
        type: object
        description: "A typed Action model (see action_space below)."
  state:
    description: "Return current environment state as a plain serialisable dict."

# ── Action Space ───────────────────────────────────────────────────────────────
action_space:
  type: union
  description: "One of 8 structured actions. Discriminated by the `type` field."
  actions:
    - name: plan_step
      fields:
        type: {type: literal, value: plan_step}
        step_number: {type: integer, minimum: 1}
        description: {type: string, minLength: 5}
        estimated_effort: {type: string, enum: [low, medium, high]}
        depends_on: {type: array, items: integer}

    - name: edit_file
      fields:
        type: {type: literal, value: edit_file}
        file_path: {type: string}
        content: {type: string}
        reason: {type: string, minLength: 5}

    - name: run_tests
      fields:
        type: {type: literal, value: run_tests}
        test_path: {type: string, nullable: true}
        timeout_seconds:
          type: integer
          minimum: 5
          maximum: 120
          default: 30

    - name: run_lint
      fields:
        type: {type: literal, value: run_lint}
        fix: {type: boolean, default: false}
        file_path: {type: string, nullable: true}

    - name: generate_review
      fields:
        type: {type: literal, value: generate_review}
        focus_areas: {type: array, items: string}
        review_text: {type: string, minLength: 20}

    - name: commit
      fields:
        type: {type: literal, value: commit}
        message: {type: string, minLength: 10}
        files: {type: array, items: string}

    - name: self_reflect
      fields:
        type: {type: literal, value: self_reflect}
        what_went_well: {type: string, minLength: 10}
        what_to_improve: {type: string, minLength: 10}
        adjusted_plan: {type: string, nullable: true}

    - name: request_iteration
      fields:
        type: {type: literal, value: request_iteration}
        reason: {type: string, minLength: 10}
        target_issues: {type: array, items: string}

# ── Observation Space ──────────────────────────────────────────────────────────
observation_space:
  type: object
  description: "Full typed Observation returned after every step() and reset()."
  fields:
    task_id: {type: string}
    task_description: {type: string}
    # Covers every `difficulty` value used under `tasks`, including `bonus`.
    difficulty: {type: string, enum: [easy, medium, hard, bonus]}
    step_number: {type: integer}
    max_steps: {type: integer}
    phase:
      type: string
      enum: [planning, coding, testing, reviewing, reflecting, done]
    repo_files:
      type: array
      description: "List of FileSnapshot objects (path, content, size_bytes)"
    git_log: {type: array, items: string}
    last_action_type: {type: string, nullable: true}
    last_action_status: {type: string, enum: [success, failure, partial]}
    last_action_output: {type: string}
    test_results:
      type: object
      nullable: true
      description: "TestResult: passed, failed, errors, output, duration_seconds"
    lint_results:
      type: object
      nullable: true
      description: "LintResult: violations, output, score"
    plan:
      type: array
      description: "List of PlanStep actions issued so far"
    reviews:
      type: array
      description: "List of ReviewArtifact objects"
    reflections:
      type: array
      description: "List of ReflectionArtifact objects"
    reward: {type: number, description: "Reward for the last action"}
    cumulative_reward: {type: number}
    done: {type: boolean}
    info: {type: object}

# ── Reward ─────────────────────────────────────────────────────────────────────
reward:
  # NOTE(review): `range` is written as an inclusive interval while the
  # description says "strictly between 0 and 1" — confirm which is intended.
  range: [0.0, 1.0]
  type: dense
  description: >
    Dense shaped reward. Positive for: correct plan steps, edits, passing
    tests, clean lint, reviews, reflections, commits. Always strictly between
    0 and 1.

# ── Tasks ──────────────────────────────────────────────────────────────────────
# Every `id` here must also be listed in methods.reset.parameters.task_id.enum,
# and every `difficulty` in observation_space.fields.difficulty.enum.
tasks:
  - id: easy_bugfix_chunk_list
    difficulty: easy
    max_steps: 20
    description: >-
      Fix an off-by-one bug in utils/list_ops.py. All 7 tests must pass.
    grader: grader.grade_task
    score_range: [0.0, 1.0]

  - id: medium_refactor_stats
    difficulty: medium
    max_steps: 30
    description: >-
      Refactor monolithic stats.py into a stats/ package. 15 tests must pass
      with full backward compatibility.
    grader: grader.grade_task
    score_range: [0.0, 1.0]

  - id: hard_lru_cache_performance
    difficulty: hard
    max_steps: 40
    description: >-
      Implement O(1) LRU cache from a stub. 15 correctness tests + 1
      performance test (10k ops < 200ms).
    grader: grader.grade_task
    score_range: [0.0, 1.0]

  - id: bonus_task
    difficulty: bonus
    max_steps: 10
    description: >-
      Bonus: Optimize the LRU cache for memory efficiency. Gradual memory
      reduction is rewarded.
    grader: grader.grade_task
    score_range: [0.0, 1.0]

# ── Infrastructure ─────────────────────────────────────────────────────────────
runtime:
  python: ">=3.11"
  memory_gb: 8
  vcpu: 2
  max_episode_minutes: 20

inference:
  script: inference.py
  env_vars:
    API_BASE_URL: "https://api.groq.com/openai/v1"
    MODEL_NAME: "llama3-8b-8192"
    # Intentionally an empty string placeholder; no token is stored in this file.
    HF_TOKEN: ""

deployment:
  dockerfile: Dockerfile
  huggingface_spaces: true
  gradio_app: server/app.py

# ── API Endpoints (for OpenEnv validator) ──────────────────────────────────────
api:
  reset:
    method: POST
    path: /reset
    body: '{"task_id": "easy_bugfix_chunk_list"}'
  step:
    method: POST
    path: /step
    body: '{"action": {"type": "run_tests"}}'
  state:
    method: GET
    path: /state
  health:
    method: GET
    path: /health