---
# TeamForge environment manifest: identity and summary metadata.
name: teamforge
# Quoted so the version is kept as a string.
version: "1.1.0"
# Folded scalar: the indented lines below are joined into a single paragraph.
description: >
  A structured multi-phase benchmark for autonomous software engineering agents.
  The agent simulates a full software development team: planning, coding, testing,
  reviewing, and reflecting — inside a real isolated Git repository.
author: TeamForge
license: MIT
# ── OpenEnv Interface ─────────────────────────────────────────────────────────
# Entry point of the environment and the three methods it declares.
entry_point: environment.TeamForgeEnv
methods:
  reset:
    description: "Start a new episode for the given task_id. Returns initial Observation."
    parameters:
      task_id:
        type: string
        # NOTE(review): the `tasks` list in this file also declares
        # `bonus_task`, which is not listed here — confirm whether it
        # should be selectable through reset().
        enum:
          - easy_bugfix_chunk_list
          - medium_refactor_stats
          - hard_lru_cache_performance
        description: "Which task to run this episode."
  step:
    description: "Execute one typed action. Returns updated Observation with reward."
    parameters:
      action:
        type: object
        description: "A typed Action model (see action_space below)."
  state:
    description: "Return current environment state as a plain serialisable dict."
# ── Action Space ──────────────────────────────────────────────────────────────
# Union of eight action shapes. Every action carries a literal `type` field
# whose value equals the action's name; the remaining fields are its payload.
action_space:
  type: union
  description: "One of 8 structured actions. Discriminated by the `type` field."
  actions:
    - name: plan_step
      fields:
        type: {type: literal, value: plan_step}
        step_number: {type: integer, minimum: 1}
        description: {type: string, minLength: 5}
        estimated_effort: {type: string, enum: [low, medium, high]}
        depends_on: {type: array, items: integer}
    - name: edit_file
      fields:
        type: {type: literal, value: edit_file}
        file_path: {type: string}
        content: {type: string}
        reason: {type: string, minLength: 5}
    - name: run_tests
      fields:
        type: {type: literal, value: run_tests}
        test_path: {type: string, nullable: true}
        timeout_seconds: {type: integer, minimum: 5, maximum: 120, default: 30}
    - name: run_lint
      fields:
        type: {type: literal, value: run_lint}
        fix: {type: boolean, default: false}
        file_path: {type: string, nullable: true}
    - name: generate_review
      fields:
        type: {type: literal, value: generate_review}
        focus_areas: {type: array, items: string}
        review_text: {type: string, minLength: 20}
    - name: commit
      fields:
        type: {type: literal, value: commit}
        message: {type: string, minLength: 10}
        files: {type: array, items: string}
    - name: self_reflect
      fields:
        type: {type: literal, value: self_reflect}
        what_went_well: {type: string, minLength: 10}
        what_to_improve: {type: string, minLength: 10}
        adjusted_plan: {type: string, nullable: true}
    - name: request_iteration
      fields:
        type: {type: literal, value: request_iteration}
        reason: {type: string, minLength: 10}
        target_issues: {type: array, items: string}
# ── Observation Space ─────────────────────────────────────────────────────────
# Shape of the Observation object; each entry under `fields` is one attribute.
observation_space:
  type: object
  description: "Full typed Observation returned after every step() and reset()."
  fields:
    task_id: {type: string}
    task_description: {type: string}
    # NOTE(review): the `tasks` list in this file contains an entry with
    # `difficulty: bonus`, which this enum does not allow — confirm.
    difficulty: {type: string, enum: [easy, medium, hard]}
    step_number: {type: integer}
    max_steps: {type: integer}
    phase: {type: string, enum: [planning, coding, testing, reviewing, reflecting, done]}
    repo_files: {type: array, description: "List of FileSnapshot objects (path, content, size_bytes)"}
    git_log: {type: array, items: string}
    last_action_type: {type: string, nullable: true}
    last_action_status: {type: string, enum: [success, failure, partial]}
    last_action_output: {type: string}
    test_results: {type: object, nullable: true, description: "TestResult: passed, failed, errors, output, duration_seconds"}
    lint_results: {type: object, nullable: true, description: "LintResult: violations, output, score"}
    plan: {type: array, description: "List of PlanStep actions issued so far"}
    reviews: {type: array, description: "List of ReviewArtifact objects"}
    reflections: {type: array, description: "List of ReflectionArtifact objects"}
    reward: {type: number, description: "Reward for the last action"}
    cumulative_reward: {type: number}
    done: {type: boolean}
    info: {type: object}
# ── Reward ────────────────────────────────────────────────────────────────────
# Per-step reward signal.
reward:
  # NOTE(review): the range is written as the closed interval [0.0, 1.0] while
  # the description says "strictly between 0 and 1" — confirm which one holds.
  range: [0.0, 1.0]
  type: dense
  # Folded scalar: the indented lines below are joined into a single paragraph.
  description: >
    Dense shaped reward. Positive for: correct plan steps, edits, passing tests,
    clean lint, reviews, reflections, commits. Always strictly between 0 and 1.
# ── Tasks ─────────────────────────────────────────────────────────────────────
# One entry per task: id, difficulty label, step budget, description, grader
# reference and score range. Every task names the same grader.
tasks:
  - id: easy_bugfix_chunk_list
    difficulty: easy
    max_steps: 20
    description: "Fix an off-by-one bug in utils/list_ops.py. All 7 tests must pass."
    grader: grader.grade_task
    score_range: [0.0, 1.0]
  - id: medium_refactor_stats
    difficulty: medium
    max_steps: 30
    description: "Refactor monolithic stats.py into a stats/ package. 15 tests must pass with full backward compatibility."
    grader: grader.grade_task
    score_range: [0.0, 1.0]
  - id: hard_lru_cache_performance
    difficulty: hard
    max_steps: 40
    description: "Implement O(1) LRU cache from a stub. 15 correctness tests + 1 performance test (10k ops < 200ms)."
    grader: grader.grade_task
    score_range: [0.0, 1.0]
  # NOTE(review): `bonus_task` is absent from the reset() `task_id` enum, and
  # `bonus` is absent from the observation `difficulty` enum declared
  # elsewhere in this file — confirm whether those enums should be extended.
  - id: bonus_task
    difficulty: bonus
    max_steps: 10
    description: "Bonus: Optimize the LRU cache for memory efficiency. Gradual memory reduction is rewarded."
    grader: grader.grade_task
    score_range: [0.0, 1.0]
# ── Infrastructure ────────────────────────────────────────────────────────────
# NOTE(review): indentation was reconstructed; `runtime`, `inference` and
# `deployment` are assumed to be sibling top-level keys — confirm.
runtime:
  # Quoted because the value starts with `>`, a YAML indicator character.
  python: ">=3.11"
  memory_gb: 8
  vcpu: 2
  max_episode_minutes: 20
inference:
  script: inference.py
  env_vars:
    API_BASE_URL: "https://api.groq.com/openai/v1"
    MODEL_NAME: "llama3-8b-8192"
    # Intentionally an empty string here; presumably the real token is
    # supplied at deploy time — keep secrets out of version control.
    HF_TOKEN: ""
deployment:
  dockerfile: Dockerfile
  huggingface_spaces: true
  gradio_app: server/app.py
# ── API Endpoints (for OpenEnv validator) ─────────────────────────────────────
# HTTP method and path for each endpoint; `body` holds a sample JSON payload
# as a single-quoted string so the inner double quotes need no escaping.
api:
  reset:
    method: POST
    path: /reset
    body: '{"task_id": "easy_bugfix_chunk_list"}'
  step:
    method: POST
    path: /step
    body: '{"action": {"type": "run_tests"}}'
  state:
    method: GET
    path: /state
  health:
    method: GET
    path: /health