# teamforge / openenv.yaml
# Your Name
# fix(OpenEnv): implement system-wide [0.1, 0.9] boundary scrub for Phase 2 compliance
# efa2d2a
# Package metadata for the environment manifest.
name: teamforge
# Quoted so the version stays a string rather than being type-inferred.
version: "1.1.0"
# Folded scalar: the indented lines below are joined into one paragraph.
description: >
  A structured multi-phase benchmark for autonomous software engineering agents.
  The agent simulates a full software development team: planning, coding, testing,
  reviewing, and reflecting — inside a real isolated Git repository.
author: TeamForge
license: MIT
# ── OpenEnv Interface ──────────────────────────────────────────────────────────
# Dotted path to the environment class implementing the methods below.
entry_point: environment.TeamForgeEnv

# Callable methods exposed by the environment: reset, step, state.
methods:
  reset:
    description: "Start a new episode for the given task_id. Returns initial Observation."
    parameters:
      task_id:
        type: string
        # NOTE(review): the `tasks` section also declares `bonus_task`, which is
        # not accepted here -- confirm whether it should be selectable via reset.
        enum:
          - easy_bugfix_chunk_list
          - medium_refactor_stats
          - hard_lru_cache_performance
        description: "Which task to run this episode."
  step:
    description: "Execute one typed action. Returns updated Observation with reward."
    parameters:
      action:
        type: object
        description: "A typed Action model (see action_space below)."
  state:
    description: "Return current environment state as a plain serialisable dict."
# ── Action Space ───────────────────────────────────────────────────────────────
# Union of the eight action shapes accepted by step(). Each entry's `fields`
# mapping includes a literal `type` value that identifies the action.
action_space:
  type: union
  description: "One of 8 structured actions. Discriminated by the `type` field."
  actions:
    - name: plan_step
      fields:
        type: {type: literal, value: plan_step}
        step_number: {type: integer, minimum: 1}
        description: {type: string, minLength: 5}
        estimated_effort: {type: string, enum: [low, medium, high]}
        depends_on: {type: array, items: integer}

    - name: edit_file
      fields:
        type: {type: literal, value: edit_file}
        file_path: {type: string}
        content: {type: string}
        reason: {type: string, minLength: 5}

    - name: run_tests
      fields:
        type: {type: literal, value: run_tests}
        test_path: {type: string, nullable: true}
        timeout_seconds: {type: integer, minimum: 5, maximum: 120, default: 30}

    - name: run_lint
      fields:
        type: {type: literal, value: run_lint}
        fix: {type: boolean, default: false}
        file_path: {type: string, nullable: true}

    - name: generate_review
      fields:
        type: {type: literal, value: generate_review}
        focus_areas: {type: array, items: string}
        review_text: {type: string, minLength: 20}

    - name: commit
      fields:
        type: {type: literal, value: commit}
        message: {type: string, minLength: 10}
        files: {type: array, items: string}

    - name: self_reflect
      fields:
        type: {type: literal, value: self_reflect}
        what_went_well: {type: string, minLength: 10}
        what_to_improve: {type: string, minLength: 10}
        adjusted_plan: {type: string, nullable: true}

    - name: request_iteration
      fields:
        type: {type: literal, value: request_iteration}
        reason: {type: string, minLength: 10}
        target_issues: {type: array, items: string}
# ── Observation Space ──────────────────────────────────────────────────────────
# Shape of the Observation object returned by reset() and step().
observation_space:
  type: object
  description: "Full typed Observation returned after every step() and reset()."
  fields:
    # Task identity and episode progress.
    task_id: {type: string}
    task_description: {type: string}
    difficulty: {type: string, enum: [easy, medium, hard]}
    step_number: {type: integer}
    max_steps: {type: integer}
    phase: {type: string, enum: [planning, coding, testing, reviewing, reflecting, done]}

    # Repository snapshot.
    repo_files:
      type: array
      description: "List of FileSnapshot objects (path, content, size_bytes)"
    git_log: {type: array, items: string}

    # Outcome of the most recent action.
    last_action_type: {type: string, nullable: true}
    last_action_status: {type: string, enum: [success, failure, partial]}
    last_action_output: {type: string}
    test_results:
      type: object
      nullable: true
      description: "TestResult: passed, failed, errors, output, duration_seconds"
    lint_results:
      type: object
      nullable: true
      description: "LintResult: violations, output, score"

    # Artifacts accumulated over the episode.
    plan: {type: array, description: "List of PlanStep actions issued so far"}
    reviews: {type: array, description: "List of ReviewArtifact objects"}
    reflections: {type: array, description: "List of ReflectionArtifact objects"}

    # Reward signal and termination.
    reward: {type: number, description: "Reward for the last action"}
    cumulative_reward: {type: number}
    done: {type: boolean}
    info: {type: object}
# ── Reward ─────────────────────────────────────────────────────────────────────
# Per-step reward specification.
reward:
  # NOTE(review): the description below says rewards are strictly inside (0, 1),
  # while this range is closed at both ends -- confirm which bound is intended
  # before tightening, since validators may read this value.
  range: [0.0, 1.0]
  type: dense
  description: >
    Dense shaped reward. Positive for: correct plan steps, edits, passing tests,
    clean lint, reviews, reflections, commits. Always strictly between 0 and 1.
# ── Tasks ──────────────────────────────────────────────────────────────────────
# Task catalogue. Every entry carries its own step budget and score range and
# is scored by the same grader entry point.
tasks:
  - id: easy_bugfix_chunk_list
    difficulty: easy
    max_steps: 20
    description: "Fix an off-by-one bug in utils/list_ops.py. All 7 tests must pass."
    grader: grader.grade_task
    score_range: [0.0, 1.0]

  - id: medium_refactor_stats
    difficulty: medium
    max_steps: 30
    description: "Refactor monolithic stats.py into a stats/ package. 15 tests must pass with full backward compatibility."
    grader: grader.grade_task
    score_range: [0.0, 1.0]

  - id: hard_lru_cache_performance
    difficulty: hard
    max_steps: 40
    description: "Implement O(1) LRU cache from a stub. 15 correctness tests + 1 performance test (10k ops < 200ms)."
    grader: grader.grade_task
    score_range: [0.0, 1.0]

  # NOTE(review): `bonus_task` is absent from the reset `task_id` enum, and
  # `difficulty: bonus` is outside the observation `difficulty` enum
  # (easy, medium, hard) -- confirm this task is actually reachable.
  - id: bonus_task
    difficulty: bonus
    max_steps: 10
    description: "Bonus: Optimize the LRU cache for memory efficiency. Gradual memory reduction is rewarded."
    grader: grader.grade_task
    score_range: [0.0, 1.0]
# ── Infrastructure ─────────────────────────────────────────────────────────────
# Resource requirements and limits for running an episode.
runtime:
  # Quoted: a leading `>` would otherwise start a folded block scalar.
  python: ">=3.11"
  memory_gb: 8
  vcpu: 2
  max_episode_minutes: 20
# Inference script and the environment variables it is configured with.
inference:
  script: inference.py
  env_vars:
    API_BASE_URL: "https://api.groq.com/openai/v1"
    MODEL_NAME: "llama3-8b-8192"
    # Intentionally empty here; presumably supplied at deploy time -- never
    # commit a real token to this file.
    HF_TOKEN: ""
# Deployment targets and their entry files.
deployment:
  dockerfile: Dockerfile
  huggingface_spaces: true
  gradio_app: server/app.py
# ── API Endpoints (for OpenEnv validator) ──────────────────────────────────────
# HTTP endpoints. `body` values are single-quoted so the embedded JSON is kept
# as a literal string rather than parsed as a YAML flow mapping.
api:
  reset:
    method: POST
    path: /reset
    body: '{"task_id": "easy_bugfix_chunk_list"}'
  step:
    method: POST
    path: /step
    body: '{"action": {"type": "run_tests"}}'
  state:
    method: GET
    path: /state
  health:
    method: GET
    path: /health