File size: 7,378 Bytes
637f42c
94d08ee
637f42c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f893da
637f42c
 
 
0786522
637f42c
 
 
 
 
 
 
94d08ee
4f893da
637f42c
 
 
 
 
94d08ee
4f893da
637f42c
 
 
 
 
efa2d2a
 
 
 
 
 
 
94d08ee
4f893da
637f42c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58ca26f
637f42c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# Package metadata for this environment manifest.
name: teamforge
# Kept quoted so the value is always read as a string.
version: "1.1.0"
# Human-readable summary. The `>` indicator folds the lines below into one
# paragraph (single line breaks become spaces).
description: >
  A structured multi-phase benchmark for autonomous software engineering agents.
  The agent simulates a full software development team: planning, coding, testing,
  reviewing, and reflecting — inside a real isolated Git repository.

author: TeamForge
license: MIT

# ── OpenEnv Interface ──────────────────────────────────────────────────────────
# Environment entry point (reads as `module.Class` — NOTE(review): confirm
# against how the loader resolves it).
entry_point: environment.TeamForgeEnv

# The three methods the environment exposes, with their parameters.
methods:
  reset:
    description: "Start a new episode for the given task_id. Returns initial Observation."
    parameters:
      task_id:
        type: string
        # Lists every id declared under the top-level `tasks` key, including
        # `bonus_task`, so that each declared task can be selected.
        enum:
          - easy_bugfix_chunk_list
          - medium_refactor_stats
          - hard_lru_cache_performance
          - bonus_task
        description: "Which task to run this episode."
  step:
    description: "Execute one typed action. Returns updated Observation with reward."
    parameters:
      action:
        type: object
        description: "A typed Action model (see action_space below)."
  # `state` declares no parameters.
  state:
    description: "Return current environment state as a plain serialisable dict."

# ── Action Space ───────────────────────────────────────────────────────────────
# A union of eight action variants. Every variant carries a literal `type`
# field whose value equals the variant's name.
action_space:
  type: union
  description: 'One of 8 structured actions. Discriminated by the `type` field.'
  actions:
    # plan_step: numbered step (>= 1) with a description of at least 5 chars.
    - name: plan_step
      fields:
        type:
          type: literal
          value: plan_step
        step_number:
          type: integer
          minimum: 1
        description:
          type: string
          minLength: 5
        estimated_effort:
          type: string
          enum:
            - low
            - medium
            - high
        depends_on:
          type: array
          items: integer

    # edit_file: target path, content, and a reason of at least 5 chars.
    - name: edit_file
      fields:
        type:
          type: literal
          value: edit_file
        file_path:
          type: string
        content:
          type: string
        reason:
          type: string
          minLength: 5

    # run_tests: optional test path; timeout limited to 5..120, default 30.
    - name: run_tests
      fields:
        type:
          type: literal
          value: run_tests
        test_path:
          type: string
          nullable: true
        timeout_seconds:
          type: integer
          minimum: 5
          maximum: 120
          default: 30

    # run_lint: `fix` defaults to false; the file path is optional.
    - name: run_lint
      fields:
        type:
          type: literal
          value: run_lint
        fix:
          type: boolean
          default: false
        file_path:
          type: string
          nullable: true

    # generate_review: focus areas plus review text of at least 20 chars.
    - name: generate_review
      fields:
        type:
          type: literal
          value: generate_review
        focus_areas:
          type: array
          items: string
        review_text:
          type: string
          minLength: 20

    # commit: message of at least 10 chars and a list of files.
    - name: commit
      fields:
        type:
          type: literal
          value: commit
        message:
          type: string
          minLength: 10
        files:
          type: array
          items: string

    # self_reflect: two free-text fields (>= 10 chars each), optional new plan.
    - name: self_reflect
      fields:
        type:
          type: literal
          value: self_reflect
        what_went_well:
          type: string
          minLength: 10
        what_to_improve:
          type: string
          minLength: 10
        adjusted_plan:
          type: string
          nullable: true

    # request_iteration: reason of at least 10 chars and the issues targeted.
    - name: request_iteration
      fields:
        type:
          type: literal
          value: request_iteration
        reason:
          type: string
          minLength: 10
        target_issues:
          type: array
          items: string

# ── Observation Space ──────────────────────────────────────────────────────────
# Schema of the Observation object returned by reset() and step().
observation_space:
  type: object
  description: "Full typed Observation returned after every step() and reset()."
  fields:
    # Task identity and progress through the episode.
    task_id: {type: string}
    task_description: {type: string}
    # `bonus` is included because the `bonus_task` entry under `tasks` declares
    # `difficulty: bonus`.
    difficulty: {type: string, enum: [easy, medium, hard, bonus]}
    step_number: {type: integer}
    max_steps: {type: integer}
    phase: {type: string, enum: [planning, coding, testing, reviewing, reflecting, done]}
    # Repository state.
    repo_files: {type: array, description: "List of FileSnapshot objects (path, content, size_bytes)"}
    git_log: {type: array, items: string}
    # Outcome of the most recent action.
    last_action_type: {type: string, nullable: true}
    last_action_status: {type: string, enum: [success, failure, partial]}
    last_action_output: {type: string}
    test_results: {type: object, nullable: true, description: "TestResult: passed, failed, errors, output, duration_seconds"}
    lint_results: {type: object, nullable: true, description: "LintResult: violations, output, score"}
    # Artifacts accumulated over the episode.
    plan: {type: array, description: "List of PlanStep actions issued so far"}
    reviews: {type: array, description: "List of ReviewArtifact objects"}
    reflections: {type: array, description: "List of ReflectionArtifact objects"}
    # Reward bookkeeping and termination flag.
    reward: {type: number, description: "Reward for the last action"}
    cumulative_reward: {type: number}
    done: {type: boolean}
    info: {type: object}

# ── Reward ─────────────────────────────────────────────────────────────────────
# Declared reward bounds, reward kind, and a prose summary of what is rewarded.
reward:
  range:
    - 0.0
    - 1.0
  type: dense
  # Folded scalar: the line breaks below collapse into single spaces.
  description: >
    Dense shaped reward.
    Positive for: correct plan steps, edits, passing tests, clean lint, reviews,
    reflections, commits.
    Always strictly between 0 and 1.

# ── Tasks ──────────────────────────────────────────────────────────────────────
# One entry per task. Each entry gives its id, difficulty label, step budget,
# a description, the grader reference, and the score range.
# All tasks use the same grader reference (`grader.grade_task`).
tasks:
  - id: easy_bugfix_chunk_list
    difficulty: easy
    max_steps: 20
    description: "Fix an off-by-one bug in utils/list_ops.py. All 7 tests must pass."
    grader: grader.grade_task
    score_range: [0.0, 1.0]

  - id: medium_refactor_stats
    difficulty: medium
    max_steps: 30
    description: "Refactor monolithic stats.py into a stats/ package. 15 tests must pass with full backward compatibility."
    grader: grader.grade_task
    score_range: [0.0, 1.0]

  - id: hard_lru_cache_performance
    difficulty: hard
    max_steps: 40
    description: "Implement O(1) LRU cache from a stub. 15 correctness tests + 1 performance test (10k ops < 200ms)."
    grader: grader.grade_task
    score_range: [0.0, 1.0]

  # Bonus task: smallest step budget of the four.
  - id: bonus_task
    difficulty: bonus
    max_steps: 10
    description: "Bonus: Optimize the LRU cache for memory efficiency. Gradual memory reduction is rewarded."
    grader: grader.grade_task
    score_range: [0.0, 1.0]

# ── Infrastructure ─────────────────────────────────────────────────────────────
# Resource envelope for a run: interpreter constraint, memory, CPU, time limit.
runtime:
  # Quoted because a plain scalar may not begin with `>`.
  python: '>=3.11'
  memory_gb: 8
  vcpu: 2
  max_episode_minutes: 20

# Inference script and the environment variables listed for it.
inference:
  script: 'inference.py'
  env_vars:
    API_BASE_URL: 'https://api.groq.com/openai/v1'
    MODEL_NAME: 'llama3-8b-8192'
    # Empty string here; presumably the real token is supplied at run time
    # (TODO confirm).
    HF_TOKEN: ''

# Deployment settings: Dockerfile name, Hugging Face Spaces flag, and the
# path of the Gradio app.
deployment:
  dockerfile: 'Dockerfile'
  huggingface_spaces: true
  gradio_app: 'server/app.py'

# ── API Endpoints (for OpenEnv validator) ──────────────────────────────────────
# HTTP routes: two POST endpoints with sample JSON bodies, two GET endpoints.
api:
  # POST endpoints; `body` holds a JSON document as a string.
  reset:
    method: "POST"
    path: "/reset"
    body: '{"task_id": "easy_bugfix_chunk_list"}'
  step:
    method: "POST"
    path: "/step"
    body: '{"action": {"type": "run_tests"}}'
  # GET endpoints; no body is declared.
  state:
    method: "GET"
    path: "/state"
  health:
    method: "GET"
    path: "/health"