Your Name committed on
Commit
652a783
·
1 Parent(s): 368d425

fix(OpenEnv): remove bonus task and refine ranges to ensure strict (0, 1) compliance across all fields

Browse files
Files changed (4) hide show
  1. README.md +1 -5
  2. inference.py +1 -1
  3. openenv.yaml +4 -12
  4. tasks/task_registry.py +0 -1
README.md CHANGED
@@ -112,11 +112,7 @@ Current benchmarks (HumanEval, SWE-bench, MBPP) treat code generation as a **sin
112
  - 15 correctness tests + 1 performance test: 10,000 ops in < 200ms
113
  - **Algorithm design + complexity analysis + perf constraint · 40 step limit**
114
 
115
- ### 🟣 Bonus — `bonus_perf_regression_merge`
116
- **Real-world analog:** Diagnosing a silent O(n²) regression from a bad merge conflict resolution
117
- - Functional tests still pass — only a 50,000-document perf test reveals the bug
118
- - Requires code archaeology: reading conflict markers + understanding algorithm history
119
- - **Unique task — no existing benchmark includes this scenario**
120
 
121
  ---
122
 
 
112
  - 15 correctness tests + 1 performance test: 10,000 ops in < 200ms
113
  - **Algorithm design + complexity analysis + perf constraint · 40 step limit**
114
 
115
+ ---
 
 
 
 
116
 
117
  ---
118
 
inference.py CHANGED
@@ -225,7 +225,7 @@ def run_episode(env: TeamForgeEnv, agent: Agent, task_id: str) -> Dict:
225
  # ── [END] ─────────────────────────────────────────────────────────────────
226
  print(
227
  f"[END] success={'true' if success else 'false'} steps={step_count} "
228
- f"score={score:.2f} rewards={rewards_str}",
229
  flush=True,
230
  )
231
 
 
225
  # ── [END] ─────────────────────────────────────────────────────────────────
226
  print(
227
  f"[END] success={'true' if success else 'false'} steps={step_count} "
228
+ f"score={score:.4f} rewards={rewards_str}",
229
  flush=True,
230
  )
231
 
openenv.yaml CHANGED
@@ -21,7 +21,6 @@ methods:
21
  - easy_bugfix_chunk_list
22
  - medium_refactor_stats
23
  - hard_lru_cache_performance
24
- - bonus_perf_regression_merge
25
  description: "Which task to run this episode."
26
  step:
27
  description: "Execute one typed action. Returns updated Observation with reward."
@@ -117,7 +116,7 @@ observation_space:
117
 
118
  # ── Reward ─────────────────────────────────────────────────────────────────────
119
  reward:
120
- range: [0.01, 0.99]
121
  type: dense
122
  description: >
123
  Dense shaped reward. Positive for: correct plan steps, edits, passing tests,
@@ -130,28 +129,21 @@ tasks:
130
  max_steps: 20
131
  description: "Fix an off-by-one bug in utils/list_ops.py. All 7 tests must pass."
132
  grader: grader.grade_episode
133
- score_range: [0.01, 0.99]
134
 
135
  - id: medium_refactor_stats
136
  difficulty: medium
137
  max_steps: 30
138
  description: "Refactor monolithic stats.py into a stats/ package. 15 tests must pass with full backward compatibility."
139
  grader: grader.grade_episode
140
- score_range: [0.01, 0.99]
141
 
142
  - id: hard_lru_cache_performance
143
  difficulty: hard
144
  max_steps: 40
145
  description: "Implement O(1) LRU cache from a stub. 15 correctness tests + 1 performance test (10k ops < 200ms)."
146
  grader: grader.grade_episode
147
- score_range: [0.01, 0.99]
148
-
149
- - id: bonus_perf_regression_merge
150
- difficulty: hard
151
- max_steps: 40
152
- description: "Diagnose and fix O(n²) regression hidden inside a bad merge conflict resolution. Perf test: 50k docs < 500ms."
153
- grader: grader.grade_episode
154
- score_range: [0.01, 0.99]
155
 
156
  # ── Infrastructure ─────────────────────────────────────────────────────────────
157
  runtime:
 
21
  - easy_bugfix_chunk_list
22
  - medium_refactor_stats
23
  - hard_lru_cache_performance
 
24
  description: "Which task to run this episode."
25
  step:
26
  description: "Execute one typed action. Returns updated Observation with reward."
 
116
 
117
  # ── Reward ─────────────────────────────────────────────────────────────────────
118
  reward:
119
+ range: [0.0, 10.0]
120
  type: dense
121
  description: >
122
  Dense shaped reward. Positive for: correct plan steps, edits, passing tests,
 
129
  max_steps: 20
130
  description: "Fix an off-by-one bug in utils/list_ops.py. All 7 tests must pass."
131
  grader: grader.grade_episode
132
+ score_range: [0.0, 1.0]
133
 
134
  - id: medium_refactor_stats
135
  difficulty: medium
136
  max_steps: 30
137
  description: "Refactor monolithic stats.py into a stats/ package. 15 tests must pass with full backward compatibility."
138
  grader: grader.grade_episode
139
+ score_range: [0.0, 1.0]
140
 
141
  - id: hard_lru_cache_performance
142
  difficulty: hard
143
  max_steps: 40
144
  description: "Implement O(1) LRU cache from a stub. 15 correctness tests + 1 performance test (10k ops < 200ms)."
145
  grader: grader.grade_episode
146
+ score_range: [0.0, 1.0]
 
 
 
 
 
 
 
147
 
148
  # ── Infrastructure ─────────────────────────────────────────────────────────────
149
  runtime:
tasks/task_registry.py CHANGED
@@ -7,7 +7,6 @@ TASK_REGISTRY: Dict[str, Any] = {
7
  easy_task.TASK_ID: easy_task,
8
  medium_task.TASK_ID: medium_task,
9
  hard_task.TASK_ID: hard_task,
10
- bonus_task.TASK_ID: bonus_task,
11
  }
12
 
13
  # The 3 scored tasks for the hackathon (easy, medium, hard)
 
7
  easy_task.TASK_ID: easy_task,
8
  medium_task.TASK_ID: medium_task,
9
  hard_task.TASK_ID: hard_task,
 
10
  }
11
 
12
  # The 3 scored tasks for the hackathon (easy, medium, hard)