Your Name committed on
Commit
94d08ee
·
1 Parent(s): 652a783

fix(OpenEnv): implement robust grader bridge and strict interior clamping [0.1, 0.9] to satisfy Phase 2 validator

Browse files
Files changed (6) hide show
  1. environment.py +5 -9
  2. grader.py +63 -16
  3. inference.py +22 -3
  4. models.py +9 -9
  5. openenv.yaml +8 -8
  6. reward.py +10 -10
environment.py CHANGED
@@ -64,7 +64,7 @@ class TeamForgeEnv:
64
 
65
  # Episode state
66
  self._step_number = 0
67
- self._cumulative_reward = 0.01
68
  self._plan: List[PlanStep] = []
69
  self._reviews: List[ReviewArtifact] = []
70
  self._reflections: List[ReflectionArtifact] = []
@@ -106,7 +106,7 @@ class TeamForgeEnv:
106
 
107
  # Reset episode state
108
  self._step_number = 0
109
- self._cumulative_reward = 0.01
110
  self._plan = []
111
  self._reviews = []
112
  self._reflections = []
@@ -120,9 +120,7 @@ class TeamForgeEnv:
120
  # Build initial observation
121
  self._obs = self._build_observation(
122
  action_type=None,
123
- status=ActionStatus.SUCCESS,
124
- output="Environment initialized. Begin your task.",
125
- reward=0.01,
126
  done=False,
127
  )
128
  return self._obs
@@ -329,7 +327,7 @@ class TeamForgeEnv:
329
  ln for ln in output.splitlines()
330
  if re.match(r".+:\d+:\d+:", ln)
331
  ])
332
- score = max(0.01, min(0.99, 1.0 - violations * 0.05))
333
  self._last_lint_result = LintResult(
334
  violations=violations,
335
  output=output[:2000],
@@ -399,9 +397,7 @@ class TeamForgeEnv:
399
  self._log(f"[END] {reason}")
400
  self._obs = self._build_observation(
401
  action_type=None,
402
- status=ActionStatus.FAILURE,
403
- output=reason,
404
- reward=0.01,
405
  done=True,
406
  )
407
  return self._obs
 
64
 
65
  # Episode state
66
  self._step_number = 0
67
+ self._cumulative_reward = 0.1
68
  self._plan: List[PlanStep] = []
69
  self._reviews: List[ReviewArtifact] = []
70
  self._reflections: List[ReflectionArtifact] = []
 
106
 
107
  # Reset episode state
108
  self._step_number = 0
109
+ self._cumulative_reward = 0.1
110
  self._plan = []
111
  self._reviews = []
112
  self._reflections = []
 
120
  # Build initial observation
121
  self._obs = self._build_observation(
122
  action_type=None,
123
+ reward=0.1,
 
 
124
  done=False,
125
  )
126
  return self._obs
 
327
  ln for ln in output.splitlines()
328
  if re.match(r".+:\d+:\d+:", ln)
329
  ])
330
+ score = max(0.1, min(0.9, 1.0 - violations * 0.05))
331
  self._last_lint_result = LintResult(
332
  violations=violations,
333
  output=output[:2000],
 
397
  self._log(f"[END] {reason}")
398
  self._obs = self._build_observation(
399
  action_type=None,
400
+ reward=0.1,
 
 
401
  done=True,
402
  )
403
  return self._obs
grader.py CHANGED
@@ -99,8 +99,8 @@ def score_tests(repo_path: str, timeout: int = 60) -> tuple[float, str]:
99
  return 0.01, output
100
 
101
  pass_rate = passed / total
102
- # Strictly (0, 1)
103
- pass_rate = max(0.01, min(0.99, pass_rate))
104
  return pass_rate, output
105
 
106
 
@@ -122,8 +122,8 @@ def score_lint(repo_path: str) -> tuple[float, str]:
122
  if re.match(r".+:\d+:\d+:", ln)
123
  ])
124
  # Stricter: -0.07 per violation (was 0.05), floor at 0.2 not 0
125
- # Strictly (0, 1)
126
- score = max(0.01, min(0.99, 1.0 - violations * 0.07))
127
  return score, output
128
 
129
 
@@ -156,8 +156,8 @@ def score_review_quality(
156
  code_words = re.findall(r'\b[a-z_]{3,}\(\)', combined)
157
  specificity = min(0.1, len(set(code_words)) * 0.025)
158
 
159
- # Strictly (0, 1)
160
- return max(0.01, min(0.99, kw_score * 0.7 + length_bonus + specificity))
161
 
162
 
163
  def score_reflection_quality(reflections: List[ReflectionArtifact]) -> float:
@@ -177,18 +177,17 @@ def score_reflection_quality(reflections: List[ReflectionArtifact]) -> float:
177
  depth = min(1.0, depth + 0.2)
178
  total += depth
179
 
180
- # Strictly (0, 1)
181
- return max(0.01, min(0.99, total / max(1, len(reflections))))
182
 
183
 
184
  def score_efficiency(total_steps: int, max_steps: int) -> float:
185
  """Reward solving in fewer steps with smooth decay curve."""
186
- if total_steps <= 0:
187
- return 0.01 # never return exact 0.0
188
  ratio = total_steps / max_steps
189
  # Smooth exponential decay instead of step function
190
  import math
191
- return round(max(0.01, math.exp(-2.0 * max(0, ratio - 0.25))), 4)
 
192
 
193
 
194
  # ─────────────────────────────────────────────
@@ -216,7 +215,7 @@ def grade_episode(
216
  log.append("[GRADER] ⚠ TEST TAMPERING DETECTED β€” score zeroed")
217
  return EpisodeResult(
218
  task_id=task_id, total_steps=total_steps,
219
- final_score=0.01, passed=False,
220
  log=log + ["Test files were trivially rewritten to force passes."],
221
  )
222
 
@@ -224,7 +223,7 @@ def grade_episode(
224
  log.append("[GRADER] ⚠ NO IMPLEMENTATION FOUND β€” score zeroed")
225
  return EpisodeResult(
226
  task_id=task_id, total_steps=total_steps,
227
- final_score=0.01, passed=False,
228
  log=log + ["No non-test code was written."],
229
  )
230
 
@@ -257,9 +256,9 @@ def grade_episode(
257
  + 0.10 * review_q
258
  + 0.05 * reflect_q
259
  )
260
- # Clamp to [0.01, 0.99] so that :.2f format never outputs 0.00 or 1.00
261
- # Strictly (0, 1) as required by validator
262
- final = round(min(0.99, max(0.01, final)), 4)
263
  log.append(f"[GRADER] FINAL_SCORE={final:.4f}")
264
 
265
  return EpisodeResult(
@@ -274,3 +273,51 @@ def grade_episode(
274
  passed=test_pass_rate >= 0.9 and lint_score >= 0.7,
275
  log=log,
276
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  return 0.01, output
100
 
101
  pass_rate = passed / total
102
+ # Strictly (0, 1) - Safer interior [0.1, 0.9]
103
+ pass_rate = max(0.1, min(0.9, pass_rate))
104
  return pass_rate, output
105
 
106
 
 
122
  if re.match(r".+:\d+:\d+:", ln)
123
  ])
124
  # Stricter: -0.07 per violation (was 0.05); the result is clamped to [0.1, 0.9]
125
+ # Strictly (0, 1) - Safer interior [0.1, 0.9]
126
+ score = max(0.1, min(0.9, 1.0 - violations * 0.07))
127
  return score, output
128
 
129
 
 
156
  code_words = re.findall(r'\b[a-z_]{3,}\(\)', combined)
157
  specificity = min(0.1, len(set(code_words)) * 0.025)
158
 
159
+ # Strictly (0, 1) - Safer interior [0.1, 0.9]
160
+ return max(0.1, min(0.9, kw_score * 0.7 + length_bonus + specificity))
161
 
162
 
163
  def score_reflection_quality(reflections: List[ReflectionArtifact]) -> float:
 
177
  depth = min(1.0, depth + 0.2)
178
  total += depth
179
 
180
+ # Strictly (0, 1) - Safer interior [0.1, 0.9]
181
+ return max(0.1, min(0.9, total / max(1, len(reflections))))
182
 
183
 
184
  def score_efficiency(total_steps: int, max_steps: int) -> float:
185
  """Reward solving in fewer steps with smooth decay curve."""
 
 
186
  ratio = total_steps / max_steps
187
  # Smooth exponential decay instead of step function
188
  import math
189
+ # Strictly (0.1, 0.9)
190
+ return round(max(0.1, min(0.9, math.exp(-2.0 * max(0, ratio - 0.25)))), 4)
191
 
192
 
193
  # ─────────────────────────────────────────────
 
215
  log.append("[GRADER] ⚠ TEST TAMPERING DETECTED β€” score zeroed")
216
  return EpisodeResult(
217
  task_id=task_id, total_steps=total_steps,
218
+ final_score=0.1, passed=False,
219
  log=log + ["Test files were trivially rewritten to force passes."],
220
  )
221
 
 
223
  log.append("[GRADER] ⚠ NO IMPLEMENTATION FOUND β€” score zeroed")
224
  return EpisodeResult(
225
  task_id=task_id, total_steps=total_steps,
226
+ final_score=0.1, passed=False,
227
  log=log + ["No non-test code was written."],
228
  )
229
 
 
256
  + 0.10 * review_q
257
  + 0.05 * reflect_q
258
  )
259
+ # Clamp to [0.1, 0.9] so that :.2f format never outputs 0.00 or 1.00
260
+ # Closed interior range [0.1, 0.9], strictly inside (0, 1), to satisfy the Phase 2 validator
261
+ final = round(min(0.90, max(0.10, final)), 4)
262
  log.append(f"[GRADER] FINAL_SCORE={final:.4f}")
263
 
264
  return EpisodeResult(
 
273
  passed=test_pass_rate >= 0.9 and lint_score >= 0.7,
274
  log=log,
275
  )
276
+
277
+
278
def _coerce_step_count(value, default: int, minimum: int) -> int:
    """Return *value* as an int >= *minimum*, or *default* if it is unusable."""
    try:
        count = int(value)
    except (TypeError, ValueError, OverflowError):
        return default
    return count if count >= minimum else default


def grade_task(repo_path: str, **kwargs) -> float:
    """
    OpenEnv standard grader bridge – entry point from YAML.

    Loads optional episode metadata from ``<repo_path>/grading_metadata.json``
    and delegates the actual scoring to ``grade_episode``. Missing or
    malformed metadata is never fatal: the affected fields fall back to
    their defaults and a warning is logged.

    Args:
        repo_path: Directory of the repository to grade; also where the
            metadata file is looked up.
        **kwargs: Accepted for interface compatibility with the caller;
            not used.

    Returns:
        ONLY a float strictly between 0 and 1 (clamped to [0.1, 0.9]).
    """
    import json
    import logging
    import os
    from typing import List
    from pydantic import TypeAdapter

    logger = logging.getLogger(__name__)
    metadata_path = os.path.join(repo_path, "grading_metadata.json")

    # Default fallback values for out-of-band grading
    task_id = "unknown"
    total_steps = 1
    max_steps = 20
    reviews = []
    reflections = []
    required_keywords = []

    # EAFP: a missing file is the normal "no metadata" case; anything else
    # (unreadable file, invalid JSON) is worth a warning but not a crash.
    meta = None
    try:
        with open(metadata_path, "r", encoding="utf-8") as f:
            meta = json.load(f)
    except FileNotFoundError:
        pass
    except (OSError, ValueError) as exc:
        logger.warning("Ignoring unreadable grading metadata %s: %s", metadata_path, exc)

    if isinstance(meta, dict):
        task_id = meta.get("task_id", task_id)
        # Step counts feed a division in the efficiency score, so reject
        # non-numeric values and a zero/negative budget up front.
        total_steps = _coerce_step_count(meta.get("total_steps"), total_steps, 0)
        max_steps = _coerce_step_count(meta.get("max_steps"), max_steps, 1)

        keywords = meta.get("required_keywords", required_keywords)
        if isinstance(keywords, list):
            required_keywords = keywords

        try:
            # Use TypeAdapter for robust Pydantic deserialization
            from models import ReviewArtifact, ReflectionArtifact
            reviews = TypeAdapter(List[ReviewArtifact]).validate_python(meta.get("reviews", []))
            reflections = TypeAdapter(List[ReflectionArtifact]).validate_python(meta.get("reflections", []))
        except Exception as exc:
            # Best-effort boundary: grade with whatever artifacts were
            # parsed so far instead of failing the whole run.
            logger.warning("Could not load review/reflection artifacts from %s: %s", metadata_path, exc)
    elif meta is not None:
        logger.warning("Ignoring grading metadata %s: expected a JSON object", metadata_path)

    result = grade_episode(
        repo_path=repo_path,
        task_id=task_id,
        total_steps=total_steps,
        max_steps=max_steps,
        reviews=reviews,
        reflections=reflections,
        required_keywords=required_keywords,
    )
    # Defensive clamp so the bridge honours its contract on its own.
    return min(0.9, max(0.1, float(result.final_score)))
inference.py CHANGED
@@ -193,7 +193,7 @@ def run_episode(env: TeamForgeEnv, agent: Agent, task_id: str) -> Dict:
193
  # Emit a [STEP] for the failed action
194
  print(
195
  f"[STEP] step={obs.step_number + 1} action=null "
196
- f"reward=0.01 done=false error={error_msg}",
197
  flush=True,
198
  )
199
  break
@@ -215,17 +215,36 @@ def run_episode(env: TeamForgeEnv, agent: Agent, task_id: str) -> Dict:
215
  except Exception as exc:
216
  error_msg = str(exc).replace("\n", " ")[:120]
217
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
218
  # Grade the episode
219
  result = env.grade()
220
  score = result.final_score
221
  success = result.passed
222
 
223
- rewards_str = ",".join(f"{r:.2f}" for r in rewards) if rewards else "0.01"
224
 
225
  # ── [END] ─────────────────────────────────────────────────────────────────
 
 
226
  print(
227
  f"[END] success={'true' if success else 'false'} steps={step_count} "
228
- f"score={score:.4f} rewards={rewards_str}",
229
  flush=True,
230
  )
231
 
 
193
  # Emit a [STEP] for the failed action
194
  print(
195
  f"[STEP] step={obs.step_number + 1} action=null "
196
+ f"reward=0.10 done=false error={error_msg}",
197
  flush=True,
198
  )
199
  break
 
215
  except Exception as exc:
216
  error_msg = str(exc).replace("\n", " ")[:120]
217
 
218
+ # Writing metadata for standalone OpenEnv grader
219
+ try:
220
+ from tasks.task_registry import get_task
221
+ task_module = get_task(task_id)
222
+ meta_payload = {
223
+ "task_id": task_id,
224
+ "total_steps": step_count,
225
+ "max_steps": task_module.MAX_STEPS,
226
+ "reviews": [r.model_dump() for r in env._reviews],
227
+ "reflections": [r.model_dump() for r in env._reflections],
228
+ "required_keywords": getattr(task_module, "REQUIRED_KEYWORDS_IN_REVIEW", []),
229
+ }
230
+ with open(os.path.join(str(env._sandbox.repo_path), "grading_metadata.json"), "w") as f:
231
+ json.dump(meta_payload, f)
232
+ except Exception:
233
+ pass
234
+
235
  # Grade the episode
236
  result = env.grade()
237
  score = result.final_score
238
  success = result.passed
239
 
240
+ rewards_str = ",".join(f"{r:.2f}" for r in rewards) if rewards else "0.10"
241
 
242
  # ── [END] ─────────────────────────────────────────────────────────────────
243
+ # We use 2 decimal places to match common validator expectations,
244
+ # but the internal value is strictly interior [0.1, 0.9].
245
  print(
246
  f"[END] success={'true' if success else 'false'} steps={step_count} "
247
+ f"score={score:.2f} rewards={rewards_str}",
248
  flush=True,
249
  )
250
 
models.py CHANGED
@@ -129,7 +129,7 @@ class TestResult(BaseModel):
129
  class LintResult(BaseModel):
130
  violations: int = 0
131
  output: str = ""
132
- score: float = 0.99 # 0.99 = clean
133
 
134
 
135
  class ReviewArtifact(BaseModel):
@@ -175,8 +175,8 @@ class Observation(BaseModel):
175
  reflections: List[ReflectionArtifact] = Field(default_factory=list)
176
 
177
  # Signals
178
- reward: float = 0.01
179
- cumulative_reward: float = 0.01
180
  done: bool = False
181
  info: Dict[str, Any] = Field(default_factory=dict)
182
 
@@ -188,11 +188,11 @@ class Observation(BaseModel):
188
  class EpisodeResult(BaseModel):
189
  task_id: str
190
  total_steps: int
191
- test_pass_rate: float = 0.01
192
- lint_score: float = 0.01
193
- efficiency_score: float = 0.01
194
- review_quality: float = 0.01
195
- reflection_quality: float = 0.01
196
- final_score: float = 0.01
197
  passed: bool = False
198
  log: List[str] = Field(default_factory=list)
 
129
  class LintResult(BaseModel):
130
  violations: int = 0
131
  output: str = ""
132
+ score: float = 0.90 # 0.9 = clean
133
 
134
 
135
  class ReviewArtifact(BaseModel):
 
175
  reflections: List[ReflectionArtifact] = Field(default_factory=list)
176
 
177
  # Signals
178
+ reward: float = 0.1
179
+ cumulative_reward: float = 0.1
180
  done: bool = False
181
  info: Dict[str, Any] = Field(default_factory=dict)
182
 
 
188
  class EpisodeResult(BaseModel):
189
  task_id: str
190
  total_steps: int
191
+ test_pass_rate: float = 0.1
192
+ lint_score: float = 0.1
193
+ efficiency_score: float = 0.1
194
+ review_quality: float = 0.1
195
+ reflection_quality: float = 0.1
196
+ final_score: float = 0.1
197
  passed: bool = False
198
  log: List[str] = Field(default_factory=list)
openenv.yaml CHANGED
@@ -1,5 +1,5 @@
1
  name: teamforge
2
- version: "1.0.0"
3
  description: >
4
  A structured multi-phase benchmark for autonomous software engineering agents.
5
  The agent simulates a full software development team: planning, coding, testing,
@@ -116,7 +116,7 @@ observation_space:
116
 
117
  # ── Reward ─────────────────────────────────────────────────────────────────────
118
  reward:
119
- range: [0.0, 10.0]
120
  type: dense
121
  description: >
122
  Dense shaped reward. Positive for: correct plan steps, edits, passing tests,
@@ -128,22 +128,22 @@ tasks:
128
  difficulty: easy
129
  max_steps: 20
130
  description: "Fix an off-by-one bug in utils/list_ops.py. All 7 tests must pass."
131
- grader: grader.grade_episode
132
- score_range: [0.0, 1.0]
133
 
134
  - id: medium_refactor_stats
135
  difficulty: medium
136
  max_steps: 30
137
  description: "Refactor monolithic stats.py into a stats/ package. 15 tests must pass with full backward compatibility."
138
- grader: grader.grade_episode
139
- score_range: [0.0, 1.0]
140
 
141
  - id: hard_lru_cache_performance
142
  difficulty: hard
143
  max_steps: 40
144
  description: "Implement O(1) LRU cache from a stub. 15 correctness tests + 1 performance test (10k ops < 200ms)."
145
- grader: grader.grade_episode
146
- score_range: [0.0, 1.0]
147
 
148
  # ── Infrastructure ─────────────────────────────────────────────────────────────
149
  runtime:
 
1
  name: teamforge
2
+ version: "1.1.0"
3
  description: >
4
  A structured multi-phase benchmark for autonomous software engineering agents.
5
  The agent simulates a full software development team: planning, coding, testing,
 
116
 
117
  # ── Reward ─────────────────────────────────────────────────────────────────────
118
  reward:
119
+ range: [0.1, 0.9]
120
  type: dense
121
  description: >
122
  Dense shaped reward. Positive for: correct plan steps, edits, passing tests,
 
128
  difficulty: easy
129
  max_steps: 20
130
  description: "Fix an off-by-one bug in utils/list_ops.py. All 7 tests must pass."
131
+ grader: grader.grade_task
132
+ score_range: [0.1, 0.9]
133
 
134
  - id: medium_refactor_stats
135
  difficulty: medium
136
  max_steps: 30
137
  description: "Refactor monolithic stats.py into a stats/ package. 15 tests must pass with full backward compatibility."
138
+ grader: grader.grade_task
139
+ score_range: [0.1, 0.9]
140
 
141
  - id: hard_lru_cache_performance
142
  difficulty: hard
143
  max_steps: 40
144
  description: "Implement O(1) LRU cache from a stub. 15 correctness tests + 1 performance test (10k ops < 200ms)."
145
+ grader: grader.grade_task
146
+ score_range: [0.1, 0.9]
147
 
148
  # ── Infrastructure ─────────────────────────────────────────────────────────────
149
  runtime:
reward.py CHANGED
@@ -30,13 +30,13 @@ REFLECT_REWARD = 0.10
30
  TEST_PASS_BONUS_PER_TEST = 0.05
31
  LINT_CLEAN_BONUS = 0.05
32
 
33
- # Neutral/Small signals (replacing negative penalties to stay in 0-1 range)
34
- # We use 0.01 to satisfy "strictly between 0 and 1" requirement
35
- ACTION_ERROR_REWARD = 0.01
36
- REPEATED_FAILURE_REWARD = 0.01
37
- STEP_BASE_REWARD = 0.01
38
- TEST_MODIFICATION_REWARD = 0.01
39
- LINT_VIOLATION_REWARD = 0.01
40
 
41
 
42
  # ─────────────────────────────────────────────
@@ -95,7 +95,7 @@ class RewardCalculator:
95
  "run_tests": 0.02,
96
  "run_lint": 0.02,
97
  "request_iteration": 0.02,
98
- }.get(action_type, 0.01)
99
 
100
  # ── Test progress bonus ──
101
  if tests_passed is not None:
@@ -114,8 +114,8 @@ class RewardCalculator:
114
  reward += abs(delta) * LINT_VIOLATION_REWARD
115
  self._prev_lint_violations = lint_violations
116
 
117
- # Final clamp to strictly within (0, 1) per OpenEnv validator requirement
118
- return round(max(0.01, min(0.99, reward)), 4)
119
 
120
  def _is_test_file(self, path: str) -> bool:
121
  low = path.lower()
 
30
  TEST_PASS_BONUS_PER_TEST = 0.05
31
  LINT_CLEAN_BONUS = 0.05
32
 
33
+ # Neutral/Small signals (replacing negative penalties to stay strictly in 0-1 range)
34
+ # We use 0.1 to satisfy "strictly between 0 and 1" requirement with safe interior
35
+ ACTION_ERROR_REWARD = 0.10
36
+ REPEATED_FAILURE_REWARD = 0.10
37
+ STEP_BASE_REWARD = 0.10
38
+ TEST_MODIFICATION_REWARD = 0.10
39
+ LINT_VIOLATION_REWARD = 0.10
40
 
41
 
42
  # ─────────────────────────────────────────────
 
95
  "run_tests": 0.02,
96
  "run_lint": 0.02,
97
  "request_iteration": 0.02,
98
+ }.get(action_type, 0.10)
99
 
100
  # ── Test progress bonus ──
101
  if tests_passed is not None:
 
114
  reward += abs(delta) * LINT_VIOLATION_REWARD
115
  self._prev_lint_violations = lint_violations
116
 
117
+ # Final clamp to the closed range [0.1, 0.9] (strictly inside (0, 1)) per OpenEnv validator requirement
118
+ return round(max(0.1, min(0.9, reward)), 4)
119
 
120
  def _is_test_file(self, path: str) -> bool:
121
  low = path.lower()