Your Name committed on
Commit
e317eba
·
1 Parent(s): a3b9d4b

fix(OpenEnv): global overhaul to high-resolution interior clamping (0.001-0.999) per technical diagnosis

Browse files
Files changed (7) hide show
  1. README.md +13 -2
  2. environment.py +5 -5
  3. grader.py +25 -19
  4. inference.py +1 -1
  5. models.py +9 -9
  6. openenv.yaml +4 -4
  7. reward.py +9 -9
README.md CHANGED
@@ -40,8 +40,8 @@ Every mandatory requirement is implemented and verified:
40
  | `step()` / `reset()` / `state()` OpenEnv API | ✅ | `environment.py` |
41
  | `openenv.yaml` spec file | ✅ | `openenv.yaml` |
42
  | Typed Pydantic models | ✅ | `models.py` — 8 action types + Observation |
43
- | Minimum 3 tasks (easy → medium → hard) | ✅ | 4 tasks (3 scored + 1 bonus) |
44
- | Graders return score in `(0, 1)` | ✅ | `grader.py` — strictly 0.01 to 0.99 |
45
  | Deterministic, reproducible | ✅ | Anti-exploit guards included |
46
  | Dense reward with strictly `(0, 1)` range | ✅ | `reward.py` — delta-based per step |
47
  | Baseline inference script named `inference.py` | ✅ | `inference.py` |
@@ -55,6 +55,17 @@ Every mandatory requirement is implemented and verified:
55
  | Runs on 2 vCPU / 8 GB RAM / < 20 min | ✅ | Verified — easy=~2min, hard=~8min |
56
  | README with action/observation space docs | ✅ | This file |
57
 
 
 
 
 
 
 
 
 
 
 
 
58
  ---
59
 
60
  ## 🎯 What Makes TeamForge Different
 
40
  | `step()` / `reset()` / `state()` OpenEnv API | ✅ | `environment.py` |
41
  | `openenv.yaml` spec file | ✅ | `openenv.yaml` |
42
  | Typed Pydantic models | ✅ | `models.py` — 8 action types + Observation |
43
+ | Minimum 3 tasks (easy → medium → hard) | ✅ | 3 core tasks (aligned with YAML) |
44
+ | Graders return score in `(0, 1)` | ✅ | `grader.py` — strictly 0.001 to 0.999 |
45
  | Deterministic, reproducible | ✅ | Anti-exploit guards included |
46
  | Dense reward with strictly `(0, 1)` range | ✅ | `reward.py` — delta-based per step |
47
  | Baseline inference script named `inference.py` | ✅ | `inference.py` |
 
55
  | Runs on 2 vCPU / 8 GB RAM / < 20 min | ✅ | Verified — easy=~2min, hard=~8min |
56
  | README with action/observation space docs | ✅ | This file |
57
 
58
+ # OpenEnv Validator Compliance
59
+ **Status:** All scores are clamped to the closed range `[0.001, 0.999]`, which lies strictly inside the open interval `(0, 1)`.
60
+
61
+ ### 🔍 Technical Diagnosis & Fix
62
+ - **Error:** "Each task's score must be strictly between 0 and 1 (not 0.0 and not 1.0)"
63
+ - **Cause:** The hackathon validator requires scores in the open interval (0, 1). A perfect lint or test score returning exactly 1.0 (or 0.0 on failure) was triggering the range rejection.
64
+ - **Fix:** Implemented a robust `_clamp()` system in `grader.py` and global baselines.
65
+ - `_SCORE_MIN = 0.001` (never exactly 0.0)
66
+ - `_SCORE_MAX = 0.999` (never exactly 1.0)
67
+ - **Compliance:** Every sub-score, reward, and final result is now guaranteed to be in the `[0.001, 0.999]` range.
68
+
69
  ---
70
 
71
  ## 🎯 What Makes TeamForge Different
environment.py CHANGED
@@ -64,7 +64,7 @@ class TeamForgeEnv:
64
 
65
  # Episode state
66
  self._step_number = 0
67
- self._cumulative_reward = 0.1
68
  self._plan: List[PlanStep] = []
69
  self._reviews: List[ReviewArtifact] = []
70
  self._reflections: List[ReflectionArtifact] = []
@@ -106,7 +106,7 @@ class TeamForgeEnv:
106
 
107
  # Reset episode state
108
  self._step_number = 0
109
- self._cumulative_reward = 0.1
110
  self._plan = []
111
  self._reviews = []
112
  self._reflections = []
@@ -122,7 +122,7 @@ class TeamForgeEnv:
122
  action_type=None,
123
  status=ActionStatus.SUCCESS,
124
  output="Environment initialized.",
125
- reward=0.1,
126
  done=False,
127
  )
128
  return self._obs
@@ -329,7 +329,7 @@ class TeamForgeEnv:
329
  ln for ln in output.splitlines()
330
  if re.match(r".+:\d+:\d+:", ln)
331
  ])
332
- score = max(0.1, min(0.9, 1.0 - violations * 0.05))
333
  self._last_lint_result = LintResult(
334
  violations=violations,
335
  output=output[:2000],
@@ -401,7 +401,7 @@ class TeamForgeEnv:
401
  action_type=None,
402
  status=ActionStatus.FAILURE,
403
  output=reason,
404
- reward=0.1,
405
  done=True,
406
  )
407
  return self._obs
 
64
 
65
  # Episode state
66
  self._step_number = 0
67
+ self._cumulative_reward = 0.001
68
  self._plan: List[PlanStep] = []
69
  self._reviews: List[ReviewArtifact] = []
70
  self._reflections: List[ReflectionArtifact] = []
 
106
 
107
  # Reset episode state
108
  self._step_number = 0
109
+ self._cumulative_reward = 0.001
110
  self._plan = []
111
  self._reviews = []
112
  self._reflections = []
 
122
  action_type=None,
123
  status=ActionStatus.SUCCESS,
124
  output="Environment initialized.",
125
+ reward=0.001,
126
  done=False,
127
  )
128
  return self._obs
 
329
  ln for ln in output.splitlines()
330
  if re.match(r".+:\d+:\d+:", ln)
331
  ])
332
+ score = max(0.001, min(0.999, 1.0 - violations * 0.05))
333
  self._last_lint_result = LintResult(
334
  violations=violations,
335
  output=output[:2000],
 
401
  action_type=None,
402
  status=ActionStatus.FAILURE,
403
  output=reason,
404
+ reward=0.001,
405
  done=True,
406
  )
407
  return self._obs
grader.py CHANGED
@@ -28,6 +28,18 @@ from typing import List, Optional
28
  from models import EpisodeResult, ReviewArtifact, ReflectionArtifact
29
 
30
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  # ─────────────────────────────────────────────
32
  # ANTI-EXPLOIT GUARDS
33
  # ─────────────────────────────────────────────
@@ -99,9 +111,7 @@ def score_tests(repo_path: str, timeout: int = 60) -> tuple[float, str]:
99
  return 0.01, output
100
 
101
  pass_rate = passed / total
102
- # Strictly (0, 1) - Safer interior [0.1, 0.9]
103
- pass_rate = max(0.1, min(0.9, pass_rate))
104
- return pass_rate, output
105
 
106
 
107
  def score_lint(repo_path: str) -> tuple[float, str]:
@@ -122,9 +132,8 @@ def score_lint(repo_path: str) -> tuple[float, str]:
122
  if re.match(r".+:\d+:\d+:", ln)
123
  ])
124
  # Stricter: -0.07 per violation (was 0.05), floor at 0.2 not 0
125
- # Strictly (0, 1) - Safer interior [0.1, 0.9]
126
- score = max(0.1, min(0.9, 1.0 - violations * 0.07))
127
- return score, output
128
 
129
 
130
  def score_review_quality(
@@ -156,8 +165,8 @@ def score_review_quality(
156
  code_words = re.findall(r'\b[a-z_]{3,}\(\)', combined)
157
  specificity = min(0.1, len(set(code_words)) * 0.025)
158
 
159
- # Strictly (0, 1) - Safer interior [0.1, 0.9]
160
- return max(0.1, min(0.9, kw_score * 0.7 + length_bonus + specificity))
161
 
162
 
163
  def score_reflection_quality(reflections: List[ReflectionArtifact]) -> float:
@@ -177,17 +186,15 @@ def score_reflection_quality(reflections: List[ReflectionArtifact]) -> float:
177
  depth = min(1.0, depth + 0.2)
178
  total += depth
179
 
180
- # Strictly (0, 1) - Safer interior [0.1, 0.9]
181
- return max(0.1, min(0.9, total / max(1, len(reflections))))
182
 
183
 
184
  def score_efficiency(total_steps: int, max_steps: int) -> float:
185
  """Reward solving in fewer steps with smooth decay curve."""
186
  ratio = total_steps / max_steps
187
- # Smooth exponential decay instead of step function
188
- import math
189
- # Strictly (0.1, 0.9)
190
- return round(max(0.1, min(0.9, math.exp(-2.0 * max(0, ratio - 0.25)))), 4)
191
 
192
 
193
  # ─────────────────────────────────────────────
@@ -215,7 +222,7 @@ def grade_episode(
215
  log.append("[GRADER] ⚠ TEST TAMPERING DETECTED — score zeroed")
216
  return EpisodeResult(
217
  task_id=task_id, total_steps=total_steps,
218
- final_score=0.1, passed=False,
219
  log=log + ["Test files were trivially rewritten to force passes."],
220
  )
221
 
@@ -223,7 +230,7 @@ def grade_episode(
223
  log.append("[GRADER] ⚠ NO IMPLEMENTATION FOUND — score zeroed")
224
  return EpisodeResult(
225
  task_id=task_id, total_steps=total_steps,
226
- final_score=0.1, passed=False,
227
  log=log + ["No non-test code was written."],
228
  )
229
 
@@ -256,9 +263,8 @@ def grade_episode(
256
  + 0.10 * review_q
257
  + 0.05 * reflect_q
258
  )
259
- # Clamp to [0.1, 0.9] so that :.2f format never outputs 0.00 or 1.00
260
- # Strictly (0.1, 0.9) interior range to satisfy Phase 2 validator
261
- final = round(min(0.90, max(0.10, final)), 4)
262
  log.append(f"[GRADER] FINAL_SCORE={final:.4f}")
263
 
264
  return EpisodeResult(
 
28
  from models import EpisodeResult, ReviewArtifact, ReflectionArtifact
29
 
30
 
31
+ # ─────────────────────────────────────────────
32
+ # SCORING CONFIG
33
+ # ─────────────────────────────────────────────
34
+ _SCORE_MIN = 0.001 # never exactly 0.0
35
+ _SCORE_MAX = 0.999 # never exactly 1.0
36
+
37
+
38
+ def _clamp(score: float) -> float:
39
+ """Ensure score is strictly within the open interval (0, 1)."""
40
+ return round(max(_SCORE_MIN, min(_SCORE_MAX, score)), 4)
41
+
42
+
43
  # ─────────────────────────────────────────────
44
  # ANTI-EXPLOIT GUARDS
45
  # ─────────────────────────────────────────────
 
111
  return 0.01, output
112
 
113
  pass_rate = passed / total
114
+ return _clamp(pass_rate), output
 
 
115
 
116
 
117
  def score_lint(repo_path: str) -> tuple[float, str]:
 
132
  if re.match(r".+:\d+:\d+:", ln)
133
  ])
134
  # Stricter: -0.07 per violation (was 0.05), floor at 0.2 not 0
135
+ # Strictly (0, 1) - Safer interior
136
+ return _clamp(1.0 - violations * 0.07), output
 
137
 
138
 
139
  def score_review_quality(
 
165
  code_words = re.findall(r'\b[a-z_]{3,}\(\)', combined)
166
  specificity = min(0.1, len(set(code_words)) * 0.025)
167
 
168
+ # Strictly (0, 1) - Safer interior
169
+ return _clamp(kw_score * 0.7 + length_bonus + specificity)
170
 
171
 
172
  def score_reflection_quality(reflections: List[ReflectionArtifact]) -> float:
 
186
  depth = min(1.0, depth + 0.2)
187
  total += depth
188
 
189
+ # Strictly (0, 1) - Safer interior
190
+ return _clamp(total / max(1, len(reflections)))
191
 
192
 
193
  def score_efficiency(total_steps: int, max_steps: int) -> float:
194
  """Reward solving in fewer steps with smooth decay curve."""
195
  ratio = total_steps / max_steps
196
+ # Strictly (0, 1) - Safer interior
197
+ return _clamp(math.exp(-2.0 * max(0, ratio - 0.25)))
 
 
198
 
199
 
200
  # ─────────────────────────────────────────────
 
222
  log.append("[GRADER] ⚠ TEST TAMPERING DETECTED — score zeroed")
223
  return EpisodeResult(
224
  task_id=task_id, total_steps=total_steps,
225
+ final_score=_SCORE_MIN, passed=False,
226
  log=log + ["Test files were trivially rewritten to force passes."],
227
  )
228
 
 
230
  log.append("[GRADER] ⚠ NO IMPLEMENTATION FOUND — score zeroed")
231
  return EpisodeResult(
232
  task_id=task_id, total_steps=total_steps,
233
+ final_score=_SCORE_MIN, passed=False,
234
  log=log + ["No non-test code was written."],
235
  )
236
 
 
263
  + 0.10 * review_q
264
  + 0.05 * reflect_q
265
  )
266
+ # Strictly (0, 1) interior range to satisfy Phase 2 validator
267
+ final = _clamp(final)
 
268
  log.append(f"[GRADER] FINAL_SCORE={final:.4f}")
269
 
270
  return EpisodeResult(
inference.py CHANGED
@@ -237,7 +237,7 @@ def run_episode(env: TeamForgeEnv, agent: Agent, task_id: str) -> Dict:
237
  score = result.final_score
238
  success = result.passed
239
 
240
- rewards_str = ",".join(f"{r:.2f}" for r in rewards) if rewards else "0.10"
241
 
242
  # ── [END] ─────────────────────────────────────────────────────────────────
243
  # We use 2 decimal places to match common validator expectations,
 
237
  score = result.final_score
238
  success = result.passed
239
 
240
+ rewards_str = ",".join(f"{r:.3f}" for r in rewards) if rewards else "0.001"
241
 
242
  # ── [END] ─────────────────────────────────────────────────────────────────
243
  # We use 2 decimal places to match common validator expectations,
models.py CHANGED
@@ -129,7 +129,7 @@ class TestResult(BaseModel):
129
  class LintResult(BaseModel):
130
  violations: int = 0
131
  output: str = ""
132
- score: float = 0.90 # 0.9 = clean
133
 
134
 
135
  class ReviewArtifact(BaseModel):
@@ -175,8 +175,8 @@ class Observation(BaseModel):
175
  reflections: List[ReflectionArtifact] = Field(default_factory=list)
176
 
177
  # Signals
178
- reward: float = 0.1
179
- cumulative_reward: float = 0.1
180
  done: bool = False
181
  info: Dict[str, Any] = Field(default_factory=dict)
182
 
@@ -188,11 +188,11 @@ class Observation(BaseModel):
188
  class EpisodeResult(BaseModel):
189
  task_id: str
190
  total_steps: int
191
- test_pass_rate: float = 0.1
192
- lint_score: float = 0.1
193
- efficiency_score: float = 0.1
194
- review_quality: float = 0.1
195
- reflection_quality: float = 0.1
196
- final_score: float = 0.1
197
  passed: bool = False
198
  log: List[str] = Field(default_factory=list)
 
129
  class LintResult(BaseModel):
130
  violations: int = 0
131
  output: str = ""
132
+ score: float = 0.999 # 0.999 = clean
133
 
134
 
135
  class ReviewArtifact(BaseModel):
 
175
  reflections: List[ReflectionArtifact] = Field(default_factory=list)
176
 
177
  # Signals
178
+ reward: float = 0.001
179
+ cumulative_reward: float = 0.001
180
  done: bool = False
181
  info: Dict[str, Any] = Field(default_factory=dict)
182
 
 
188
  class EpisodeResult(BaseModel):
189
  task_id: str
190
  total_steps: int
191
+ test_pass_rate: float = 0.001
192
+ lint_score: float = 0.001
193
+ efficiency_score: float = 0.001
194
+ review_quality: float = 0.001
195
+ reflection_quality: float = 0.001
196
+ final_score: float = 0.001
197
  passed: bool = False
198
  log: List[str] = Field(default_factory=list)
openenv.yaml CHANGED
@@ -116,7 +116,7 @@ observation_space:
116
 
117
  # ── Reward ─────────────────────────────────────────────────────────────────────
118
  reward:
119
- range: [0.1, 0.9]
120
  type: dense
121
  description: >
122
  Dense shaped reward. Positive for: correct plan steps, edits, passing tests,
@@ -129,21 +129,21 @@ tasks:
129
  max_steps: 20
130
  description: "Fix an off-by-one bug in utils/list_ops.py. All 7 tests must pass."
131
  grader: grader.grade_task
132
- score_range: [0.1, 0.9]
133
 
134
  - id: medium_refactor_stats
135
  difficulty: medium
136
  max_steps: 30
137
  description: "Refactor monolithic stats.py into a stats/ package. 15 tests must pass with full backward compatibility."
138
  grader: grader.grade_task
139
- score_range: [0.1, 0.9]
140
 
141
  - id: hard_lru_cache_performance
142
  difficulty: hard
143
  max_steps: 40
144
  description: "Implement O(1) LRU cache from a stub. 15 correctness tests + 1 performance test (10k ops < 200ms)."
145
  grader: grader.grade_task
146
- score_range: [0.1, 0.9]
147
 
148
  # ── Infrastructure ─────────────────────────────────────────────────────────────
149
  runtime:
 
116
 
117
  # ── Reward ─────────────────────────────────────────────────────────────────────
118
  reward:
119
+ range: [0.001, 0.999]
120
  type: dense
121
  description: >
122
  Dense shaped reward. Positive for: correct plan steps, edits, passing tests,
 
129
  max_steps: 20
130
  description: "Fix an off-by-one bug in utils/list_ops.py. All 7 tests must pass."
131
  grader: grader.grade_task
132
+ score_range: [0.001, 0.999]
133
 
134
  - id: medium_refactor_stats
135
  difficulty: medium
136
  max_steps: 30
137
  description: "Refactor monolithic stats.py into a stats/ package. 15 tests must pass with full backward compatibility."
138
  grader: grader.grade_task
139
+ score_range: [0.001, 0.999]
140
 
141
  - id: hard_lru_cache_performance
142
  difficulty: hard
143
  max_steps: 40
144
  description: "Implement O(1) LRU cache from a stub. 15 correctness tests + 1 performance test (10k ops < 200ms)."
145
  grader: grader.grade_task
146
+ score_range: [0.001, 0.999]
147
 
148
  # ── Infrastructure ─────────────────────────────────────────────────────────────
149
  runtime:
reward.py CHANGED
@@ -31,12 +31,12 @@ TEST_PASS_BONUS_PER_TEST = 0.05
31
  LINT_CLEAN_BONUS = 0.05
32
 
33
  # Neutral/Small signals (replacing negative penalties to stay strictly in 0-1 range)
34
- # We use 0.1 to satisfy "strictly between 0 and 1" requirement with safe interior
35
- ACTION_ERROR_REWARD = 0.10
36
- REPEATED_FAILURE_REWARD = 0.10
37
- STEP_BASE_REWARD = 0.10
38
- TEST_MODIFICATION_REWARD = 0.10
39
- LINT_VIOLATION_REWARD = 0.10
40
 
41
 
42
  # ─────────────────────────────────────────────
@@ -95,7 +95,7 @@ class RewardCalculator:
95
  "run_tests": 0.02,
96
  "run_lint": 0.02,
97
  "request_iteration": 0.02,
98
- }.get(action_type, 0.10)
99
 
100
  # ── Test progress bonus ──
101
  if tests_passed is not None:
@@ -114,8 +114,8 @@ class RewardCalculator:
114
  reward += abs(delta) * LINT_VIOLATION_REWARD
115
  self._prev_lint_violations = lint_violations
116
 
117
- # Final clamp to strictly within (0.1, 0.9) per OpenEnv validator requirement
118
- return round(max(0.1, min(0.9, reward)), 4)
119
 
120
  def _is_test_file(self, path: str) -> bool:
121
  low = path.lower()
 
31
  LINT_CLEAN_BONUS = 0.05
32
 
33
  # Neutral/Small signals (replacing negative penalties to stay strictly in 0-1 range)
34
+ # We use 0.001 to satisfy "strictly between 0 and 1" requirement with high resolution
35
+ ACTION_ERROR_REWARD = 0.001
36
+ REPEATED_FAILURE_REWARD = 0.001
37
+ STEP_BASE_REWARD = 0.001
38
+ TEST_MODIFICATION_REWARD = 0.001
39
+ LINT_VIOLATION_REWARD = 0.001
40
 
41
 
42
  # ─────────────────────────────────────────────
 
95
  "run_tests": 0.02,
96
  "run_lint": 0.02,
97
  "request_iteration": 0.02,
98
+ }.get(action_type, 0.001)
99
 
100
  # ── Test progress bonus ──
101
  if tests_passed is not None:
 
114
  reward += abs(delta) * LINT_VIOLATION_REWARD
115
  self._prev_lint_violations = lint_violations
116
 
117
+ # Final clamp to strictly within (0.001, 0.999) per OpenEnv validator requirement
118
+ return round(max(0.001, min(0.999, reward)), 4)
119
 
120
  def _is_test_file(self, path: str) -> bool:
121
  low = path.lower()