Your Name committed on
Commit
efa2d2a
·
1 Parent(s): 4f893da

fix(OpenEnv): implement system-wide [0.1, 0.9] boundary scrub for Phase 2 compliance

Browse files
Files changed (6) hide show
  1. environment.py +2 -2
  2. grader.py +5 -5
  3. models.py +9 -9
  4. openenv.yaml +7 -0
  5. reward.py +10 -10
  6. tasks/task_registry.py +1 -0
environment.py CHANGED
@@ -106,7 +106,7 @@ class TeamForgeEnv:
106
 
107
  # Reset episode state
108
  self._step_number = 0
109
- self._cumulative_reward = 0.001
110
  self._plan = []
111
  self._reviews = []
112
  self._reflections = []
@@ -122,7 +122,7 @@ class TeamForgeEnv:
122
  action_type=None,
123
  status=ActionStatus.SUCCESS,
124
  output="Environment initialized.",
125
- reward=0.001,
126
  done=False,
127
  )
128
  return self._obs
 
106
 
107
  # Reset episode state
108
  self._step_number = 0
109
+ self._cumulative_reward = 0.1
110
  self._plan = []
111
  self._reviews = []
112
  self._reflections = []
 
122
  action_type=None,
123
  status=ActionStatus.SUCCESS,
124
  output="Environment initialized.",
125
+ reward=0.1,
126
  done=False,
127
  )
128
  return self._obs
grader.py CHANGED
@@ -132,9 +132,9 @@ def score_lint(repo_path: str) -> tuple[float, str]:
132
  ln for ln in output.splitlines()
133
  if re.match(r".+:\d+:\d+:", ln)
134
  ])
135
- # Stricter: -0.07 per violation (was 0.05), floor at 0.2 not 0
136
- # Strictly (0, 1) - Safer interior
137
- return _clamp(1.0 - violations * 0.07), output
138
 
139
 
140
  def score_review_quality(
@@ -149,7 +149,7 @@ def score_review_quality(
149
 
150
  # Anti-exploit: minimum meaningful length
151
  if len(combined.strip()) < 40:
152
- return 0.05
153
 
154
  # Keyword coverage
155
  if not required_keywords:
@@ -183,7 +183,7 @@ def score_reflection_quality(reflections: List[ReflectionArtifact]) -> float:
183
  depth += 0.5
184
  # Bonus if adjusted_plan provided
185
  if ref.adjusted_plan and len(ref.adjusted_plan.strip()) > 20:
186
- depth = min(1.0, depth + 0.2)
187
  total += depth
188
 
189
  # Strictly (0, 1) - Safer interior
 
132
  ln for ln in output.splitlines()
133
  if re.match(r".+:\d+:\d+:", ln)
134
  ])
135
+ # Strictly interior [SCORE_MIN, SCORE_MAX]
136
+ raw_score = 1.0 - violations * 0.07
137
+ return _clamp(raw_score), output
138
 
139
 
140
  def score_review_quality(
 
149
 
150
  # Anti-exploit: minimum meaningful length
151
  if len(combined.strip()) < 40:
152
+ return _SCORE_MIN + 0.05
153
 
154
  # Keyword coverage
155
  if not required_keywords:
 
183
  depth += 0.5
184
  # Bonus if adjusted_plan provided
185
  if ref.adjusted_plan and len(ref.adjusted_plan.strip()) > 20:
186
+ depth = min(0.9, depth + 0.2)
187
  total += depth
188
 
189
  # Strictly (0, 1) - Safer interior
models.py CHANGED
@@ -129,7 +129,7 @@ class TestResult(BaseModel):
129
  class LintResult(BaseModel):
130
  violations: int = 0
131
  output: str = ""
132
- score: float = 0.999 # 0.999 = clean
133
 
134
 
135
  class ReviewArtifact(BaseModel):
@@ -175,8 +175,8 @@ class Observation(BaseModel):
175
  reflections: List[ReflectionArtifact] = Field(default_factory=list)
176
 
177
  # Signals
178
- reward: float = 0.001
179
- cumulative_reward: float = 0.001
180
  done: bool = False
181
  info: Dict[str, Any] = Field(default_factory=dict)
182
 
@@ -188,11 +188,11 @@ class Observation(BaseModel):
188
  class EpisodeResult(BaseModel):
189
  task_id: str
190
  total_steps: int
191
- test_pass_rate: float = 0.001
192
- lint_score: float = 0.001
193
- efficiency_score: float = 0.001
194
- review_quality: float = 0.001
195
- reflection_quality: float = 0.001
196
- final_score: float = 0.001
197
  passed: bool = False
198
  log: List[str] = Field(default_factory=list)
 
129
  class LintResult(BaseModel):
130
  violations: int = 0
131
  output: str = ""
132
+ score: float = 0.9 # 0.9 = clean
133
 
134
 
135
  class ReviewArtifact(BaseModel):
 
175
  reflections: List[ReflectionArtifact] = Field(default_factory=list)
176
 
177
  # Signals
178
+ reward: float = 0.1
179
+ cumulative_reward: float = 0.1
180
  done: bool = False
181
  info: Dict[str, Any] = Field(default_factory=dict)
182
 
 
188
  class EpisodeResult(BaseModel):
189
  task_id: str
190
  total_steps: int
191
+ test_pass_rate: float = 0.1
192
+ lint_score: float = 0.1
193
+ efficiency_score: float = 0.1
194
+ review_quality: float = 0.1
195
+ reflection_quality: float = 0.1
196
+ final_score: float = 0.1
197
  passed: bool = False
198
  log: List[str] = Field(default_factory=list)
openenv.yaml CHANGED
@@ -144,6 +144,13 @@ tasks:
144
  description: "Implement O(1) LRU cache from a stub. 15 correctness tests + 1 performance test (10k ops < 200ms)."
145
  grader: grader.grade_task
146
  score_range: [0.0, 1.0]
 
 
 
 
 
 
 
147
 
148
  # ── Infrastructure ─────────────────────────────────────────────────────────────
149
  runtime:
 
144
  description: "Implement O(1) LRU cache from a stub. 15 correctness tests + 1 performance test (10k ops < 200ms)."
145
  grader: grader.grade_task
146
  score_range: [0.0, 1.0]
147
+
148
+ - id: bonus_task
149
+ difficulty: bonus
150
+ max_steps: 10
151
+ description: "Bonus: Optimize the LRU cache for memory efficiency. Gradual memory reduction is rewarded."
152
+ grader: grader.grade_task
153
+ score_range: [0.0, 1.0]
154
 
155
  # ── Infrastructure ─────────────────────────────────────────────────────────────
156
  runtime:
reward.py CHANGED
@@ -30,13 +30,13 @@ REFLECT_REWARD = 0.10
30
  TEST_PASS_BONUS_PER_TEST = 0.05
31
  LINT_CLEAN_BONUS = 0.05
32
 
33
- # Neutral/Small signals (replacing negative penalties to stay strictly in 0-1 range)
34
- # We use 0.001 to satisfy "strictly between 0 and 1" requirement with high resolution
35
- ACTION_ERROR_REWARD = 0.001
36
- REPEATED_FAILURE_REWARD = 0.001
37
- STEP_BASE_REWARD = 0.001
38
- TEST_MODIFICATION_REWARD = 0.001
39
- LINT_VIOLATION_REWARD = 0.001
40
 
41
 
42
  # ─────────────────────────────────────────────
@@ -95,7 +95,7 @@ class RewardCalculator:
95
  "run_tests": 0.02,
96
  "run_lint": 0.02,
97
  "request_iteration": 0.02,
98
- }.get(action_type, 0.001)
99
 
100
  # ── Test progress bonus ──
101
  if tests_passed is not None:
@@ -114,8 +114,8 @@ class RewardCalculator:
114
  reward += abs(delta) * LINT_VIOLATION_REWARD
115
  self._prev_lint_violations = lint_violations
116
 
117
- # Final clamp to strictly within (0.001, 0.999) per OpenEnv validator requirement
118
- return round(max(0.001, min(0.999, reward)), 4)
119
 
120
  def _is_test_file(self, path: str) -> bool:
121
  low = path.lower()
 
30
  TEST_PASS_BONUS_PER_TEST = 0.05
31
  LINT_CLEAN_BONUS = 0.05
32
 
33
+ # Neutral/Small signals (replacing negative penalties to stay strictly in 0.1-0.9 range)
34
+ # We use 0.1 to satisfy "strictly between 0 and 1" requirement with high rounding safety
35
+ ACTION_ERROR_REWARD = 0.1
36
+ REPEATED_FAILURE_REWARD = 0.1
37
+ STEP_BASE_REWARD = 0.1
38
+ TEST_MODIFICATION_REWARD = 0.1
39
+ LINT_VIOLATION_REWARD = 0.1
40
 
41
 
42
  # ─────────────────────────────────────────────
 
95
  "run_tests": 0.02,
96
  "run_lint": 0.02,
97
  "request_iteration": 0.02,
98
+ }.get(action_type, 0.1)
99
 
100
  # ── Test progress bonus ──
101
  if tests_passed is not None:
 
114
  reward += abs(delta) * LINT_VIOLATION_REWARD
115
  self._prev_lint_violations = lint_violations
116
 
117
+ # Final clamp to strictly within [0.1, 0.9] per OpenEnv validator requirement
118
+ return round(max(0.1, min(0.9, reward)), 4)
119
 
120
  def _is_test_file(self, path: str) -> bool:
121
  low = path.lower()
tasks/task_registry.py CHANGED
@@ -7,6 +7,7 @@ TASK_REGISTRY: Dict[str, Any] = {
7
  easy_task.TASK_ID: easy_task,
8
  medium_task.TASK_ID: medium_task,
9
  hard_task.TASK_ID: hard_task,
 
10
  }
11
 
12
  # The 3 scored tasks for the hackathon (easy, medium, hard)
 
7
  easy_task.TASK_ID: easy_task,
8
  medium_task.TASK_ID: medium_task,
9
  hard_task.TASK_ID: hard_task,
10
+ bonus_task.TASK_ID: bonus_task,
11
  }
12
 
13
  # The 3 scored tasks for the hackathon (easy, medium, hard)