Spaces:
Sleeping
Sleeping
Your Name commited on
Commit Β·
efa2d2a
1
Parent(s): 4f893da
fix(OpenEnv): implement system-wide [0.1, 0.9] boundary scrub for Phase 2 compliance
Browse files- environment.py +2 -2
- grader.py +5 -5
- models.py +9 -9
- openenv.yaml +7 -0
- reward.py +10 -10
- tasks/task_registry.py +1 -0
environment.py
CHANGED
|
@@ -106,7 +106,7 @@ class TeamForgeEnv:
|
|
| 106 |
|
| 107 |
# Reset episode state
|
| 108 |
self._step_number = 0
|
| 109 |
-
self._cumulative_reward = 0.
|
| 110 |
self._plan = []
|
| 111 |
self._reviews = []
|
| 112 |
self._reflections = []
|
|
@@ -122,7 +122,7 @@ class TeamForgeEnv:
|
|
| 122 |
action_type=None,
|
| 123 |
status=ActionStatus.SUCCESS,
|
| 124 |
output="Environment initialized.",
|
| 125 |
-
reward=0.
|
| 126 |
done=False,
|
| 127 |
)
|
| 128 |
return self._obs
|
|
|
|
| 106 |
|
| 107 |
# Reset episode state
|
| 108 |
self._step_number = 0
|
| 109 |
+
self._cumulative_reward = 0.1
|
| 110 |
self._plan = []
|
| 111 |
self._reviews = []
|
| 112 |
self._reflections = []
|
|
|
|
| 122 |
action_type=None,
|
| 123 |
status=ActionStatus.SUCCESS,
|
| 124 |
output="Environment initialized.",
|
| 125 |
+
reward=0.1,
|
| 126 |
done=False,
|
| 127 |
)
|
| 128 |
return self._obs
|
grader.py
CHANGED
|
@@ -132,9 +132,9 @@ def score_lint(repo_path: str) -> tuple[float, str]:
|
|
| 132 |
ln for ln in output.splitlines()
|
| 133 |
if re.match(r".+:\d+:\d+:", ln)
|
| 134 |
])
|
| 135 |
-
#
|
| 136 |
-
|
| 137 |
-
return _clamp(
|
| 138 |
|
| 139 |
|
| 140 |
def score_review_quality(
|
|
@@ -149,7 +149,7 @@ def score_review_quality(
|
|
| 149 |
|
| 150 |
# Anti-exploit: minimum meaningful length
|
| 151 |
if len(combined.strip()) < 40:
|
| 152 |
-
return 0.05
|
| 153 |
|
| 154 |
# Keyword coverage
|
| 155 |
if not required_keywords:
|
|
@@ -183,7 +183,7 @@ def score_reflection_quality(reflections: List[ReflectionArtifact]) -> float:
|
|
| 183 |
depth += 0.5
|
| 184 |
# Bonus if adjusted_plan provided
|
| 185 |
if ref.adjusted_plan and len(ref.adjusted_plan.strip()) > 20:
|
| 186 |
-
depth = min(
|
| 187 |
total += depth
|
| 188 |
|
| 189 |
# Strictly (0, 1) - Safer interior
|
|
|
|
| 132 |
ln for ln in output.splitlines()
|
| 133 |
if re.match(r".+:\d+:\d+:", ln)
|
| 134 |
])
|
| 135 |
+
# Strictly interior [SCORE_MIN, SCORE_MAX]
|
| 136 |
+
raw_score = 1.0 - violations * 0.07
|
| 137 |
+
return _clamp(raw_score), output
|
| 138 |
|
| 139 |
|
| 140 |
def score_review_quality(
|
|
|
|
| 149 |
|
| 150 |
# Anti-exploit: minimum meaningful length
|
| 151 |
if len(combined.strip()) < 40:
|
| 152 |
+
return _SCORE_MIN + 0.05
|
| 153 |
|
| 154 |
# Keyword coverage
|
| 155 |
if not required_keywords:
|
|
|
|
| 183 |
depth += 0.5
|
| 184 |
# Bonus if adjusted_plan provided
|
| 185 |
if ref.adjusted_plan and len(ref.adjusted_plan.strip()) > 20:
|
| 186 |
+
depth = min(0.9, depth + 0.2)
|
| 187 |
total += depth
|
| 188 |
|
| 189 |
# Strictly (0, 1) - Safer interior
|
models.py
CHANGED
|
@@ -129,7 +129,7 @@ class TestResult(BaseModel):
|
|
| 129 |
class LintResult(BaseModel):
|
| 130 |
violations: int = 0
|
| 131 |
output: str = ""
|
| 132 |
-
score: float = 0.
|
| 133 |
|
| 134 |
|
| 135 |
class ReviewArtifact(BaseModel):
|
|
@@ -175,8 +175,8 @@ class Observation(BaseModel):
|
|
| 175 |
reflections: List[ReflectionArtifact] = Field(default_factory=list)
|
| 176 |
|
| 177 |
# Signals
|
| 178 |
-
reward: float = 0.
|
| 179 |
-
cumulative_reward: float = 0.
|
| 180 |
done: bool = False
|
| 181 |
info: Dict[str, Any] = Field(default_factory=dict)
|
| 182 |
|
|
@@ -188,11 +188,11 @@ class Observation(BaseModel):
|
|
| 188 |
class EpisodeResult(BaseModel):
|
| 189 |
task_id: str
|
| 190 |
total_steps: int
|
| 191 |
-
test_pass_rate: float = 0.
|
| 192 |
-
lint_score: float = 0.
|
| 193 |
-
efficiency_score: float = 0.
|
| 194 |
-
review_quality: float = 0.
|
| 195 |
-
reflection_quality: float = 0.
|
| 196 |
-
final_score: float = 0.
|
| 197 |
passed: bool = False
|
| 198 |
log: List[str] = Field(default_factory=list)
|
|
|
|
| 129 |
class LintResult(BaseModel):
|
| 130 |
violations: int = 0
|
| 131 |
output: str = ""
|
| 132 |
+
score: float = 0.9 # 0.9 = clean
|
| 133 |
|
| 134 |
|
| 135 |
class ReviewArtifact(BaseModel):
|
|
|
|
| 175 |
reflections: List[ReflectionArtifact] = Field(default_factory=list)
|
| 176 |
|
| 177 |
# Signals
|
| 178 |
+
reward: float = 0.1
|
| 179 |
+
cumulative_reward: float = 0.1
|
| 180 |
done: bool = False
|
| 181 |
info: Dict[str, Any] = Field(default_factory=dict)
|
| 182 |
|
|
|
|
| 188 |
class EpisodeResult(BaseModel):
|
| 189 |
task_id: str
|
| 190 |
total_steps: int
|
| 191 |
+
test_pass_rate: float = 0.1
|
| 192 |
+
lint_score: float = 0.1
|
| 193 |
+
efficiency_score: float = 0.1
|
| 194 |
+
review_quality: float = 0.1
|
| 195 |
+
reflection_quality: float = 0.1
|
| 196 |
+
final_score: float = 0.1
|
| 197 |
passed: bool = False
|
| 198 |
log: List[str] = Field(default_factory=list)
|
openenv.yaml
CHANGED
|
@@ -144,6 +144,13 @@ tasks:
|
|
| 144 |
description: "Implement O(1) LRU cache from a stub. 15 correctness tests + 1 performance test (10k ops < 200ms)."
|
| 145 |
grader: grader.grade_task
|
| 146 |
score_range: [0.0, 1.0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
|
| 148 |
# ββ Infrastructure βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 149 |
runtime:
|
|
|
|
| 144 |
description: "Implement O(1) LRU cache from a stub. 15 correctness tests + 1 performance test (10k ops < 200ms)."
|
| 145 |
grader: grader.grade_task
|
| 146 |
score_range: [0.0, 1.0]
|
| 147 |
+
|
| 148 |
+
- id: bonus_task
|
| 149 |
+
difficulty: bonus
|
| 150 |
+
max_steps: 10
|
| 151 |
+
description: "Bonus: Optimize the LRU cache for memory efficiency. Gradual memory reduction is rewarded."
|
| 152 |
+
grader: grader.grade_task
|
| 153 |
+
score_range: [0.0, 1.0]
|
| 154 |
|
| 155 |
# ββ Infrastructure βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 156 |
runtime:
|
reward.py
CHANGED
|
@@ -30,13 +30,13 @@ REFLECT_REWARD = 0.10
|
|
| 30 |
TEST_PASS_BONUS_PER_TEST = 0.05
|
| 31 |
LINT_CLEAN_BONUS = 0.05
|
| 32 |
|
| 33 |
-
# Neutral/Small signals (replacing negative penalties to stay strictly in 0
|
| 34 |
-
# We use 0.
|
| 35 |
-
ACTION_ERROR_REWARD = 0.
|
| 36 |
-
REPEATED_FAILURE_REWARD = 0.
|
| 37 |
-
STEP_BASE_REWARD = 0.
|
| 38 |
-
TEST_MODIFICATION_REWARD = 0.
|
| 39 |
-
LINT_VIOLATION_REWARD = 0.
|
| 40 |
|
| 41 |
|
| 42 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -95,7 +95,7 @@ class RewardCalculator:
|
|
| 95 |
"run_tests": 0.02,
|
| 96 |
"run_lint": 0.02,
|
| 97 |
"request_iteration": 0.02,
|
| 98 |
-
}.get(action_type, 0.
|
| 99 |
|
| 100 |
# ββ Test progress bonus ββ
|
| 101 |
if tests_passed is not None:
|
|
@@ -114,8 +114,8 @@ class RewardCalculator:
|
|
| 114 |
reward += abs(delta) * LINT_VIOLATION_REWARD
|
| 115 |
self._prev_lint_violations = lint_violations
|
| 116 |
|
| 117 |
-
# Final clamp to strictly within
|
| 118 |
-
return round(max(0.
|
| 119 |
|
| 120 |
def _is_test_file(self, path: str) -> bool:
|
| 121 |
low = path.lower()
|
|
|
|
| 30 |
TEST_PASS_BONUS_PER_TEST = 0.05
|
| 31 |
LINT_CLEAN_BONUS = 0.05
|
| 32 |
|
| 33 |
+
# Neutral/Small signals (replacing negative penalties to stay strictly in 0.1-0.9 range)
|
| 34 |
+
# We use 0.1 to satisfy "strictly between 0 and 1" requirement with high rounding safety
|
| 35 |
+
ACTION_ERROR_REWARD = 0.1
|
| 36 |
+
REPEATED_FAILURE_REWARD = 0.1
|
| 37 |
+
STEP_BASE_REWARD = 0.1
|
| 38 |
+
TEST_MODIFICATION_REWARD = 0.1
|
| 39 |
+
LINT_VIOLATION_REWARD = 0.1
|
| 40 |
|
| 41 |
|
| 42 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 95 |
"run_tests": 0.02,
|
| 96 |
"run_lint": 0.02,
|
| 97 |
"request_iteration": 0.02,
|
| 98 |
+
}.get(action_type, 0.1)
|
| 99 |
|
| 100 |
# ββ Test progress bonus ββ
|
| 101 |
if tests_passed is not None:
|
|
|
|
| 114 |
reward += abs(delta) * LINT_VIOLATION_REWARD
|
| 115 |
self._prev_lint_violations = lint_violations
|
| 116 |
|
| 117 |
+
# Final clamp to strictly within [0.1, 0.9] per OpenEnv validator requirement
|
| 118 |
+
return round(max(0.1, min(0.9, reward)), 4)
|
| 119 |
|
| 120 |
def _is_test_file(self, path: str) -> bool:
|
| 121 |
low = path.lower()
|
tasks/task_registry.py
CHANGED
|
@@ -7,6 +7,7 @@ TASK_REGISTRY: Dict[str, Any] = {
|
|
| 7 |
easy_task.TASK_ID: easy_task,
|
| 8 |
medium_task.TASK_ID: medium_task,
|
| 9 |
hard_task.TASK_ID: hard_task,
|
|
|
|
| 10 |
}
|
| 11 |
|
| 12 |
# The 3 scored tasks for the hackathon (easy, medium, hard)
|
|
|
|
| 7 |
easy_task.TASK_ID: easy_task,
|
| 8 |
medium_task.TASK_ID: medium_task,
|
| 9 |
hard_task.TASK_ID: hard_task,
|
| 10 |
+
bonus_task.TASK_ID: bonus_task,
|
| 11 |
}
|
| 12 |
|
| 13 |
# The 3 scored tasks for the hackathon (easy, medium, hard)
|