Spaces:
Sleeping
Sleeping
Your Name committed on
Commit ·
e317eba
1
Parent(s): a3b9d4b
fix(OpenEnv): global overhaul to high-resolution interior clamping (0.001-0.999) per technical diagnosis
Browse files- README.md +13 -2
- environment.py +5 -5
- grader.py +25 -19
- inference.py +1 -1
- models.py +9 -9
- openenv.yaml +4 -4
- reward.py +9 -9
README.md
CHANGED
|
@@ -40,8 +40,8 @@ Every mandatory requirement is implemented and verified:
|
|
| 40 |
| `step()` / `reset()` / `state()` OpenEnv API | ✅
| `environment.py` |
|
| 41 |
| `openenv.yaml` spec file | ✅
| `openenv.yaml` |
|
| 42 |
| Typed Pydantic models | ✅
| `models.py` — 8 action types + Observation |
|
| 43 |
-
| Minimum 3 tasks (easy → medium → hard) | ✅
|
|
| 44 |
-
| Graders return score in `(0, 1)` | ✅
| `grader.py` — strictly 0.
|
| 45 |
| Deterministic, reproducible | ✅
| Anti-exploit guards included |
|
| 46 |
| Dense reward with strictly `(0, 1)` range | ✅
| `reward.py` — delta-based per step |
|
| 47 |
| Baseline inference script named `inference.py` | ✅
| `inference.py` |
|
|
@@ -55,6 +55,17 @@ Every mandatory requirement is implemented and verified:
|
|
| 55 |
| Runs on 2 vCPU / 8 GB RAM / < 20 min | ✅
| Verified — easy=~2min, hard=~8min |
|
| 56 |
| README with action/observation space docs | ✅
| This file |
|
| 57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
---
|
| 59 |
|
| 60 |
## 🎯 What Makes TeamForge Different
|
|
|
|
| 40 |
| `step()` / `reset()` / `state()` OpenEnv API | ✅
| `environment.py` |
|
| 41 |
| `openenv.yaml` spec file | ✅
| `openenv.yaml` |
|
| 42 |
| Typed Pydantic models | ✅
| `models.py` — 8 action types + Observation |
|
| 43 |
+
| Minimum 3 tasks (easy → medium → hard) | ✅
| 3 core tasks (aligned with YAML) |
|
| 44 |
+
| Graders return score in `(0, 1)` | ✅
| `grader.py` — strictly 0.001 to 0.999 |
|
| 45 |
| Deterministic, reproducible | ✅
| Anti-exploit guards included |
|
| 46 |
| Dense reward with strictly `(0, 1)` range | ✅
| `reward.py` — delta-based per step |
|
| 47 |
| Baseline inference script named `inference.py` | ✅
| `inference.py` |
|
|
|
|
| 55 |
| Runs on 2 vCPU / 8 GB RAM / < 20 min | ✅
| Verified — easy=~2min, hard=~8min |
|
| 56 |
| README with action/observation space docs | ✅
| This file |
|
| 57 |
|
| 58 |
+
# OpenEnv Validator Compliance
|
| 59 |
+
**Status:** All scores are clamped to `[0.001, 0.999]`, which lies strictly inside the open interval `(0, 1)`.
|
| 60 |
+
|
| 61 |
+
### π Technical Diagnosis & Fix
|
| 62 |
+
- **Error:** "Each task's score must be strictly between 0 and 1 (not 0.0 and not 1.0)"
|
| 63 |
+
- **Cause:** The hackathon validator requires scores in the open interval (0, 1). A perfect lint or test score returning exactly 1.0 (or 0.0 on failure) was triggering the range rejection.
|
| 64 |
+
- **Fix:** Added a `_clamp()` helper in `grader.py` and moved the default and baseline values in the other modules off exactly 0.0 and 1.0.
|
| 65 |
+
- `_SCORE_MIN = 0.001` (never exactly 0.0)
|
| 66 |
+
- `_SCORE_MAX = 0.999` (never exactly 1.0)
|
| 67 |
+
- **Compliance:** Every sub-score, reward, and final result is now guaranteed to be in the `[0.001, 0.999]` range.
|
| 68 |
+
|
| 69 |
---
|
| 70 |
|
| 71 |
## 🎯 What Makes TeamForge Different
|
environment.py
CHANGED
|
@@ -64,7 +64,7 @@ class TeamForgeEnv:
|
|
| 64 |
|
| 65 |
# Episode state
|
| 66 |
self._step_number = 0
|
| 67 |
-
self._cumulative_reward = 0.
|
| 68 |
self._plan: List[PlanStep] = []
|
| 69 |
self._reviews: List[ReviewArtifact] = []
|
| 70 |
self._reflections: List[ReflectionArtifact] = []
|
|
@@ -106,7 +106,7 @@ class TeamForgeEnv:
|
|
| 106 |
|
| 107 |
# Reset episode state
|
| 108 |
self._step_number = 0
|
| 109 |
-
self._cumulative_reward = 0.
|
| 110 |
self._plan = []
|
| 111 |
self._reviews = []
|
| 112 |
self._reflections = []
|
|
@@ -122,7 +122,7 @@ class TeamForgeEnv:
|
|
| 122 |
action_type=None,
|
| 123 |
status=ActionStatus.SUCCESS,
|
| 124 |
output="Environment initialized.",
|
| 125 |
-
reward=0.
|
| 126 |
done=False,
|
| 127 |
)
|
| 128 |
return self._obs
|
|
@@ -329,7 +329,7 @@ class TeamForgeEnv:
|
|
| 329 |
ln for ln in output.splitlines()
|
| 330 |
if re.match(r".+:\d+:\d+:", ln)
|
| 331 |
])
|
| 332 |
-
score = max(0.
|
| 333 |
self._last_lint_result = LintResult(
|
| 334 |
violations=violations,
|
| 335 |
output=output[:2000],
|
|
@@ -401,7 +401,7 @@ class TeamForgeEnv:
|
|
| 401 |
action_type=None,
|
| 402 |
status=ActionStatus.FAILURE,
|
| 403 |
output=reason,
|
| 404 |
-
reward=0.
|
| 405 |
done=True,
|
| 406 |
)
|
| 407 |
return self._obs
|
|
|
|
| 64 |
|
| 65 |
# Episode state
|
| 66 |
self._step_number = 0
|
| 67 |
+
self._cumulative_reward = 0.001
|
| 68 |
self._plan: List[PlanStep] = []
|
| 69 |
self._reviews: List[ReviewArtifact] = []
|
| 70 |
self._reflections: List[ReflectionArtifact] = []
|
|
|
|
| 106 |
|
| 107 |
# Reset episode state
|
| 108 |
self._step_number = 0
|
| 109 |
+
self._cumulative_reward = 0.001
|
| 110 |
self._plan = []
|
| 111 |
self._reviews = []
|
| 112 |
self._reflections = []
|
|
|
|
| 122 |
action_type=None,
|
| 123 |
status=ActionStatus.SUCCESS,
|
| 124 |
output="Environment initialized.",
|
| 125 |
+
reward=0.001,
|
| 126 |
done=False,
|
| 127 |
)
|
| 128 |
return self._obs
|
|
|
|
| 329 |
ln for ln in output.splitlines()
|
| 330 |
if re.match(r".+:\d+:\d+:", ln)
|
| 331 |
])
|
| 332 |
+
score = max(0.001, min(0.999, 1.0 - violations * 0.05))
|
| 333 |
self._last_lint_result = LintResult(
|
| 334 |
violations=violations,
|
| 335 |
output=output[:2000],
|
|
|
|
| 401 |
action_type=None,
|
| 402 |
status=ActionStatus.FAILURE,
|
| 403 |
output=reason,
|
| 404 |
+
reward=0.001,
|
| 405 |
done=True,
|
| 406 |
)
|
| 407 |
return self._obs
|
grader.py
CHANGED
|
@@ -28,6 +28,18 @@ from typing import List, Optional
|
|
| 28 |
from models import EpisodeResult, ReviewArtifact, ReflectionArtifact
|
| 29 |
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 32 |
# ANTI-EXPLOIT GUARDS
|
| 33 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -99,9 +111,7 @@ def score_tests(repo_path: str, timeout: int = 60) -> tuple[float, str]:
|
|
| 99 |
return 0.01, output
|
| 100 |
|
| 101 |
pass_rate = passed / total
|
| 102 |
-
|
| 103 |
-
pass_rate = max(0.1, min(0.9, pass_rate))
|
| 104 |
-
return pass_rate, output
|
| 105 |
|
| 106 |
|
| 107 |
def score_lint(repo_path: str) -> tuple[float, str]:
|
|
@@ -122,9 +132,8 @@ def score_lint(repo_path: str) -> tuple[float, str]:
|
|
| 122 |
if re.match(r".+:\d+:\d+:", ln)
|
| 123 |
])
|
| 124 |
# Stricter: -0.07 per violation (was 0.05), floor at 0.2 not 0
|
| 125 |
-
# Strictly (0, 1) - Safer interior
|
| 126 |
-
|
| 127 |
-
return score, output
|
| 128 |
|
| 129 |
|
| 130 |
def score_review_quality(
|
|
@@ -156,8 +165,8 @@ def score_review_quality(
|
|
| 156 |
code_words = re.findall(r'\b[a-z_]{3,}\(\)', combined)
|
| 157 |
specificity = min(0.1, len(set(code_words)) * 0.025)
|
| 158 |
|
| 159 |
-
# Strictly (0, 1) - Safer interior
|
| 160 |
-
return
|
| 161 |
|
| 162 |
|
| 163 |
def score_reflection_quality(reflections: List[ReflectionArtifact]) -> float:
|
|
@@ -177,17 +186,15 @@ def score_reflection_quality(reflections: List[ReflectionArtifact]) -> float:
|
|
| 177 |
depth = min(1.0, depth + 0.2)
|
| 178 |
total += depth
|
| 179 |
|
| 180 |
-
# Strictly (0, 1) - Safer interior
|
| 181 |
-
return
|
| 182 |
|
| 183 |
|
| 184 |
def score_efficiency(total_steps: int, max_steps: int) -> float:
|
| 185 |
"""Reward solving in fewer steps with smooth decay curve."""
|
| 186 |
ratio = total_steps / max_steps
|
| 187 |
-
#
|
| 188 |
-
|
| 189 |
-
# Strictly (0.1, 0.9)
|
| 190 |
-
return round(max(0.1, min(0.9, math.exp(-2.0 * max(0, ratio - 0.25)))), 4)
|
| 191 |
|
| 192 |
|
| 193 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -215,7 +222,7 @@ def grade_episode(
|
|
| 215 |
log.append("[GRADER] β TEST TAMPERING DETECTED β score zeroed")
|
| 216 |
return EpisodeResult(
|
| 217 |
task_id=task_id, total_steps=total_steps,
|
| 218 |
-
final_score=
|
| 219 |
log=log + ["Test files were trivially rewritten to force passes."],
|
| 220 |
)
|
| 221 |
|
|
@@ -223,7 +230,7 @@ def grade_episode(
|
|
| 223 |
log.append("[GRADER] β NO IMPLEMENTATION FOUND β score zeroed")
|
| 224 |
return EpisodeResult(
|
| 225 |
task_id=task_id, total_steps=total_steps,
|
| 226 |
-
final_score=
|
| 227 |
log=log + ["No non-test code was written."],
|
| 228 |
)
|
| 229 |
|
|
@@ -256,9 +263,8 @@ def grade_episode(
|
|
| 256 |
+ 0.10 * review_q
|
| 257 |
+ 0.05 * reflect_q
|
| 258 |
)
|
| 259 |
-
#
|
| 260 |
-
|
| 261 |
-
final = round(min(0.90, max(0.10, final)), 4)
|
| 262 |
log.append(f"[GRADER] FINAL_SCORE={final:.4f}")
|
| 263 |
|
| 264 |
return EpisodeResult(
|
|
|
|
| 28 |
from models import EpisodeResult, ReviewArtifact, ReflectionArtifact
|
| 29 |
|
| 30 |
|
| 31 |
+
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 32 |
+
# SCORING CONFIG
|
| 33 |
+
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 34 |
+
_SCORE_MIN = 0.001 # never exactly 0.0
|
| 35 |
+
_SCORE_MAX = 0.999 # never exactly 1.0
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def _clamp(score: float) -> float:
|
| 39 |
+
"""Ensure score is strictly within the open interval (0, 1)."""
|
| 40 |
+
return round(max(_SCORE_MIN, min(_SCORE_MAX, score)), 4)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
| 44 |
# ANTI-EXPLOIT GUARDS
|
| 45 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 111 |
return 0.01, output
|
| 112 |
|
| 113 |
pass_rate = passed / total
|
| 114 |
+
return _clamp(pass_rate), output
|
|
|
|
|
|
|
| 115 |
|
| 116 |
|
| 117 |
def score_lint(repo_path: str) -> tuple[float, str]:
|
|
|
|
| 132 |
if re.match(r".+:\d+:\d+:", ln)
|
| 133 |
])
|
| 134 |
# Stricter: -0.07 per violation (was 0.05); result is clamped to [0.001, 0.999]
|
| 135 |
+
# Strictly (0, 1) - Safer interior
|
| 136 |
+
return _clamp(1.0 - violations * 0.07), output
|
|
|
|
| 137 |
|
| 138 |
|
| 139 |
def score_review_quality(
|
|
|
|
| 165 |
code_words = re.findall(r'\b[a-z_]{3,}\(\)', combined)
|
| 166 |
specificity = min(0.1, len(set(code_words)) * 0.025)
|
| 167 |
|
| 168 |
+
# Strictly (0, 1) - Safer interior
|
| 169 |
+
return _clamp(kw_score * 0.7 + length_bonus + specificity)
|
| 170 |
|
| 171 |
|
| 172 |
def score_reflection_quality(reflections: List[ReflectionArtifact]) -> float:
|
|
|
|
| 186 |
depth = min(1.0, depth + 0.2)
|
| 187 |
total += depth
|
| 188 |
|
| 189 |
+
# Strictly (0, 1) - Safer interior
|
| 190 |
+
return _clamp(total / max(1, len(reflections)))
|
| 191 |
|
| 192 |
|
| 193 |
def score_efficiency(total_steps: int, max_steps: int) -> float:
|
| 194 |
"""Reward solving in fewer steps with smooth decay curve."""
|
| 195 |
ratio = total_steps / max_steps
|
| 196 |
+
# Strictly (0, 1) - Safer interior
|
| 197 |
+
return _clamp(math.exp(-2.0 * max(0, ratio - 0.25)))
|
|
|
|
|
|
|
| 198 |
|
| 199 |
|
| 200 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 222 |
log.append("[GRADER] β TEST TAMPERING DETECTED β score zeroed")
|
| 223 |
return EpisodeResult(
|
| 224 |
task_id=task_id, total_steps=total_steps,
|
| 225 |
+
final_score=_SCORE_MIN, passed=False,
|
| 226 |
log=log + ["Test files were trivially rewritten to force passes."],
|
| 227 |
)
|
| 228 |
|
|
|
|
| 230 |
log.append("[GRADER] β NO IMPLEMENTATION FOUND β score zeroed")
|
| 231 |
return EpisodeResult(
|
| 232 |
task_id=task_id, total_steps=total_steps,
|
| 233 |
+
final_score=_SCORE_MIN, passed=False,
|
| 234 |
log=log + ["No non-test code was written."],
|
| 235 |
)
|
| 236 |
|
|
|
|
| 263 |
+ 0.10 * review_q
|
| 264 |
+ 0.05 * reflect_q
|
| 265 |
)
|
| 266 |
+
# Strictly (0, 1) interior range to satisfy Phase 2 validator
|
| 267 |
+
final = _clamp(final)
|
|
|
|
| 268 |
log.append(f"[GRADER] FINAL_SCORE={final:.4f}")
|
| 269 |
|
| 270 |
return EpisodeResult(
|
inference.py
CHANGED
|
@@ -237,7 +237,7 @@ def run_episode(env: TeamForgeEnv, agent: Agent, task_id: str) -> Dict:
|
|
| 237 |
score = result.final_score
|
| 238 |
success = result.passed
|
| 239 |
|
| 240 |
-
rewards_str = ",".join(f"{r:.
|
| 241 |
|
| 242 |
# ββ [END] βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 243 |
# We use 2 decimal places to match common validator expectations,
|
|
|
|
| 237 |
score = result.final_score
|
| 238 |
success = result.passed
|
| 239 |
|
| 240 |
+
rewards_str = ",".join(f"{r:.3f}" for r in rewards) if rewards else "0.001"
|
| 241 |
|
| 242 |
# ββ [END] βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 243 |
# We use 2 decimal places to match common validator expectations,
|
models.py
CHANGED
|
@@ -129,7 +129,7 @@ class TestResult(BaseModel):
|
|
| 129 |
class LintResult(BaseModel):
|
| 130 |
violations: int = 0
|
| 131 |
output: str = ""
|
| 132 |
-
score: float = 0.
|
| 133 |
|
| 134 |
|
| 135 |
class ReviewArtifact(BaseModel):
|
|
@@ -175,8 +175,8 @@ class Observation(BaseModel):
|
|
| 175 |
reflections: List[ReflectionArtifact] = Field(default_factory=list)
|
| 176 |
|
| 177 |
# Signals
|
| 178 |
-
reward: float = 0.
|
| 179 |
-
cumulative_reward: float = 0.
|
| 180 |
done: bool = False
|
| 181 |
info: Dict[str, Any] = Field(default_factory=dict)
|
| 182 |
|
|
@@ -188,11 +188,11 @@ class Observation(BaseModel):
|
|
| 188 |
class EpisodeResult(BaseModel):
|
| 189 |
task_id: str
|
| 190 |
total_steps: int
|
| 191 |
-
test_pass_rate: float = 0.
|
| 192 |
-
lint_score: float = 0.
|
| 193 |
-
efficiency_score: float = 0.
|
| 194 |
-
review_quality: float = 0.
|
| 195 |
-
reflection_quality: float = 0.
|
| 196 |
-
final_score: float = 0.
|
| 197 |
passed: bool = False
|
| 198 |
log: List[str] = Field(default_factory=list)
|
|
|
|
| 129 |
class LintResult(BaseModel):
|
| 130 |
violations: int = 0
|
| 131 |
output: str = ""
|
| 132 |
+
score: float = 0.999 # 0.999 = clean
|
| 133 |
|
| 134 |
|
| 135 |
class ReviewArtifact(BaseModel):
|
|
|
|
| 175 |
reflections: List[ReflectionArtifact] = Field(default_factory=list)
|
| 176 |
|
| 177 |
# Signals
|
| 178 |
+
reward: float = 0.001
|
| 179 |
+
cumulative_reward: float = 0.001
|
| 180 |
done: bool = False
|
| 181 |
info: Dict[str, Any] = Field(default_factory=dict)
|
| 182 |
|
|
|
|
| 188 |
class EpisodeResult(BaseModel):
|
| 189 |
task_id: str
|
| 190 |
total_steps: int
|
| 191 |
+
test_pass_rate: float = 0.001
|
| 192 |
+
lint_score: float = 0.001
|
| 193 |
+
efficiency_score: float = 0.001
|
| 194 |
+
review_quality: float = 0.001
|
| 195 |
+
reflection_quality: float = 0.001
|
| 196 |
+
final_score: float = 0.001
|
| 197 |
passed: bool = False
|
| 198 |
log: List[str] = Field(default_factory=list)
|
openenv.yaml
CHANGED
|
@@ -116,7 +116,7 @@ observation_space:
|
|
| 116 |
|
| 117 |
# ββ Reward βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 118 |
reward:
|
| 119 |
-
range: [0.
|
| 120 |
type: dense
|
| 121 |
description: >
|
| 122 |
Dense shaped reward. Positive for: correct plan steps, edits, passing tests,
|
|
@@ -129,21 +129,21 @@ tasks:
|
|
| 129 |
max_steps: 20
|
| 130 |
description: "Fix an off-by-one bug in utils/list_ops.py. All 7 tests must pass."
|
| 131 |
grader: grader.grade_task
|
| 132 |
-
score_range: [0.
|
| 133 |
|
| 134 |
- id: medium_refactor_stats
|
| 135 |
difficulty: medium
|
| 136 |
max_steps: 30
|
| 137 |
description: "Refactor monolithic stats.py into a stats/ package. 15 tests must pass with full backward compatibility."
|
| 138 |
grader: grader.grade_task
|
| 139 |
-
score_range: [0.
|
| 140 |
|
| 141 |
- id: hard_lru_cache_performance
|
| 142 |
difficulty: hard
|
| 143 |
max_steps: 40
|
| 144 |
description: "Implement O(1) LRU cache from a stub. 15 correctness tests + 1 performance test (10k ops < 200ms)."
|
| 145 |
grader: grader.grade_task
|
| 146 |
-
score_range: [0.
|
| 147 |
|
| 148 |
# ββ Infrastructure βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 149 |
runtime:
|
|
|
|
| 116 |
|
| 117 |
# ββ Reward βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 118 |
reward:
|
| 119 |
+
range: [0.001, 0.999]
|
| 120 |
type: dense
|
| 121 |
description: >
|
| 122 |
Dense shaped reward. Positive for: correct plan steps, edits, passing tests,
|
|
|
|
| 129 |
max_steps: 20
|
| 130 |
description: "Fix an off-by-one bug in utils/list_ops.py. All 7 tests must pass."
|
| 131 |
grader: grader.grade_task
|
| 132 |
+
score_range: [0.001, 0.999]
|
| 133 |
|
| 134 |
- id: medium_refactor_stats
|
| 135 |
difficulty: medium
|
| 136 |
max_steps: 30
|
| 137 |
description: "Refactor monolithic stats.py into a stats/ package. 15 tests must pass with full backward compatibility."
|
| 138 |
grader: grader.grade_task
|
| 139 |
+
score_range: [0.001, 0.999]
|
| 140 |
|
| 141 |
- id: hard_lru_cache_performance
|
| 142 |
difficulty: hard
|
| 143 |
max_steps: 40
|
| 144 |
description: "Implement O(1) LRU cache from a stub. 15 correctness tests + 1 performance test (10k ops < 200ms)."
|
| 145 |
grader: grader.grade_task
|
| 146 |
+
score_range: [0.001, 0.999]
|
| 147 |
|
| 148 |
# ββ Infrastructure βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 149 |
runtime:
|
reward.py
CHANGED
|
@@ -31,12 +31,12 @@ TEST_PASS_BONUS_PER_TEST = 0.05
|
|
| 31 |
LINT_CLEAN_BONUS = 0.05
|
| 32 |
|
| 33 |
# Neutral/Small signals (replacing negative penalties to stay strictly in 0-1 range)
|
| 34 |
-
# We use 0.
|
| 35 |
-
ACTION_ERROR_REWARD = 0.
|
| 36 |
-
REPEATED_FAILURE_REWARD = 0.
|
| 37 |
-
STEP_BASE_REWARD = 0.
|
| 38 |
-
TEST_MODIFICATION_REWARD = 0.
|
| 39 |
-
LINT_VIOLATION_REWARD = 0.
|
| 40 |
|
| 41 |
|
| 42 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
|
@@ -95,7 +95,7 @@ class RewardCalculator:
|
|
| 95 |
"run_tests": 0.02,
|
| 96 |
"run_lint": 0.02,
|
| 97 |
"request_iteration": 0.02,
|
| 98 |
-
}.get(action_type, 0.
|
| 99 |
|
| 100 |
# ββ Test progress bonus ββ
|
| 101 |
if tests_passed is not None:
|
|
@@ -114,8 +114,8 @@ class RewardCalculator:
|
|
| 114 |
reward += abs(delta) * LINT_VIOLATION_REWARD
|
| 115 |
self._prev_lint_violations = lint_violations
|
| 116 |
|
| 117 |
-
# Final clamp to strictly within (0.
|
| 118 |
-
return round(max(0.
|
| 119 |
|
| 120 |
def _is_test_file(self, path: str) -> bool:
|
| 121 |
low = path.lower()
|
|
|
|
| 31 |
LINT_CLEAN_BONUS = 0.05
|
| 32 |
|
| 33 |
# Neutral/Small signals (replacing negative penalties to stay strictly in 0-1 range)
|
| 34 |
+
# We use 0.001 to satisfy "strictly between 0 and 1" requirement with high resolution
|
| 35 |
+
ACTION_ERROR_REWARD = 0.001
|
| 36 |
+
REPEATED_FAILURE_REWARD = 0.001
|
| 37 |
+
STEP_BASE_REWARD = 0.001
|
| 38 |
+
TEST_MODIFICATION_REWARD = 0.001
|
| 39 |
+
LINT_VIOLATION_REWARD = 0.001
|
| 40 |
|
| 41 |
|
| 42 |
# βββββββββββββββββββββββββββββββββββββββββββββ
|
|
|
|
| 95 |
"run_tests": 0.02,
|
| 96 |
"run_lint": 0.02,
|
| 97 |
"request_iteration": 0.02,
|
| 98 |
+
}.get(action_type, 0.001)
|
| 99 |
|
| 100 |
# ββ Test progress bonus ββ
|
| 101 |
if tests_passed is not None:
|
|
|
|
| 114 |
reward += abs(delta) * LINT_VIOLATION_REWARD
|
| 115 |
self._prev_lint_violations = lint_violations
|
| 116 |
|
| 117 |
+
# Final clamp to [0.001, 0.999] (strictly inside (0, 1)) per OpenEnv validator requirement
|
| 118 |
+
return round(max(0.001, min(0.999, reward)), 4)
|
| 119 |
|
| 120 |
def _is_test_file(self, path: str) -> bool:
|
| 121 |
low = path.lower()
|