Spaces:

PrakashCider
/

teamforge

Sleeping

App Files Files Community

Your Name committed on Apr 12

Commit

efa2d2a

1 Parent(s): 4f893da

fix(OpenEnv): implement system-wide [0.1, 0.9] boundary scrub for Phase 2 compliance

Browse files

Files changed (6) hide show

environment.py +2 -2
grader.py +5 -5
models.py +9 -9
openenv.yaml +7 -0
reward.py +10 -10
tasks/task_registry.py +1 -0

environment.py CHANGED Viewed

@@ -106,7 +106,7 @@ class TeamForgeEnv:
         # Reset episode state
         self._step_number = 0
-        self._cumulative_reward = 0.001
         self._plan = []
         self._reviews = []
         self._reflections = []
@@ -122,7 +122,7 @@ class TeamForgeEnv:
             action_type=None,
             status=ActionStatus.SUCCESS,
             output="Environment initialized.",
-            reward=0.001,
             done=False,
         )
         return self._obs

         # Reset episode state
         self._step_number = 0
+        self._cumulative_reward = 0.1
         self._plan = []
         self._reviews = []
         self._reflections = []
             action_type=None,
             status=ActionStatus.SUCCESS,
             output="Environment initialized.",
+            reward=0.1,
             done=False,
         )
         return self._obs

grader.py CHANGED Viewed

@@ -132,9 +132,9 @@ def score_lint(repo_path: str) -> tuple[float, str]:
         ln for ln in output.splitlines()
         if re.match(r".+:\d+:\d+:", ln)
     ])
-    # Stricter: -0.07 per violation (was 0.05), floor at 0.2 not 0
-    # Strictly (0, 1) - Safer interior
-    return _clamp(1.0 - violations * 0.07), output
 def score_review_quality(
@@ -149,7 +149,7 @@ def score_review_quality(
     # Anti-exploit: minimum meaningful length
     if len(combined.strip()) < 40:
-        return 0.05
     # Keyword coverage
     if not required_keywords:
@@ -183,7 +183,7 @@ def score_reflection_quality(reflections: List[ReflectionArtifact]) -> float:
             depth += 0.5
         # Bonus if adjusted_plan provided
         if ref.adjusted_plan and len(ref.adjusted_plan.strip()) > 20:
-            depth = min(1.0, depth + 0.2)
         total += depth
     # Strictly (0, 1) - Safer interior

         ln for ln in output.splitlines()
         if re.match(r".+:\d+:\d+:", ln)
     ])
+    # Strictly interior [SCORE_MIN, SCORE_MAX]
+    raw_score = 1.0 - violations * 0.07
+    return _clamp(raw_score), output
 def score_review_quality(
     # Anti-exploit: minimum meaningful length
     if len(combined.strip()) < 40:
+        return _SCORE_MIN + 0.05
     # Keyword coverage
     if not required_keywords:
             depth += 0.5
         # Bonus if adjusted_plan provided
         if ref.adjusted_plan and len(ref.adjusted_plan.strip()) > 20:
+            depth = min(0.9, depth + 0.2)
         total += depth
     # Strictly (0, 1) - Safer interior

models.py CHANGED Viewed

@@ -129,7 +129,7 @@ class TestResult(BaseModel):
 class LintResult(BaseModel):
     violations: int = 0
     output: str = ""
-    score: float = 0.999  # 0.999 = clean
 class ReviewArtifact(BaseModel):
@@ -175,8 +175,8 @@ class Observation(BaseModel):
     reflections: List[ReflectionArtifact] = Field(default_factory=list)
     # Signals
-    reward: float = 0.001
-    cumulative_reward: float = 0.001
     done: bool = False
     info: Dict[str, Any] = Field(default_factory=dict)
@@ -188,11 +188,11 @@ class Observation(BaseModel):
 class EpisodeResult(BaseModel):
     task_id: str
     total_steps: int
-    test_pass_rate: float = 0.001
-    lint_score: float = 0.001
-    efficiency_score: float = 0.001
-    review_quality: float = 0.001
-    reflection_quality: float = 0.001
-    final_score: float = 0.001
     passed: bool = False
     log: List[str] = Field(default_factory=list)

 class LintResult(BaseModel):
     violations: int = 0
     output: str = ""
+    score: float = 0.9  # 0.9 = clean
 class ReviewArtifact(BaseModel):
     reflections: List[ReflectionArtifact] = Field(default_factory=list)
     # Signals
+    reward: float = 0.1
+    cumulative_reward: float = 0.1
     done: bool = False
     info: Dict[str, Any] = Field(default_factory=dict)
 class EpisodeResult(BaseModel):
     task_id: str
     total_steps: int
+    test_pass_rate: float = 0.1
+    lint_score: float = 0.1
+    efficiency_score: float = 0.1
+    review_quality: float = 0.1
+    reflection_quality: float = 0.1
+    final_score: float = 0.1
     passed: bool = False
     log: List[str] = Field(default_factory=list)

openenv.yaml CHANGED Viewed

@@ -144,6 +144,13 @@ tasks:
     description: "Implement O(1) LRU cache from a stub. 15 correctness tests + 1 performance test (10k ops < 200ms)."
     grader: grader.grade_task
     score_range: [0.0, 1.0]
 # ── Infrastructure ─────────────────────────────────────────────────────────────
 runtime:

     description: "Implement O(1) LRU cache from a stub. 15 correctness tests + 1 performance test (10k ops < 200ms)."
     grader: grader.grade_task
     score_range: [0.0, 1.0]
+  - id: bonus_task
+    difficulty: bonus
+    max_steps: 10
+    description: "Bonus: Optimize the LRU cache for memory efficiency. Gradual memory reduction is rewarded."
+    grader: grader.grade_task
+    score_range: [0.0, 1.0]
 # ── Infrastructure ─────────────────────────────────────────────────────────────
 runtime:

reward.py CHANGED Viewed

@@ -30,13 +30,13 @@ REFLECT_REWARD = 0.10
 TEST_PASS_BONUS_PER_TEST = 0.05
 LINT_CLEAN_BONUS = 0.05
-# Neutral/Small signals (replacing negative penalties to stay strictly in 0-1 range)
-# We use 0.001 to satisfy "strictly between 0 and 1" requirement with high resolution
-ACTION_ERROR_REWARD = 0.001
-REPEATED_FAILURE_REWARD = 0.001
-STEP_BASE_REWARD = 0.001
-TEST_MODIFICATION_REWARD = 0.001
-LINT_VIOLATION_REWARD = 0.001
 # ─────────────────────────────────────────────
@@ -95,7 +95,7 @@ class RewardCalculator:
             "run_tests":        0.02,
             "run_lint":         0.02,
             "request_iteration": 0.02,
-        }.get(action_type, 0.001)
         # ── Test progress bonus ──
         if tests_passed is not None:
@@ -114,8 +114,8 @@ class RewardCalculator:
                     reward += abs(delta) * LINT_VIOLATION_REWARD
             self._prev_lint_violations = lint_violations
-        # Final clamp to strictly within (0.001, 0.999) per OpenEnv validator requirement
-        return round(max(0.001, min(0.999, reward)), 4)
     def _is_test_file(self, path: str) -> bool:
         low = path.lower()

 TEST_PASS_BONUS_PER_TEST = 0.05
 LINT_CLEAN_BONUS = 0.05
+# Neutral/small signals (replacing negative penalties so every reward stays within the [0.1, 0.9] range)
+# We use 0.1 to satisfy the "strictly between 0 and 1" requirement with a wide safety margin against rounding
+ACTION_ERROR_REWARD = 0.1
+REPEATED_FAILURE_REWARD = 0.1
+STEP_BASE_REWARD = 0.1
+TEST_MODIFICATION_REWARD = 0.1
+LINT_VIOLATION_REWARD = 0.1
 # ─────────────────────────────────────────────
             "run_tests":        0.02,
             "run_lint":         0.02,
             "request_iteration": 0.02,
+        }.get(action_type, 0.1)
         # ── Test progress bonus ──
         if tests_passed is not None:
                     reward += abs(delta) * LINT_VIOLATION_REWARD
             self._prev_lint_violations = lint_violations
+        # Final clamp to the closed interval [0.1, 0.9], which lies strictly inside (0, 1) per OpenEnv validator requirement
+        return round(max(0.1, min(0.9, reward)), 4)
     def _is_test_file(self, path: str) -> bool:
         low = path.lower()

tasks/task_registry.py CHANGED Viewed

@@ -7,6 +7,7 @@ TASK_REGISTRY: Dict[str, Any] = {
     easy_task.TASK_ID:   easy_task,
     medium_task.TASK_ID: medium_task,
     hard_task.TASK_ID:   hard_task,
 }
 # The 3 scored tasks for the hackathon (easy, medium, hard)

     easy_task.TASK_ID:   easy_task,
     medium_task.TASK_ID: medium_task,
     hard_task.TASK_ID:   hard_task,
+    bonus_task.TASK_ID:  bonus_task,
 }
 # The 3 scored tasks for the hackathon (easy, medium, hard)