Spaces:

PrakashCider
/

teamforge

Sleeping

App Files Files Community

Your Name committed on Apr 11

Commit

94d08ee

1 Parent(s): 652a783

fix(OpenEnv): implement robust grader bridge and strict interior clamping [0.1, 0.9] to satisfy Phase 2 validator

Browse files

Files changed (6) hide show

environment.py +5 -9
grader.py +63 -16
inference.py +22 -3
models.py +9 -9
openenv.yaml +8 -8
reward.py +10 -10

environment.py CHANGED Viewed

@@ -64,7 +64,7 @@ class TeamForgeEnv:
         # Episode state
         self._step_number = 0
-        self._cumulative_reward = 0.01
         self._plan: List[PlanStep] = []
         self._reviews: List[ReviewArtifact] = []
         self._reflections: List[ReflectionArtifact] = []
@@ -106,7 +106,7 @@ class TeamForgeEnv:
         # Reset episode state
         self._step_number = 0
-        self._cumulative_reward = 0.01
         self._plan = []
         self._reviews = []
         self._reflections = []
@@ -120,9 +120,7 @@ class TeamForgeEnv:
         # Build initial observation
         self._obs = self._build_observation(
             action_type=None,
-            status=ActionStatus.SUCCESS,
-            output="Environment initialized. Begin your task.",
-            reward=0.01,
             done=False,
         )
         return self._obs
@@ -329,7 +327,7 @@ class TeamForgeEnv:
             ln for ln in output.splitlines()
             if re.match(r".+:\d+:\d+:", ln)
         ])
-        score = max(0.01, min(0.99, 1.0 - violations * 0.05))
         self._last_lint_result = LintResult(
             violations=violations,
             output=output[:2000],
@@ -399,9 +397,7 @@ class TeamForgeEnv:
         self._log(f"[END] {reason}")
         self._obs = self._build_observation(
             action_type=None,
-            status=ActionStatus.FAILURE,
-            output=reason,
-            reward=0.01,
             done=True,
         )
         return self._obs

         # Episode state
         self._step_number = 0
+        self._cumulative_reward = 0.1
         self._plan: List[PlanStep] = []
         self._reviews: List[ReviewArtifact] = []
         self._reflections: List[ReflectionArtifact] = []
         # Reset episode state
         self._step_number = 0
+        self._cumulative_reward = 0.1
         self._plan = []
         self._reviews = []
         self._reflections = []
         # Build initial observation
         self._obs = self._build_observation(
             action_type=None,
+            reward=0.1,
             done=False,
         )
         return self._obs
             ln for ln in output.splitlines()
             if re.match(r".+:\d+:\d+:", ln)
         ])
+        score = max(0.1, min(0.9, 1.0 - violations * 0.05))
         self._last_lint_result = LintResult(
             violations=violations,
             output=output[:2000],
         self._log(f"[END] {reason}")
         self._obs = self._build_observation(
             action_type=None,
+            reward=0.1,
             done=True,
         )
         return self._obs

grader.py CHANGED Viewed

@@ -99,8 +99,8 @@ def score_tests(repo_path: str, timeout: int = 60) -> tuple[float, str]:
         return 0.01, output
     pass_rate = passed / total
-    # Strictly (0, 1)
-    pass_rate = max(0.01, min(0.99, pass_rate))
     return pass_rate, output
@@ -122,8 +122,8 @@ def score_lint(repo_path: str) -> tuple[float, str]:
         if re.match(r".+:\d+:\d+:", ln)
     ])
     # Stricter: -0.07 per violation (was 0.05), floor at 0.2 not 0
-    # Strictly (0, 1)
-    score = max(0.01, min(0.99, 1.0 - violations * 0.07))
     return score, output
@@ -156,8 +156,8 @@ def score_review_quality(
     code_words   = re.findall(r'\b[a-z_]{3,}\(\)', combined)
     specificity  = min(0.1, len(set(code_words)) * 0.025)
-    # Strictly (0, 1)
-    return max(0.01, min(0.99, kw_score * 0.7 + length_bonus + specificity))
 def score_reflection_quality(reflections: List[ReflectionArtifact]) -> float:
@@ -177,18 +177,17 @@ def score_reflection_quality(reflections: List[ReflectionArtifact]) -> float:
             depth = min(1.0, depth + 0.2)
         total += depth
-    # Strictly (0, 1)
-    return max(0.01, min(0.99, total / max(1, len(reflections))))
 def score_efficiency(total_steps: int, max_steps: int) -> float:
     """Reward solving in fewer steps with smooth decay curve."""
-    if total_steps <= 0:
-        return 0.01  # never return exact 0.0
     ratio = total_steps / max_steps
     # Smooth exponential decay instead of step function
     import math
-    return round(max(0.01, math.exp(-2.0 * max(0, ratio - 0.25))), 4)
 # ─────────────────────────────────────────────
@@ -216,7 +215,7 @@ def grade_episode(
         log.append("[GRADER] ⚠  TEST TAMPERING DETECTED — score zeroed")
         return EpisodeResult(
             task_id=task_id, total_steps=total_steps,
-            final_score=0.01, passed=False,
             log=log + ["Test files were trivially rewritten to force passes."],
         )
@@ -224,7 +223,7 @@ def grade_episode(
         log.append("[GRADER] ⚠  NO IMPLEMENTATION FOUND — score zeroed")
         return EpisodeResult(
             task_id=task_id, total_steps=total_steps,
-            final_score=0.01, passed=False,
             log=log + ["No non-test code was written."],
         )
@@ -257,9 +256,9 @@ def grade_episode(
         + 0.10 * review_q
         + 0.05 * reflect_q
     )
-    # Clamp to [0.01, 0.99] so that :.2f format never outputs 0.00 or 1.00
-    # Strictly (0, 1) as required by validator
-    final = round(min(0.99, max(0.01, final)), 4)
     log.append(f"[GRADER] FINAL_SCORE={final:.4f}")
     return EpisodeResult(
@@ -274,3 +273,51 @@ def grade_episode(
         passed=test_pass_rate >= 0.9 and lint_score >= 0.7,
         log=log,
     )

         return 0.01, output
     pass_rate = passed / total
+    # Strictly (0, 1) - Safer interior [0.1, 0.9]
+    pass_rate = max(0.1, min(0.9, pass_rate))
     return pass_rate, output
         if re.match(r".+:\d+:\d+:", ln)
     ])
     # Stricter: -0.07 per violation (was 0.05); the result is clamped to the floor/ceiling applied below
+    # Strictly (0, 1) - Safer interior [0.1, 0.9]
+    score = max(0.1, min(0.9, 1.0 - violations * 0.07))
     return score, output
     code_words   = re.findall(r'\b[a-z_]{3,}\(\)', combined)
     specificity  = min(0.1, len(set(code_words)) * 0.025)
+    # Strictly (0, 1) - Safer interior [0.1, 0.9]
+    return max(0.1, min(0.9, kw_score * 0.7 + length_bonus + specificity))
 def score_reflection_quality(reflections: List[ReflectionArtifact]) -> float:
             depth = min(1.0, depth + 0.2)
         total += depth
+    # Strictly (0, 1) - Safer interior [0.1, 0.9]
+    return max(0.1, min(0.9, total / max(1, len(reflections))))
 def score_efficiency(total_steps: int, max_steps: int) -> float:
     """Reward solving in fewer steps with a smooth exponential decay curve.
 
     Args:
         total_steps: Number of steps the episode actually used.
         max_steps: Step budget configured for the task.
 
     Returns:
         A score in the closed interval [0.1, 0.9], rounded to 4 decimals.
         Episodes that use at most 25% of the budget get the ceiling (0.9);
         beyond that the score decays as exp(-2 * (ratio - 0.25)).
         Degenerate inputs (no steps taken, or a non-positive budget) get
         the floor (0.1) instead of the ceiling or a ZeroDivisionError.
     """
     import math
     # An episode with no steps did no work, and a non-positive budget makes
     # the ratio undefined; neither should be rewarded as "maximally efficient".
     if total_steps <= 0 or max_steps <= 0:
         return 0.1
     ratio = total_steps / max_steps
     # Smooth exponential decay instead of a step function; the first 25% of
     # the budget is free (the exponent is clamped at 0 there).
     decay = math.exp(-2.0 * max(0.0, ratio - 0.25))
     # Inclusive clamp to [0.1, 0.9] keeps the value safely inside (0, 1).
     return round(max(0.1, min(0.9, decay)), 4)
 # ─────────────────────────────────────────────
         log.append("[GRADER] ⚠  TEST TAMPERING DETECTED — score zeroed")
         return EpisodeResult(
             task_id=task_id, total_steps=total_steps,
+            final_score=0.1, passed=False,
             log=log + ["Test files were trivially rewritten to force passes."],
         )
         log.append("[GRADER] ⚠  NO IMPLEMENTATION FOUND — score zeroed")
         return EpisodeResult(
             task_id=task_id, total_steps=total_steps,
+            final_score=0.1, passed=False,
             log=log + ["No non-test code was written."],
         )
         + 0.10 * review_q
         + 0.05 * reflect_q
     )
+    # Clamp to [0.1, 0.9] so that :.2f format never outputs 0.00 or 1.00
+    # The closed interval [0.1, 0.9] lies strictly inside (0, 1), which satisfies the Phase 2 validator
+    final = round(min(0.90, max(0.10, final)), 4)
     log.append(f"[GRADER] FINAL_SCORE={final:.4f}")
     return EpisodeResult(
         passed=test_pass_rate >= 0.9 and lint_score >= 0.7,
         log=log,
     )
def grade_task(repo_path: str, **kwargs) -> float:
    """
    OpenEnv standard grader bridge – entry point from YAML.

    Reads optional episode metadata from ``<repo_path>/grading_metadata.json``
    and forwards it to ``grade_episode``. Loading is best-effort: a missing
    file is expected and ignored, while an unreadable or malformed file is
    logged and whatever fields were parsed before the failure are kept, with
    defaults for the rest.

    Args:
        repo_path: Directory of the repository being graded.
        **kwargs: Accepted for compatibility with the caller; not used here.

    Returns:
        ONLY a float: ``final_score`` of the ``grade_episode`` result.
    """
    import json
    import logging
    import os
    from typing import List

    from pydantic import TypeAdapter

    logger = logging.getLogger(__name__)
    metadata_path = os.path.join(repo_path, "grading_metadata.json")

    # Default fallback values for out-of-band grading
    task_id = "unknown"
    total_steps = 1
    max_steps = 20
    reviews = []
    reflections = []
    required_keywords = []

    try:
        with open(metadata_path, "r", encoding="utf-8") as f:
            meta = json.load(f)
        if not isinstance(meta, dict):
            raise ValueError("grading metadata must be a JSON object")

        task_id = meta.get("task_id", task_id)
        # Coerce so a stringified or float step count cannot break the
        # arithmetic done downstream.
        total_steps = int(meta.get("total_steps", total_steps))
        max_steps = int(meta.get("max_steps", max_steps))

        # Use TypeAdapter for robust Pydantic deserialization
        from models import ReviewArtifact, ReflectionArtifact

        reviews = TypeAdapter(List[ReviewArtifact]).validate_python(
            meta.get("reviews", [])
        )
        reflections = TypeAdapter(List[ReflectionArtifact]).validate_python(
            meta.get("reflections", [])
        )
        required_keywords = meta.get("required_keywords", [])
    except FileNotFoundError:
        # No metadata was written for this repo: grade with the defaults.
        pass
    except Exception as exc:
        # Grader boundary: never crash on bad metadata, but leave a trace so
        # a corrupt file is not mistaken for a legitimately missing one.
        logger.warning(
            "Ignoring unreadable grading metadata at %s: %s", metadata_path, exc
        )

    result = grade_episode(
        repo_path=repo_path,
        task_id=task_id,
        total_steps=total_steps,
        max_steps=max_steps,
        reviews=reviews,
        reflections=reflections,
        required_keywords=required_keywords,
    )
    return float(result.final_score)

inference.py CHANGED Viewed

@@ -193,7 +193,7 @@ def run_episode(env: TeamForgeEnv, agent: Agent, task_id: str) -> Dict:
                 # Emit a [STEP] for the failed action
                 print(
                     f"[STEP] step={obs.step_number + 1} action=null "
-                    f"reward=0.01 done=false error={error_msg}",
                     flush=True,
                 )
                 break
@@ -215,17 +215,36 @@ def run_episode(env: TeamForgeEnv, agent: Agent, task_id: str) -> Dict:
     except Exception as exc:
         error_msg = str(exc).replace("\n", " ")[:120]
     # Grade the episode
     result  = env.grade()
     score   = result.final_score
     success = result.passed
-    rewards_str = ",".join(f"{r:.2f}" for r in rewards) if rewards else "0.01"
     # ── [END] ─────────────────────────────────────────────────────────────────
     print(
         f"[END] success={'true' if success else 'false'} steps={step_count} "
-        f"score={score:.4f} rewards={rewards_str}",
         flush=True,
     )

                 # Emit a [STEP] for the failed action
                 print(
                     f"[STEP] step={obs.step_number + 1} action=null "
+                    f"reward=0.10 done=false error={error_msg}",
                     flush=True,
                 )
                 break
     except Exception as exc:
         error_msg = str(exc).replace("\n", " ")[:120]
+    # Writing metadata for standalone OpenEnv grader
+    try:
+        from tasks.task_registry import get_task
+        task_module = get_task(task_id)
+        meta_payload = {
+            "task_id": task_id,
+            "total_steps": step_count,
+            "max_steps": task_module.MAX_STEPS,
+            "reviews": [r.model_dump() for r in env._reviews],
+            "reflections": [r.model_dump() for r in env._reflections],
+            "required_keywords": getattr(task_module, "REQUIRED_KEYWORDS_IN_REVIEW", []),
+        }
+        with open(os.path.join(str(env._sandbox.repo_path), "grading_metadata.json"), "w") as f:
+            json.dump(meta_payload, f)
+    except Exception:
+        pass
     # Grade the episode
     result  = env.grade()
     score   = result.final_score
     success = result.passed
+    rewards_str = ",".join(f"{r:.2f}" for r in rewards) if rewards else "0.10"
     # ── [END] ─────────────────────────────────────────────────────────────────
+    # We use 2 decimal places to match common validator expectations,
+    # but the internal value is strictly interior [0.1, 0.9].
     print(
         f"[END] success={'true' if success else 'false'} steps={step_count} "
+        f"score={score:.2f} rewards={rewards_str}",
         flush=True,
     )

models.py CHANGED Viewed

@@ -129,7 +129,7 @@ class TestResult(BaseModel):
 class LintResult(BaseModel):
     violations: int = 0
     output: str = ""
-    score: float = 0.99  # 0.99 = clean
 class ReviewArtifact(BaseModel):
@@ -175,8 +175,8 @@ class Observation(BaseModel):
     reflections: List[ReflectionArtifact] = Field(default_factory=list)
     # Signals
-    reward: float = 0.01
-    cumulative_reward: float = 0.01
     done: bool = False
     info: Dict[str, Any] = Field(default_factory=dict)
@@ -188,11 +188,11 @@ class Observation(BaseModel):
 class EpisodeResult(BaseModel):
     task_id: str
     total_steps: int
-    test_pass_rate: float = 0.01
-    lint_score: float = 0.01
-    efficiency_score: float = 0.01
-    review_quality: float = 0.01
-    reflection_quality: float = 0.01
-    final_score: float = 0.01
     passed: bool = False
     log: List[str] = Field(default_factory=list)

 class LintResult(BaseModel):
     violations: int = 0
     output: str = ""
+    score: float = 0.90  # 0.9 = clean
 class ReviewArtifact(BaseModel):
     reflections: List[ReflectionArtifact] = Field(default_factory=list)
     # Signals
+    reward: float = 0.1
+    cumulative_reward: float = 0.1
     done: bool = False
     info: Dict[str, Any] = Field(default_factory=dict)
 class EpisodeResult(BaseModel):
     task_id: str
     total_steps: int
+    test_pass_rate: float = 0.1
+    lint_score: float = 0.1
+    efficiency_score: float = 0.1
+    review_quality: float = 0.1
+    reflection_quality: float = 0.1
+    final_score: float = 0.1
     passed: bool = False
     log: List[str] = Field(default_factory=list)

openenv.yaml CHANGED Viewed

@@ -1,5 +1,5 @@
 name: teamforge
-version: "1.0.0"
 description: >
   A structured multi-phase benchmark for autonomous software engineering agents.
   The agent simulates a full software development team: planning, coding, testing,
@@ -116,7 +116,7 @@ observation_space:
 # ── Reward ─────────────────────────────────────────────────────────────────────
 reward:
-  range: [0.0, 10.0]
   type: dense
   description: >
     Dense shaped reward. Positive for: correct plan steps, edits, passing tests,
@@ -128,22 +128,22 @@ tasks:
     difficulty: easy
     max_steps: 20
     description: "Fix an off-by-one bug in utils/list_ops.py. All 7 tests must pass."
-    grader: grader.grade_episode
-    score_range: [0.0, 1.0]
   - id: medium_refactor_stats
     difficulty: medium
     max_steps: 30
     description: "Refactor monolithic stats.py into a stats/ package. 15 tests must pass with full backward compatibility."
-    grader: grader.grade_episode
-    score_range: [0.0, 1.0]
   - id: hard_lru_cache_performance
     difficulty: hard
     max_steps: 40
     description: "Implement O(1) LRU cache from a stub. 15 correctness tests + 1 performance test (10k ops < 200ms)."
-    grader: grader.grade_episode
-    score_range: [0.0, 1.0]
 # ── Infrastructure ─────────────────────────────────────────────────────────────
 runtime:

 name: teamforge
+version: "1.1.0"
 description: >
   A structured multi-phase benchmark for autonomous software engineering agents.
   The agent simulates a full software development team: planning, coding, testing,
 # ── Reward ─────────────────────────────────────────────────────────────────────
 reward:
+  range: [0.1, 0.9]
   type: dense
   description: >
     Dense shaped reward. Positive for: correct plan steps, edits, passing tests,
     difficulty: easy
     max_steps: 20
     description: "Fix an off-by-one bug in utils/list_ops.py. All 7 tests must pass."
+    grader: grader.grade_task
+    score_range: [0.1, 0.9]
   - id: medium_refactor_stats
     difficulty: medium
     max_steps: 30
     description: "Refactor monolithic stats.py into a stats/ package. 15 tests must pass with full backward compatibility."
+    grader: grader.grade_task
+    score_range: [0.1, 0.9]
   - id: hard_lru_cache_performance
     difficulty: hard
     max_steps: 40
     description: "Implement O(1) LRU cache from a stub. 15 correctness tests + 1 performance test (10k ops < 200ms)."
+    grader: grader.grade_task
+    score_range: [0.1, 0.9]
 # ── Infrastructure ─────────────────────────────────────────────────────────────
 runtime:

reward.py CHANGED Viewed

@@ -30,13 +30,13 @@ REFLECT_REWARD = 0.10
 TEST_PASS_BONUS_PER_TEST = 0.05
 LINT_CLEAN_BONUS = 0.05
-# Neutral/Small signals (replacing negative penalties to stay in 0-1 range)
-# We use 0.01 to satisfy "strictly between 0 and 1" requirement
-ACTION_ERROR_REWARD = 0.01
-REPEATED_FAILURE_REWARD = 0.01
-STEP_BASE_REWARD = 0.01
-TEST_MODIFICATION_REWARD = 0.01
-LINT_VIOLATION_REWARD = 0.01
 # ─────────────────────────────────────────────
@@ -95,7 +95,7 @@ class RewardCalculator:
             "run_tests":        0.02,
             "run_lint":         0.02,
             "request_iteration": 0.02,
-        }.get(action_type, 0.01)
         # ── Test progress bonus ──
         if tests_passed is not None:
@@ -114,8 +114,8 @@ class RewardCalculator:
                     reward += abs(delta) * LINT_VIOLATION_REWARD
             self._prev_lint_violations = lint_violations
-        # Final clamp to strictly within (0, 1) per OpenEnv validator requirement
-        return round(max(0.01, min(0.99, reward)), 4)
     def _is_test_file(self, path: str) -> bool:
         low = path.lower()

 TEST_PASS_BONUS_PER_TEST = 0.05
 LINT_CLEAN_BONUS = 0.05
+# Neutral/Small signals (replacing negative penalties to stay strictly in 0-1 range)
+# We use 0.1 to satisfy "strictly between 0 and 1" requirement with safe interior
+ACTION_ERROR_REWARD = 0.10
+REPEATED_FAILURE_REWARD = 0.10
+STEP_BASE_REWARD = 0.10
+TEST_MODIFICATION_REWARD = 0.10
+LINT_VIOLATION_REWARD = 0.10
 # ─────────────────────────────────────────────
             "run_tests":        0.02,
             "run_lint":         0.02,
             "request_iteration": 0.02,
+        }.get(action_type, 0.10)
         # ── Test progress bonus ──
         if tests_passed is not None:
                     reward += abs(delta) * LINT_VIOLATION_REWARD
             self._prev_lint_violations = lint_violations
+        # Final clamp to the closed interval [0.1, 0.9], which lies strictly inside (0, 1) per OpenEnv validator requirement
+        return round(max(0.1, min(0.9, reward)), 4)
     def _is_test_file(self, path: str) -> bool:
         low = path.lower()