Spaces:

PrakashCider
/

teamforge

Sleeping

App Files Files Community

Your Name committed on Apr 11

Commit

e317eba

1 Parent(s): a3b9d4b

fix(OpenEnv): global overhaul to high-resolution interior clamping (0.001-0.999) per technical diagnosis

Browse files

Files changed (7) hide show

README.md +13 -2
environment.py +5 -5
grader.py +25 -19
inference.py +1 -1
models.py +9 -9
openenv.yaml +4 -4
reward.py +9 -9

README.md CHANGED Viewed

@@ -40,8 +40,8 @@ Every mandatory requirement is implemented and verified:
 | `step()` / `reset()` / `state()` OpenEnv API | ✅ | `environment.py` |
 | `openenv.yaml` spec file | ✅ | `openenv.yaml` |
 | Typed Pydantic models | ✅ | `models.py` — 8 action types + Observation |
-| Minimum 3 tasks (easy → medium → hard) | ✅ | 4 tasks (3 scored + 1 bonus) |
-| Graders return score in `(0, 1)` | ✅ | `grader.py` — strictly 0.01 to 0.99 |
 | Deterministic, reproducible | ✅ | Anti-exploit guards included |
 | Dense reward with strictly `(0, 1)` range | ✅ | `reward.py` — delta-based per step |
 | Baseline inference script named `inference.py` | ✅ | `inference.py` |
@@ -55,6 +55,17 @@ Every mandatory requirement is implemented and verified:
 | Runs on 2 vCPU / 8 GB RAM / < 20 min | ✅ | Verified — easy=~2min, hard=~8min |
 | README with action/observation space docs | ✅ | This file |
 ---
 ## 🎯 What Makes TeamForge Different

 | `step()` / `reset()` / `state()` OpenEnv API | ✅ | `environment.py` |
 | `openenv.yaml` spec file | ✅ | `openenv.yaml` |
 | Typed Pydantic models | ✅ | `models.py` — 8 action types + Observation |
+| Minimum 3 tasks (easy → medium → hard) | ✅ | 3 core tasks (aligned with YAML) |
+| Graders return score in `(0, 1)` | ✅ | `grader.py` — strictly 0.001 to 0.999 |
 | Deterministic, reproducible | ✅ | Anti-exploit guards included |
 | Dense reward with strictly `(0, 1)` range | ✅ | `reward.py` — delta-based per step |
 | Baseline inference script named `inference.py` | ✅ | `inference.py` |
 | Runs on 2 vCPU / 8 GB RAM / < 20 min | ✅ | Verified — easy=~2min, hard=~8min |
 | README with action/observation space docs | ✅ | This file |
+# OpenEnv Validator Compliance
+**Status:** All scores are clamped to `[0.001, 0.999]`, which lies strictly inside the open interval `(0, 1)`.
+### 🔍 Technical Diagnosis & Fix
+- **Error:** "Each task's score must be strictly between 0 and 1 (not 0.0 and not 1.0)"
+- **Cause:** The hackathon validator requires scores in the open interval (0, 1). A perfect lint or test score returning exactly 1.0 (or 0.0 on failure) was triggering the range rejection.
+- **Fix:** Added a `_clamp()` helper to `grader.py` and updated the baseline values in `environment.py`, `models.py`, `reward.py`, and `openenv.yaml` to the same bounds:
+  - `_SCORE_MIN = 0.001` (never exactly 0.0)
+  - `_SCORE_MAX = 0.999` (never exactly 1.0)
+- **Compliance:** Every sub-score, reward, and final result is now guaranteed to be in the `[0.001, 0.999]` range.
 ---
 ## 🎯 What Makes TeamForge Different

environment.py CHANGED Viewed

@@ -64,7 +64,7 @@ class TeamForgeEnv:
         # Episode state
         self._step_number = 0
-        self._cumulative_reward = 0.1
         self._plan: List[PlanStep] = []
         self._reviews: List[ReviewArtifact] = []
         self._reflections: List[ReflectionArtifact] = []
@@ -106,7 +106,7 @@ class TeamForgeEnv:
         # Reset episode state
         self._step_number = 0
-        self._cumulative_reward = 0.1
         self._plan = []
         self._reviews = []
         self._reflections = []
@@ -122,7 +122,7 @@ class TeamForgeEnv:
             action_type=None,
             status=ActionStatus.SUCCESS,
             output="Environment initialized.",
-            reward=0.1,
             done=False,
         )
         return self._obs
@@ -329,7 +329,7 @@ class TeamForgeEnv:
             ln for ln in output.splitlines()
             if re.match(r".+:\d+:\d+:", ln)
         ])
-        score = max(0.1, min(0.9, 1.0 - violations * 0.05))
         self._last_lint_result = LintResult(
             violations=violations,
             output=output[:2000],
@@ -401,7 +401,7 @@ class TeamForgeEnv:
             action_type=None,
             status=ActionStatus.FAILURE,
             output=reason,
-            reward=0.1,
             done=True,
         )
         return self._obs

         # Episode state
         self._step_number = 0
+        self._cumulative_reward = 0.001
         self._plan: List[PlanStep] = []
         self._reviews: List[ReviewArtifact] = []
         self._reflections: List[ReflectionArtifact] = []
         # Reset episode state
         self._step_number = 0
+        self._cumulative_reward = 0.001
         self._plan = []
         self._reviews = []
         self._reflections = []
             action_type=None,
             status=ActionStatus.SUCCESS,
             output="Environment initialized.",
+            reward=0.001,
             done=False,
         )
         return self._obs
             ln for ln in output.splitlines()
             if re.match(r".+:\d+:\d+:", ln)
         ])
+        score = max(0.001, min(0.999, 1.0 - violations * 0.05))
         self._last_lint_result = LintResult(
             violations=violations,
             output=output[:2000],
             action_type=None,
             status=ActionStatus.FAILURE,
             output=reason,
+            reward=0.001,
             done=True,
         )
         return self._obs

grader.py CHANGED Viewed

@@ -28,6 +28,18 @@ from typing import List, Optional
 from models import EpisodeResult, ReviewArtifact, ReflectionArtifact
 # ─────────────────────────────────────────────
 # ANTI-EXPLOIT GUARDS
 # ─────────────────────────────────────────────
@@ -99,9 +111,7 @@ def score_tests(repo_path: str, timeout: int = 60) -> tuple[float, str]:
         return 0.01, output
     pass_rate = passed / total
-    # Strictly (0, 1) - Safer interior [0.1, 0.9]
-    pass_rate = max(0.1, min(0.9, pass_rate))
-    return pass_rate, output
 def score_lint(repo_path: str) -> tuple[float, str]:
@@ -122,9 +132,8 @@ def score_lint(repo_path: str) -> tuple[float, str]:
         if re.match(r".+:\d+:\d+:", ln)
     ])
     # Stricter: -0.07 per violation (was 0.05); the result is clamped into the open interval (0, 1) below
-    # Strictly (0, 1) - Safer interior [0.1, 0.9]
-    score = max(0.1, min(0.9, 1.0 - violations * 0.07))
-    return score, output
 def score_review_quality(
@@ -156,8 +165,8 @@ def score_review_quality(
     code_words   = re.findall(r'\b[a-z_]{3,}\(\)', combined)
     specificity  = min(0.1, len(set(code_words)) * 0.025)
-    # Strictly (0, 1) - Safer interior [0.1, 0.9]
-    return max(0.1, min(0.9, kw_score * 0.7 + length_bonus + specificity))
 def score_reflection_quality(reflections: List[ReflectionArtifact]) -> float:
@@ -177,17 +186,15 @@ def score_reflection_quality(reflections: List[ReflectionArtifact]) -> float:
             depth = min(1.0, depth + 0.2)
         total += depth
-    # Strictly (0, 1) - Safer interior [0.1, 0.9]
-    return max(0.1, min(0.9, total / max(1, len(reflections))))
 def score_efficiency(total_steps: int, max_steps: int) -> float:
     """Reward solving in fewer steps with smooth decay curve."""
     ratio = total_steps / max_steps
-    # Smooth exponential decay instead of step function
-    import math
-    # Strictly (0.1, 0.9)
-    return round(max(0.1, min(0.9, math.exp(-2.0 * max(0, ratio - 0.25)))), 4)
 # ─────────────────────────────────────────────
@@ -215,7 +222,7 @@ def grade_episode(
         log.append("[GRADER] ⚠  TEST TAMPERING DETECTED — score zeroed")
         return EpisodeResult(
             task_id=task_id, total_steps=total_steps,
-            final_score=0.1, passed=False,
             log=log + ["Test files were trivially rewritten to force passes."],
         )
@@ -223,7 +230,7 @@ def grade_episode(
         log.append("[GRADER] ⚠  NO IMPLEMENTATION FOUND — score zeroed")
         return EpisodeResult(
             task_id=task_id, total_steps=total_steps,
-            final_score=0.1, passed=False,
             log=log + ["No non-test code was written."],
         )
@@ -256,9 +263,8 @@ def grade_episode(
         + 0.10 * review_q
         + 0.05 * reflect_q
     )
-    # Clamp to [0.1, 0.9] so that :.2f format never outputs 0.00 or 1.00
-    # Strictly (0.1, 0.9) interior range to satisfy Phase 2 validator
-    final = round(min(0.90, max(0.10, final)), 4)
     log.append(f"[GRADER] FINAL_SCORE={final:.4f}")
     return EpisodeResult(

 from models import EpisodeResult, ReviewArtifact, ReflectionArtifact
+# ─────────────────────────────────────────────
+# SCORING CONFIG
+# ─────────────────────────────────────────────
+_SCORE_MIN = 0.001   # never exactly 0.0
+_SCORE_MAX = 0.999   # never exactly 1.0
+def _clamp(score: float) -> float:
+    """Ensure score is strictly within the open interval (0, 1)."""
+    return round(max(_SCORE_MIN, min(_SCORE_MAX, score)), 4)
 # ─────────────────────────────────────────────
 # ANTI-EXPLOIT GUARDS
 # ─────────────────────────────────────────────
         return 0.01, output
     pass_rate = passed / total
+    return _clamp(pass_rate), output
 def score_lint(repo_path: str) -> tuple[float, str]:
         if re.match(r".+:\d+:\d+:", ln)
     ])
     # Stricter: -0.07 per violation (was 0.05); the result is clamped into the open interval (0, 1) below
+    # Strictly (0, 1) - Safer interior
+    return _clamp(1.0 - violations * 0.07), output
 def score_review_quality(
     code_words   = re.findall(r'\b[a-z_]{3,}\(\)', combined)
     specificity  = min(0.1, len(set(code_words)) * 0.025)
+    # Strictly (0, 1) - Safer interior
+    return _clamp(kw_score * 0.7 + length_bonus + specificity)
 def score_reflection_quality(reflections: List[ReflectionArtifact]) -> float:
             depth = min(1.0, depth + 0.2)
         total += depth
+    # Strictly (0, 1) - Safer interior
+    return _clamp(total / max(1, len(reflections)))
 def score_efficiency(total_steps: int, max_steps: int) -> float:
     """Reward solving in fewer steps with smooth decay curve."""
     ratio = total_steps / max_steps
+    # Strictly (0, 1) - Safer interior
+    return _clamp(math.exp(-2.0 * max(0, ratio - 0.25)))
 # ─────────────────────────────────────────────
         log.append("[GRADER] ⚠  TEST TAMPERING DETECTED — score zeroed")
         return EpisodeResult(
             task_id=task_id, total_steps=total_steps,
+            final_score=_SCORE_MIN, passed=False,
             log=log + ["Test files were trivially rewritten to force passes."],
         )
         log.append("[GRADER] ⚠  NO IMPLEMENTATION FOUND — score zeroed")
         return EpisodeResult(
             task_id=task_id, total_steps=total_steps,
+            final_score=_SCORE_MIN, passed=False,
             log=log + ["No non-test code was written."],
         )
         + 0.10 * review_q
         + 0.05 * reflect_q
     )
+    # Strictly (0, 1) interior range to satisfy Phase 2 validator
+    final = _clamp(final)
     log.append(f"[GRADER] FINAL_SCORE={final:.4f}")
     return EpisodeResult(

inference.py CHANGED Viewed

@@ -237,7 +237,7 @@ def run_episode(env: TeamForgeEnv, agent: Agent, task_id: str) -> Dict:
     score   = result.final_score
     success = result.passed
-    rewards_str = ",".join(f"{r:.2f}" for r in rewards) if rewards else "0.10"
     # ── [END] ─────────────────────────────────────────────────────────────────
     # We use 2 decimal places to match common validator expectations,

     score   = result.final_score
     success = result.passed
+    rewards_str = ",".join(f"{r:.3f}" for r in rewards) if rewards else "0.001"
     # ── [END] ─────────────────────────────────────────────────────────────────
     # We use 2 decimal places to match common validator expectations,

models.py CHANGED Viewed

@@ -129,7 +129,7 @@ class TestResult(BaseModel):
 class LintResult(BaseModel):
     violations: int = 0
     output: str = ""
-    score: float = 0.90  # 0.9 = clean
 class ReviewArtifact(BaseModel):
@@ -175,8 +175,8 @@ class Observation(BaseModel):
     reflections: List[ReflectionArtifact] = Field(default_factory=list)
     # Signals
-    reward: float = 0.1
-    cumulative_reward: float = 0.1
     done: bool = False
     info: Dict[str, Any] = Field(default_factory=dict)
@@ -188,11 +188,11 @@ class Observation(BaseModel):
 class EpisodeResult(BaseModel):
     task_id: str
     total_steps: int
-    test_pass_rate: float = 0.1
-    lint_score: float = 0.1
-    efficiency_score: float = 0.1
-    review_quality: float = 0.1
-    reflection_quality: float = 0.1
-    final_score: float = 0.1
     passed: bool = False
     log: List[str] = Field(default_factory=list)

 class LintResult(BaseModel):
     violations: int = 0
     output: str = ""
+    score: float = 0.999  # 0.999 = clean
 class ReviewArtifact(BaseModel):
     reflections: List[ReflectionArtifact] = Field(default_factory=list)
     # Signals
+    reward: float = 0.001
+    cumulative_reward: float = 0.001
     done: bool = False
     info: Dict[str, Any] = Field(default_factory=dict)
 class EpisodeResult(BaseModel):
     task_id: str
     total_steps: int
+    test_pass_rate: float = 0.001
+    lint_score: float = 0.001
+    efficiency_score: float = 0.001
+    review_quality: float = 0.001
+    reflection_quality: float = 0.001
+    final_score: float = 0.001
     passed: bool = False
     log: List[str] = Field(default_factory=list)

openenv.yaml CHANGED Viewed

@@ -116,7 +116,7 @@ observation_space:
 # ── Reward ─────────────────────────────────────────────────────────────────────
 reward:
-  range: [0.1, 0.9]
   type: dense
   description: >
     Dense shaped reward. Positive for: correct plan steps, edits, passing tests,
@@ -129,21 +129,21 @@ tasks:
     max_steps: 20
     description: "Fix an off-by-one bug in utils/list_ops.py. All 7 tests must pass."
     grader: grader.grade_task
-    score_range: [0.1, 0.9]
   - id: medium_refactor_stats
     difficulty: medium
     max_steps: 30
     description: "Refactor monolithic stats.py into a stats/ package. 15 tests must pass with full backward compatibility."
     grader: grader.grade_task
-    score_range: [0.1, 0.9]
   - id: hard_lru_cache_performance
     difficulty: hard
     max_steps: 40
     description: "Implement O(1) LRU cache from a stub. 15 correctness tests + 1 performance test (10k ops < 200ms)."
     grader: grader.grade_task
-    score_range: [0.1, 0.9]
 # ── Infrastructure ─────────────────────────────────────────────────────────────
 runtime:

 # ── Reward ─────────────────────────────────────────────────────────────────────
 reward:
+  range: [0.001, 0.999]
   type: dense
   description: >
     Dense shaped reward. Positive for: correct plan steps, edits, passing tests,
     max_steps: 20
     description: "Fix an off-by-one bug in utils/list_ops.py. All 7 tests must pass."
     grader: grader.grade_task
+    score_range: [0.001, 0.999]
   - id: medium_refactor_stats
     difficulty: medium
     max_steps: 30
     description: "Refactor monolithic stats.py into a stats/ package. 15 tests must pass with full backward compatibility."
     grader: grader.grade_task
+    score_range: [0.001, 0.999]
   - id: hard_lru_cache_performance
     difficulty: hard
     max_steps: 40
     description: "Implement O(1) LRU cache from a stub. 15 correctness tests + 1 performance test (10k ops < 200ms)."
     grader: grader.grade_task
+    score_range: [0.001, 0.999]
 # ── Infrastructure ─────────────────────────────────────────────────────────────
 runtime:

reward.py CHANGED Viewed

@@ -31,12 +31,12 @@ TEST_PASS_BONUS_PER_TEST = 0.05
 LINT_CLEAN_BONUS = 0.05
 # Neutral/Small signals (replacing negative penalties to stay strictly in 0-1 range)
-# We use 0.1 to satisfy "strictly between 0 and 1" requirement with safe interior
-ACTION_ERROR_REWARD = 0.10
-REPEATED_FAILURE_REWARD = 0.10
-STEP_BASE_REWARD = 0.10
-TEST_MODIFICATION_REWARD = 0.10
-LINT_VIOLATION_REWARD = 0.10
 # ─────────────────────────────────────────────
@@ -95,7 +95,7 @@ class RewardCalculator:
             "run_tests":        0.02,
             "run_lint":         0.02,
             "request_iteration": 0.02,
-        }.get(action_type, 0.10)
         # ── Test progress bonus ──
         if tests_passed is not None:
@@ -114,8 +114,8 @@ class RewardCalculator:
                     reward += abs(delta) * LINT_VIOLATION_REWARD
             self._prev_lint_violations = lint_violations
-        # Final clamp to strictly within (0.1, 0.9) per OpenEnv validator requirement
-        return round(max(0.1, min(0.9, reward)), 4)
     def _is_test_file(self, path: str) -> bool:
         low = path.lower()

 LINT_CLEAN_BONUS = 0.05
 # Neutral/Small signals (replacing negative penalties to stay strictly in 0-1 range)
+# We use 0.001 to satisfy "strictly between 0 and 1" requirement with high resolution
+ACTION_ERROR_REWARD = 0.001
+REPEATED_FAILURE_REWARD = 0.001
+STEP_BASE_REWARD = 0.001
+TEST_MODIFICATION_REWARD = 0.001
+LINT_VIOLATION_REWARD = 0.001
 # ─────────────────────────────────────────────
             "run_tests":        0.02,
             "run_lint":         0.02,
             "request_iteration": 0.02,
+        }.get(action_type, 0.001)
         # ── Test progress bonus ──
         if tests_passed is not None:
                     reward += abs(delta) * LINT_VIOLATION_REWARD
             self._prev_lint_violations = lint_violations
+        # Final clamp to [0.001, 0.999], keeping the reward strictly inside (0, 1) per OpenEnv validator requirement
+        return round(max(0.001, min(0.999, reward)), 4)
     def _is_test_file(self, path: str) -> bool:
         low = path.lower()