Spaces:

PrakashCider
/

teamforge

Sleeping

App Files Files Community

Your Name committed on Apr 11

Commit

652a783

1 Parent(s): 368d425

fix(OpenEnv): remove bonus task and refine ranges to ensure strict (0, 1) compliance across all fields

Browse files

Files changed (4) hide show

README.md +1 -5
inference.py +1 -1
openenv.yaml +4 -12
tasks/task_registry.py +0 -1

README.md CHANGED Viewed

@@ -112,11 +112,7 @@ Current benchmarks (HumanEval, SWE-bench, MBPP) treat code generation as a **sin
 - 15 correctness tests + 1 performance test: 10,000 ops in < 200ms
 - **Algorithm design + complexity analysis + perf constraint · 40 step limit**
-### 🟣 Bonus — `bonus_perf_regression_merge`
-**Real-world analog:** Diagnosing a silent O(n²) regression from a bad merge conflict resolution
-- Functional tests still pass — only a 50,000-document perf test reveals the bug
-- Requires code archaeology: reading conflict markers + understanding algorithm history
-- **Unique task — no existing benchmark includes this scenario**
 ---

 - 15 correctness tests + 1 performance test: 10,000 ops in < 200ms
 - **Algorithm design + complexity analysis + perf constraint · 40 step limit**
+---
 ---

inference.py CHANGED Viewed

@@ -225,7 +225,7 @@ def run_episode(env: TeamForgeEnv, agent: Agent, task_id: str) -> Dict:
     # ── [END] ─────────────────────────────────────────────────────────────────
     print(
         f"[END] success={'true' if success else 'false'} steps={step_count} "
-        f"score={score:.2f} rewards={rewards_str}",
         flush=True,
     )

     # ── [END] ─────────────────────────────────────────────────────────────────
     print(
         f"[END] success={'true' if success else 'false'} steps={step_count} "
+        f"score={score:.4f} rewards={rewards_str}",
         flush=True,
     )

openenv.yaml CHANGED Viewed

@@ -21,7 +21,6 @@ methods:
           - easy_bugfix_chunk_list
           - medium_refactor_stats
           - hard_lru_cache_performance
-          - bonus_perf_regression_merge
         description: "Which task to run this episode."
   step:
     description: "Execute one typed action. Returns updated Observation with reward."
@@ -117,7 +116,7 @@ observation_space:
 # ── Reward ─────────────────────────────────────────────────────────────────────
 reward:
-  range: [0.01, 0.99]
   type: dense
   description: >
     Dense shaped reward. Positive for: correct plan steps, edits, passing tests,
@@ -130,28 +129,21 @@ tasks:
     max_steps: 20
     description: "Fix an off-by-one bug in utils/list_ops.py. All 7 tests must pass."
     grader: grader.grade_episode
-    score_range: [0.01, 0.99]
   - id: medium_refactor_stats
     difficulty: medium
     max_steps: 30
     description: "Refactor monolithic stats.py into a stats/ package. 15 tests must pass with full backward compatibility."
     grader: grader.grade_episode
-    score_range: [0.01, 0.99]
   - id: hard_lru_cache_performance
     difficulty: hard
     max_steps: 40
     description: "Implement O(1) LRU cache from a stub. 15 correctness tests + 1 performance test (10k ops < 200ms)."
     grader: grader.grade_episode
-    score_range: [0.01, 0.99]
-  - id: bonus_perf_regression_merge
-    difficulty: hard
-    max_steps: 40
-    description: "Diagnose and fix O(n²) regression hidden inside a bad merge conflict resolution. Perf test: 50k docs < 500ms."
-    grader: grader.grade_episode
-    score_range: [0.01, 0.99]
 # ── Infrastructure ─────────────────────────────────────────────────────────────
 runtime:

           - easy_bugfix_chunk_list
           - medium_refactor_stats
           - hard_lru_cache_performance
         description: "Which task to run this episode."
   step:
     description: "Execute one typed action. Returns updated Observation with reward."
 # ── Reward ─────────────────────────────────────────────────────────────────────
 reward:
+  range: [0.0, 10.0]
   type: dense
   description: >
     Dense shaped reward. Positive for: correct plan steps, edits, passing tests,
     max_steps: 20
     description: "Fix an off-by-one bug in utils/list_ops.py. All 7 tests must pass."
     grader: grader.grade_episode
+    score_range: [0.0, 1.0]
   - id: medium_refactor_stats
     difficulty: medium
     max_steps: 30
     description: "Refactor monolithic stats.py into a stats/ package. 15 tests must pass with full backward compatibility."
     grader: grader.grade_episode
+    score_range: [0.0, 1.0]
   - id: hard_lru_cache_performance
     difficulty: hard
     max_steps: 40
     description: "Implement O(1) LRU cache from a stub. 15 correctness tests + 1 performance test (10k ops < 200ms)."
     grader: grader.grade_episode
+    score_range: [0.0, 1.0]
 # ── Infrastructure ─────────────────────────────────────────────────────────────
 runtime:

tasks/task_registry.py CHANGED Viewed

@@ -7,7 +7,6 @@ TASK_REGISTRY: Dict[str, Any] = {
     easy_task.TASK_ID:   easy_task,
     medium_task.TASK_ID: medium_task,
     hard_task.TASK_ID:   hard_task,
-    bonus_task.TASK_ID:  bonus_task,
 }
 # The 3 scored tasks for the hackathon (easy, medium, hard)

     easy_task.TASK_ID:   easy_task,
     medium_task.TASK_ID: medium_task,
     hard_task.TASK_ID:   hard_task,
 }
 # The 3 scored tasks for the hackathon (easy, medium, hard)