Spaces:
Sleeping
Sleeping
Your Name committed on
Commit ·
4f893da
1
Parent(s): 8842df9
fix(OpenEnv): fix rounding bug in inference.py log and ensure safe [0.1, 0.9] interior scores
Browse files- grader.py +15 -13
- inference.py +3 -3
- openenv.yaml +4 -4
grader.py
CHANGED
|
@@ -32,8 +32,8 @@ from models import EpisodeResult, ReviewArtifact, ReflectionArtifact
|
|
| 32 |
# ─────────────────────────────────────────────
|
| 33 |
# SCORING CONFIG
|
| 34 |
# ─────────────────────────────────────────────
|
| 35 |
-
_SCORE_MIN = 0.
|
| 36 |
-
_SCORE_MAX = 0.
|
| 37 |
|
| 38 |
|
| 39 |
def _clamp(score: float) -> float:
|
|
@@ -166,7 +166,6 @@ def score_review_quality(
|
|
| 166 |
code_words = re.findall(r'\b[a-z_]{3,}\(\)', combined)
|
| 167 |
specificity = min(0.1, len(set(code_words)) * 0.025)
|
| 168 |
|
| 169 |
-
# Strictly (0, 1) - Safer interior
|
| 170 |
return _clamp(kw_score * 0.7 + length_bonus + specificity)
|
| 171 |
|
| 172 |
|
|
@@ -318,13 +317,16 @@ def grade_task(repo_path: str, **kwargs) -> float:
|
|
| 318 |
except Exception:
|
| 319 |
pass
|
| 320 |
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
# ─────────────────────────────────────────────
|
| 33 |
# SCORING CONFIG
|
| 34 |
# ─────────────────────────────────────────────
|
| 35 |
+
_SCORE_MIN = 0.10 # deep interior to avoid all boundary issues
|
| 36 |
+
_SCORE_MAX = 0.90 # deep interior to avoid all boundary issues
|
| 37 |
|
| 38 |
|
| 39 |
def _clamp(score: float) -> float:
|
|
|
|
| 166 |
code_words = re.findall(r'\b[a-z_]{3,}\(\)', combined)
|
| 167 |
specificity = min(0.1, len(set(code_words)) * 0.025)
|
| 168 |
|
|
|
|
| 169 |
return _clamp(kw_score * 0.7 + length_bonus + specificity)
|
| 170 |
|
| 171 |
|
|
|
|
| 317 |
except Exception:
|
| 318 |
pass
|
| 319 |
|
| 320 |
+
try:
|
| 321 |
+
result = grade_episode(
|
| 322 |
+
repo_path=repo_path,
|
| 323 |
+
task_id=task_id,
|
| 324 |
+
total_steps=total_steps,
|
| 325 |
+
max_steps=max_steps,
|
| 326 |
+
reviews=reviews,
|
| 327 |
+
reflections=reflections,
|
| 328 |
+
required_keywords=required_keywords,
|
| 329 |
+
)
|
| 330 |
+
return float(result.final_score)
|
| 331 |
+
except Exception:
|
| 332 |
+
return _SCORE_MIN
|
inference.py
CHANGED
|
@@ -237,11 +237,11 @@ def run_episode(env: TeamForgeEnv, agent: Agent, task_id: str) -> Dict:
|
|
| 237 |
score = result.final_score
|
| 238 |
success = result.passed
|
| 239 |
|
| 240 |
-
rewards_str = ",".join(f"{r:.
|
| 241 |
|
| 242 |
# ── [END] ─────────────────────────────────────────────────────────────────
|
| 243 |
-
# We use
|
| 244 |
-
#
|
| 245 |
print(
|
| 246 |
f"[END] success={'true' if success else 'false'} steps={step_count} "
|
| 247 |
f"score={score:.4f} rewards={rewards_str}",
|
|
|
|
| 237 |
score = result.final_score
|
| 238 |
success = result.passed
|
| 239 |
|
| 240 |
+
rewards_str = ",".join(f"{r:.4f}" for r in rewards) if rewards else "0.1000"
|
| 241 |
|
| 242 |
# ── [END] ─────────────────────────────────────────────────────────────────
|
| 243 |
+
# We use 4 decimal places to ensure that interior scores (e.g. 0.999)
|
| 244 |
+
# are never rounded to illegal boundary values (1.00) in the logs.
|
| 245 |
print(
|
| 246 |
f"[END] success={'true' if success else 'false'} steps={step_count} "
|
| 247 |
f"score={score:.4f} rewards={rewards_str}",
|
openenv.yaml
CHANGED
|
@@ -116,7 +116,7 @@ observation_space:
|
|
| 116 |
|
| 117 |
# ── Reward ─────────────────────────────────────────────────────────────────────
|
| 118 |
reward:
|
| 119 |
-
range: [0.
|
| 120 |
type: dense
|
| 121 |
description: >
|
| 122 |
Dense shaped reward. Positive for: correct plan steps, edits, passing tests,
|
|
@@ -129,21 +129,21 @@ tasks:
|
|
| 129 |
max_steps: 20
|
| 130 |
description: "Fix an off-by-one bug in utils/list_ops.py. All 7 tests must pass."
|
| 131 |
grader: grader.grade_task
|
| 132 |
-
score_range: [0.
|
| 133 |
|
| 134 |
- id: medium_refactor_stats
|
| 135 |
difficulty: medium
|
| 136 |
max_steps: 30
|
| 137 |
description: "Refactor monolithic stats.py into a stats/ package. 15 tests must pass with full backward compatibility."
|
| 138 |
grader: grader.grade_task
|
| 139 |
-
score_range: [0.
|
| 140 |
|
| 141 |
- id: hard_lru_cache_performance
|
| 142 |
difficulty: hard
|
| 143 |
max_steps: 40
|
| 144 |
description: "Implement O(1) LRU cache from a stub. 15 correctness tests + 1 performance test (10k ops < 200ms)."
|
| 145 |
grader: grader.grade_task
|
| 146 |
-
score_range: [0.
|
| 147 |
|
| 148 |
# ── Infrastructure ─────────────────────────────────────────────────────────────
|
| 149 |
runtime:
|
|
|
|
| 116 |
|
| 117 |
# ── Reward ─────────────────────────────────────────────────────────────────────
|
| 118 |
reward:
|
| 119 |
+
range: [0.0, 1.0]
|
| 120 |
type: dense
|
| 121 |
description: >
|
| 122 |
Dense shaped reward. Positive for: correct plan steps, edits, passing tests,
|
|
|
|
| 129 |
max_steps: 20
|
| 130 |
description: "Fix an off-by-one bug in utils/list_ops.py. All 7 tests must pass."
|
| 131 |
grader: grader.grade_task
|
| 132 |
+
score_range: [0.0, 1.0]
|
| 133 |
|
| 134 |
- id: medium_refactor_stats
|
| 135 |
difficulty: medium
|
| 136 |
max_steps: 30
|
| 137 |
description: "Refactor monolithic stats.py into a stats/ package. 15 tests must pass with full backward compatibility."
|
| 138 |
grader: grader.grade_task
|
| 139 |
+
score_range: [0.0, 1.0]
|
| 140 |
|
| 141 |
- id: hard_lru_cache_performance
|
| 142 |
difficulty: hard
|
| 143 |
max_steps: 40
|
| 144 |
description: "Implement O(1) LRU cache from a stub. 15 correctness tests + 1 performance test (10k ops < 200ms)."
|
| 145 |
grader: grader.grade_task
|
| 146 |
+
score_range: [0.0, 1.0]
|
| 147 |
|
| 148 |
# ── Infrastructure ─────────────────────────────────────────────────────────────
|
| 149 |
runtime:
|