Your Name committed on
Commit
4f893da
·
1 Parent(s): 8842df9

fix(OpenEnv): fix rounding bug in inference.py log and ensure safe [0.1, 0.9] interior scores

Browse files
Files changed (3) hide show
  1. grader.py +15 -13
  2. inference.py +3 -3
  3. openenv.yaml +4 -4
grader.py CHANGED
@@ -32,8 +32,8 @@ from models import EpisodeResult, ReviewArtifact, ReflectionArtifact
32
  # ─────────────────────────────────────────────
33
  # SCORING CONFIG
34
  # ─────────────────────────────────────────────
35
- _SCORE_MIN = 0.001 # never exactly 0.0
36
- _SCORE_MAX = 0.999 # never exactly 1.0
37
 
38
 
39
  def _clamp(score: float) -> float:
@@ -166,7 +166,6 @@ def score_review_quality(
166
  code_words = re.findall(r'\b[a-z_]{3,}\(\)', combined)
167
  specificity = min(0.1, len(set(code_words)) * 0.025)
168
 
169
- # Strictly (0, 1) - Safer interior
170
  return _clamp(kw_score * 0.7 + length_bonus + specificity)
171
 
172
 
@@ -318,13 +317,16 @@ def grade_task(repo_path: str, **kwargs) -> float:
318
  except Exception:
319
  pass
320
 
321
- result = grade_episode(
322
- repo_path=repo_path,
323
- task_id=task_id,
324
- total_steps=total_steps,
325
- max_steps=max_steps,
326
- reviews=reviews,
327
- reflections=reflections,
328
- required_keywords=required_keywords,
329
- )
330
- return float(result.final_score)
 
 
 
 
32
  # ─────────────────────────────────────────────
33
  # SCORING CONFIG
34
  # ─────────────────────────────────────────────
35
+ _SCORE_MIN = 0.10 # deep interior to avoid all boundary issues
36
+ _SCORE_MAX = 0.90 # deep interior to avoid all boundary issues
37
 
38
 
39
  def _clamp(score: float) -> float:
 
166
  code_words = re.findall(r'\b[a-z_]{3,}\(\)', combined)
167
  specificity = min(0.1, len(set(code_words)) * 0.025)
168
 
 
169
  return _clamp(kw_score * 0.7 + length_bonus + specificity)
170
 
171
 
 
317
  except Exception:
318
  pass
319
 
320
+ try:
321
+ result = grade_episode(
322
+ repo_path=repo_path,
323
+ task_id=task_id,
324
+ total_steps=total_steps,
325
+ max_steps=max_steps,
326
+ reviews=reviews,
327
+ reflections=reflections,
328
+ required_keywords=required_keywords,
329
+ )
330
+ return float(result.final_score)
331
+ except Exception:
332
+ return _SCORE_MIN
inference.py CHANGED
@@ -237,11 +237,11 @@ def run_episode(env: TeamForgeEnv, agent: Agent, task_id: str) -> Dict:
237
  score = result.final_score
238
  success = result.passed
239
 
240
- rewards_str = ",".join(f"{r:.3f}" for r in rewards) if rewards else "0.001"
241
 
242
  # ── [END] ─────────────────────────────────────────────────────────────────
243
- # We use 2 decimal places to match common validator expectations,
244
- # but the internal value is strictly interior [0.1, 0.9].
245
  print(
246
  f"[END] success={'true' if success else 'false'} steps={step_count} "
247
  f"score={score:.4f} rewards={rewards_str}",
 
237
  score = result.final_score
238
  success = result.passed
239
 
240
+ rewards_str = ",".join(f"{r:.4f}" for r in rewards) if rewards else "0.1000"
241
 
242
  # ── [END] ─────────────────────────────────────────────────────────────────
243
+ # We use 4 decimal places to ensure that interior scores (e.g. 0.999)
244
+ # are never rounded to illegal boundary values (1.00) in the logs.
245
  print(
246
  f"[END] success={'true' if success else 'false'} steps={step_count} "
247
  f"score={score:.4f} rewards={rewards_str}",
openenv.yaml CHANGED
@@ -116,7 +116,7 @@ observation_space:
116
 
117
  # ── Reward ─────────────────────────────────────────────────────────────────────
118
  reward:
119
- range: [0.001, 0.999]
120
  type: dense
121
  description: >
122
  Dense shaped reward. Positive for: correct plan steps, edits, passing tests,
@@ -129,21 +129,21 @@ tasks:
129
  max_steps: 20
130
  description: "Fix an off-by-one bug in utils/list_ops.py. All 7 tests must pass."
131
  grader: grader.grade_task
132
- score_range: [0.001, 0.999]
133
 
134
  - id: medium_refactor_stats
135
  difficulty: medium
136
  max_steps: 30
137
  description: "Refactor monolithic stats.py into a stats/ package. 15 tests must pass with full backward compatibility."
138
  grader: grader.grade_task
139
- score_range: [0.001, 0.999]
140
 
141
  - id: hard_lru_cache_performance
142
  difficulty: hard
143
  max_steps: 40
144
  description: "Implement O(1) LRU cache from a stub. 15 correctness tests + 1 performance test (10k ops < 200ms)."
145
  grader: grader.grade_task
146
- score_range: [0.001, 0.999]
147
 
148
  # ── Infrastructure ─────────────────────────────────────────────────────────────
149
  runtime:
 
116
 
117
  # ── Reward ─────────────────────────────────────────────────────────────────────
118
  reward:
119
+ range: [0.0, 1.0]
120
  type: dense
121
  description: >
122
  Dense shaped reward. Positive for: correct plan steps, edits, passing tests,
 
129
  max_steps: 20
130
  description: "Fix an off-by-one bug in utils/list_ops.py. All 7 tests must pass."
131
  grader: grader.grade_task
132
+ score_range: [0.0, 1.0]
133
 
134
  - id: medium_refactor_stats
135
  difficulty: medium
136
  max_steps: 30
137
  description: "Refactor monolithic stats.py into a stats/ package. 15 tests must pass with full backward compatibility."
138
  grader: grader.grade_task
139
+ score_range: [0.0, 1.0]
140
 
141
  - id: hard_lru_cache_performance
142
  difficulty: hard
143
  max_steps: 40
144
  description: "Implement O(1) LRU cache from a stub. 15 correctness tests + 1 performance test (10k ops < 200ms)."
145
  grader: grader.grade_task
146
+ score_range: [0.0, 1.0]
147
 
148
  # ── Infrastructure ─────────────────────────────────────────────────────────────
149
  runtime: