Spaces:
Sleeping
Sleeping
fix: reward scores are updated to lie strictly between 0 and 1 (graded and final scores clamped to 0.01–0.99)
Browse files
- inference.py +3 -3
- server/graders.py +2 -2
inference.py
CHANGED
|
@@ -210,7 +210,7 @@ async def run_episode(
|
|
| 210 |
print(f"[STEP] step={step} action={action.action_type} reward=0.00 done=true error={e}", flush=True)
|
| 211 |
break
|
| 212 |
obs = result.observation
|
| 213 |
-
reward = result.reward or 0.
|
| 214 |
done = result.done
|
| 215 |
if action.action_type in ("inspect_logs", "inspect_config", "inspect_gradients"):
|
| 216 |
source = action.action_type.replace("inspect_", "")
|
|
@@ -252,8 +252,8 @@ async def run_episode(
|
|
| 252 |
|
| 253 |
finally:
|
| 254 |
steps_taken = len(rewards)
|
| 255 |
-
|
| 256 |
-
print(f"[END] success={str(success).lower()} steps={steps_taken}
|
| 257 |
|
| 258 |
return {"scenario_key": scenario_key, "score": score, "steps": steps_taken, "success": success}, env
|
| 259 |
|
|
|
|
| 210 |
print(f"[STEP] step={step} action={action.action_type} reward=0.00 done=true error={e}", flush=True)
|
| 211 |
break
|
| 212 |
obs = result.observation
|
| 213 |
+
reward = round(min(0.99, result.reward or 0.01), 2)
|
| 214 |
done = result.done
|
| 215 |
if action.action_type in ("inspect_logs", "inspect_config", "inspect_gradients"):
|
| 216 |
source = action.action_type.replace("inspect_", "")
|
|
|
|
| 252 |
|
| 253 |
finally:
|
| 254 |
steps_taken = len(rewards)
|
| 255 |
+
final_score = round(max(0.01, min(0.99, sum(rewards))), 2) if rewards else 0.01
|
| 256 |
+
print(f"[END] success={str(success).lower()} steps={steps_taken} reward={final_score}", flush=True)
|
| 257 |
|
| 258 |
return {"scenario_key": scenario_key, "score": score, "steps": steps_taken, "success": success}, env
|
| 259 |
|
server/graders.py
CHANGED
|
@@ -210,7 +210,7 @@ def grade(
|
|
| 210 |
max_steps = len(required) * 3 + 2 # hard ceiling; exceeding it = total failure
|
| 211 |
|
| 212 |
if steps_taken > max_steps:
|
| 213 |
-
return 0.
|
| 214 |
|
| 215 |
d_score = _diagnosis_score(diagnosis, scenario)
|
| 216 |
ed_penalty = _evidence_diagnosis_penalty(diagnosis, scenario, inspection_order)
|
|
@@ -221,4 +221,4 @@ def grade(
|
|
| 221 |
|
| 222 |
total = d_score + ed_penalty + e_score + f_score + b_score + o_bonus
|
| 223 |
|
| 224 |
-
return round(max(0.
|
|
|
|
| 210 |
max_steps = len(required) * 3 + 2 # hard ceiling; exceeding it = total failure
|
| 211 |
|
| 212 |
if steps_taken > max_steps:
|
| 213 |
+
return 0.01
|
| 214 |
|
| 215 |
d_score = _diagnosis_score(diagnosis, scenario)
|
| 216 |
ed_penalty = _evidence_diagnosis_penalty(diagnosis, scenario, inspection_order)
|
|
|
|
| 221 |
|
| 222 |
total = d_score + ed_penalty + e_score + f_score + b_score + o_bonus
|
| 223 |
|
| 224 |
+
return round(max(0.01, min(0.99, total)), 2)
|