samrat-rm committed on
Commit
c130122
·
1 Parent(s): 2014a9f

fix: reward scores are updated to be between 0 and 1

Browse files
Files changed (2) hide show
  1. inference.py +3 -3
  2. server/graders.py +2 -2
inference.py CHANGED
@@ -210,7 +210,7 @@ async def run_episode(
210
  print(f"[STEP] step={step} action={action.action_type} reward=0.00 done=true error={e}", flush=True)
211
  break
212
  obs = result.observation
213
- reward = result.reward or 0.0
214
  done = result.done
215
  if action.action_type in ("inspect_logs", "inspect_config", "inspect_gradients"):
216
  source = action.action_type.replace("inspect_", "")
@@ -252,8 +252,8 @@ async def run_episode(
252
 
253
  finally:
254
  steps_taken = len(rewards)
255
- rewards_str = ",".join(f"{r:.2f}" for r in rewards) if rewards else "0.00"
256
- print(f"[END] success={str(success).lower()} steps={steps_taken} rewards={rewards_str}", flush=True)
257
 
258
  return {"scenario_key": scenario_key, "score": score, "steps": steps_taken, "success": success}, env
259
 
 
210
  print(f"[STEP] step={step} action={action.action_type} reward=0.00 done=true error={e}", flush=True)
211
  break
212
  obs = result.observation
213
+ reward = round(min(0.99, result.reward or 0.01), 2)
214
  done = result.done
215
  if action.action_type in ("inspect_logs", "inspect_config", "inspect_gradients"):
216
  source = action.action_type.replace("inspect_", "")
 
252
 
253
  finally:
254
  steps_taken = len(rewards)
255
+ final_score = round(max(0.01, min(0.99, sum(rewards))), 2) if rewards else 0.01
256
+ print(f"[END] success={str(success).lower()} steps={steps_taken} reward={final_score}", flush=True)
257
 
258
  return {"scenario_key": scenario_key, "score": score, "steps": steps_taken, "success": success}, env
259
 
server/graders.py CHANGED
@@ -210,7 +210,7 @@ def grade(
210
  max_steps = len(required) * 3 + 2 # hard ceiling; exceeding it = total failure
211
 
212
  if steps_taken > max_steps:
213
- return 0.0
214
 
215
  d_score = _diagnosis_score(diagnosis, scenario)
216
  ed_penalty = _evidence_diagnosis_penalty(diagnosis, scenario, inspection_order)
@@ -221,4 +221,4 @@ def grade(
221
 
222
  total = d_score + ed_penalty + e_score + f_score + b_score + o_bonus
223
 
224
- return round(max(0.0, min(1.0, total)), 4)
 
210
  max_steps = len(required) * 3 + 2 # hard ceiling; exceeding it = total failure
211
 
212
  if steps_taken > max_steps:
213
+ return 0.01
214
 
215
  d_score = _diagnosis_score(diagnosis, scenario)
216
  ed_penalty = _evidence_diagnosis_penalty(diagnosis, scenario, inspection_order)
 
221
 
222
  total = d_score + ed_penalty + e_score + f_score + b_score + o_bonus
223
 
224
+ return round(max(0.01, min(0.99, total)), 2)