samrat-rm committed on
Commit
c348367
·
1 Parent(s): 0252dc5

fix: logs in inference

Browse files
Files changed (2) hide show
  1. inference.py +3 -8
  2. openenv.yaml +15 -12
inference.py CHANGED
@@ -254,8 +254,8 @@ async def run_episode(
254
 
255
  finally:
256
  steps_taken = len(rewards)
257
- final_score = round(max(0.01, min(0.99, sum(rewards))), 2) if rewards else 0.01
258
- print(f"[END] success={str(success).lower()} steps={steps_taken} reward={final_score:.2f}", flush=True)
259
 
260
  return {"scenario_key": scenario_key, "score": score, "steps": steps_taken, "success": success}, env
261
 
@@ -280,9 +280,6 @@ async def run_task(task_name: str, scenario_keys: List[str], env: WhyDidItFailEn
280
  results.append(res)
281
  # print(f"[RESULT] scenario={res['scenario_key']} score={res['score']:.3f} steps={res['steps']} success={str(res['success']).lower()}", flush=True)
282
 
283
- avg_score = sum(r["score"] for r in results) / len(results)
284
- pass_rate = sum(1 for r in results if r["success"]) / len(results)
285
- # print(f"[SUMMARY] task={task_name} avg_score={avg_score:.3f} pass_rate={pass_rate:.2f}", flush=True)
286
  return [r["score"] for r in results]
287
 
288
 
@@ -295,9 +292,7 @@ async def main() -> None:
295
  scores += await run_task("task_easy", EASY_SCENARIOS, env, client)
296
  scores += await run_task("task_medium", MEDIUM_SCENARIOS, env, client)
297
  scores += await run_task("task_hard", HARD_SCENARIOS, env, client)
298
- overall = max(0.01, min(0.99, sum(scores) / len(scores))) if scores else 0.01
299
- # print(f" [OVERALL] avg_score={overall:.3f}", file=sys.stderr, flush=True)
300
- print(f"[END] score={overall:.3f}", flush=True)
301
  finally:
302
  try:
303
  await env.close()
 
254
 
255
  finally:
256
  steps_taken = len(rewards)
257
+ final_reward = f"{rewards[-1]:.2f}" if rewards else "0.01"
258
+ print(f"[END] success={str(success).lower()} steps={steps_taken} rewards={final_reward}", flush=True)
259
 
260
  return {"scenario_key": scenario_key, "score": score, "steps": steps_taken, "success": success}, env
261
 
 
280
  results.append(res)
281
  # print(f"[RESULT] scenario={res['scenario_key']} score={res['score']:.3f} steps={res['steps']} success={str(res['success']).lower()}", flush=True)
282
 
 
 
 
283
  return [r["score"] for r in results]
284
 
285
 
 
292
  scores += await run_task("task_easy", EASY_SCENARIOS, env, client)
293
  scores += await run_task("task_medium", MEDIUM_SCENARIOS, env, client)
294
  scores += await run_task("task_hard", HARD_SCENARIOS, env, client)
295
+ pass # scoring is handled by the yaml grader, not stdout
 
 
296
  finally:
297
  try:
298
  await env.close()
openenv.yaml CHANGED
@@ -24,13 +24,14 @@ tasks:
24
  Agent response:
25
  {response}
26
 
27
- Score strictly between 0.0 and 1.0 (exclusive — never return exactly 0.0 or 1.0):
28
- - 0.99: Correct failure mode with reasoning that cites specific numeric values from the logs
 
29
  - 0.70: Correct failure mode but reasoning is vague or missing specific numbers
30
  - 0.30: Wrong label but description matches a related concept
31
- - 0.01: Wrong failure mode or no diagnosis submitted
32
 
33
- Reply with a single float strictly between 0.0 and 1.0 (e.g. 0.99, not 1.0). No explanation.
34
 
35
  - id: task_medium
36
  difficulty: medium
@@ -50,13 +51,14 @@ tasks:
50
  Agent response:
51
  {response}
52
 
53
- Score strictly between 0.0 and 1.0 (exclusive — never return exactly 0.0 or 1.0):
54
- - 0.99: Correct failure mode with reasoning citing both log values AND config parameters
 
55
  - 0.70: Correct failure mode but reasoning only references logs or config, not both
56
  - 0.30: Wrong label but description matches a related concept
57
- - 0.01: Wrong failure mode or no diagnosis submitted
58
 
59
- Reply with a single float strictly between 0.0 and 1.0 (e.g. 0.99, not 1.0). No explanation.
60
 
61
  - id: task_hard
62
  difficulty: hard
@@ -77,11 +79,12 @@ tasks:
77
  Agent response:
78
  {response}
79
 
80
- Score strictly between 0.0 and 1.0 (exclusive — never return exactly 0.0 or 1.0):
81
- - 0.99: Correct failure mode AND a specific actionable fix addressing the root cause
 
82
  - 0.80: Correct failure mode with a reasonable fix that lacks specifics
83
  - 0.50: Correct failure mode but fix is vague, wrong, or missing
84
  - 0.20: Wrong failure mode but fix is incidentally relevant
85
- - 0.01: Wrong failure mode and no useful fix
86
 
87
- Reply with a single float strictly between 0.0 and 1.0 (e.g. 0.99, not 1.0). No explanation.
 
24
  Agent response:
25
  {response}
26
 
27
+ IMPORTANT: You must return a decimal number strictly between 0 and 1. Never return 0, 1, 0.0, or 1.0.
28
+ Use these exact values:
29
+ - 0.95: Correct failure mode with reasoning that cites specific numeric values from the logs
30
  - 0.70: Correct failure mode but reasoning is vague or missing specific numbers
31
  - 0.30: Wrong label but description matches a related concept
32
+ - 0.05: Wrong failure mode or no diagnosis submitted
33
 
34
+ Reply with a single decimal number from the list above. No explanation. No other text.
35
 
36
  - id: task_medium
37
  difficulty: medium
 
51
  Agent response:
52
  {response}
53
 
54
+ IMPORTANT: You must return a decimal number strictly between 0 and 1. Never return 0, 1, 0.0, or 1.0.
55
+ Use these exact values:
56
+ - 0.95: Correct failure mode with reasoning citing both log values AND config parameters
57
  - 0.70: Correct failure mode but reasoning only references logs or config, not both
58
  - 0.30: Wrong label but description matches a related concept
59
+ - 0.05: Wrong failure mode or no diagnosis submitted
60
 
61
+ Reply with a single decimal number from the list above. No explanation. No other text.
62
 
63
  - id: task_hard
64
  difficulty: hard
 
79
  Agent response:
80
  {response}
81
 
82
+ IMPORTANT: You must return a decimal number strictly between 0 and 1. Never return 0, 1, 0.0, or 1.0.
83
+ Use these exact values:
84
+ - 0.95: Correct failure mode AND a specific actionable fix addressing the root cause
85
  - 0.80: Correct failure mode with a reasonable fix that lacks specifics
86
  - 0.50: Correct failure mode but fix is vague, wrong, or missing
87
  - 0.20: Wrong failure mode but fix is incidentally relevant
88
+ - 0.05: Wrong failure mode and no useful fix
89
 
90
+ Reply with a single decimal number from the list above. No explanation. No other text.