Spaces:

samrat-rm
/

WhyDidItFail

Sleeping

App Files Community

samrat-rm committed 8 days ago

Commit

c348367

1 Parent(s): 0252dc5

fix: logs in inference

Browse files

Files changed (2) hide show

inference.py +3 -8
openenv.yaml +15 -12

inference.py CHANGED Viewed

@@ -254,8 +254,8 @@ async def run_episode(
     finally:
         steps_taken = len(rewards)
-        final_score = round(max(0.01, min(0.99, sum(rewards))), 2) if rewards else 0.01
-        print(f"[END] success={str(success).lower()} steps={steps_taken} reward={final_score:.2f}", flush=True)
     return {"scenario_key": scenario_key, "score": score, "steps": steps_taken, "success": success}, env
@@ -280,9 +280,6 @@ async def run_task(task_name: str, scenario_keys: List[str], env: WhyDidItFailEn
         results.append(res)
         # print(f"[RESULT] scenario={res['scenario_key']} score={res['score']:.3f} steps={res['steps']} success={str(res['success']).lower()}", flush=True)
-    avg_score = sum(r["score"] for r in results) / len(results)
-    pass_rate = sum(1 for r in results if r["success"]) / len(results)
-    # print(f"[SUMMARY] task={task_name} avg_score={avg_score:.3f} pass_rate={pass_rate:.2f}", flush=True)
     return [r["score"] for r in results]
@@ -295,9 +292,7 @@ async def main() -> None:
         scores += await run_task("task_easy",   EASY_SCENARIOS,   env, client)
         scores += await run_task("task_medium", MEDIUM_SCENARIOS, env, client)
         scores += await run_task("task_hard",   HARD_SCENARIOS,   env, client)
-        overall = max(0.01, min(0.99, sum(scores) / len(scores))) if scores else 0.01
-        # print(f"  [OVERALL] avg_score={overall:.3f}", file=sys.stderr, flush=True)
-        print(f"[END] score={overall:.3f}", flush=True)
     finally:
         try:
             await env.close()

     finally:
         steps_taken = len(rewards)
+        final_reward = f"{rewards[-1]:.2f}" if rewards else "0.01"
+        print(f"[END] success={str(success).lower()} steps={steps_taken} rewards={final_reward}", flush=True)
     return {"scenario_key": scenario_key, "score": score, "steps": steps_taken, "success": success}, env
         results.append(res)
         # print(f"[RESULT] scenario={res['scenario_key']} score={res['score']:.3f} steps={res['steps']} success={str(res['success']).lower()}", flush=True)
     return [r["score"] for r in results]
         scores += await run_task("task_easy",   EASY_SCENARIOS,   env, client)
         scores += await run_task("task_medium", MEDIUM_SCENARIOS, env, client)
         scores += await run_task("task_hard",   HARD_SCENARIOS,   env, client)
+        pass  # scoring is handled by the yaml grader, not stdout
     finally:
         try:
             await env.close()

openenv.yaml CHANGED Viewed

@@ -24,13 +24,14 @@ tasks:
         Agent response:
         {response}
-        Score strictly between 0.0 and 1.0 (exclusive — never return exactly 0.0 or 1.0):
-        - 0.99: Correct failure mode with reasoning that cites specific numeric values from the logs
         - 0.70: Correct failure mode but reasoning is vague or missing specific numbers
         - 0.30: Wrong label but description matches a related concept
-        - 0.01: Wrong failure mode or no diagnosis submitted
-        Reply with a single float strictly between 0.0 and 1.0 (e.g. 0.99, not 1.0). No explanation.
   - id: task_medium
     difficulty: medium
@@ -50,13 +51,14 @@ tasks:
         Agent response:
         {response}
-        Score strictly between 0.0 and 1.0 (exclusive — never return exactly 0.0 or 1.0):
-        - 0.99: Correct failure mode with reasoning citing both log values AND config parameters
         - 0.70: Correct failure mode but reasoning only references logs or config, not both
         - 0.30: Wrong label but description matches a related concept
-        - 0.01: Wrong failure mode or no diagnosis submitted
-        Reply with a single float strictly between 0.0 and 1.0 (e.g. 0.99, not 1.0). No explanation.
   - id: task_hard
     difficulty: hard
@@ -77,11 +79,12 @@ tasks:
         Agent response:
         {response}
-        Score strictly between 0.0 and 1.0 (exclusive — never return exactly 0.0 or 1.0):
-        - 0.99: Correct failure mode AND a specific actionable fix addressing the root cause
         - 0.80: Correct failure mode with a reasonable fix that lacks specifics
         - 0.50: Correct failure mode but fix is vague, wrong, or missing
         - 0.20: Wrong failure mode but fix is incidentally relevant
-        - 0.01: Wrong failure mode and no useful fix
-        Reply with a single float strictly between 0.0 and 1.0 (e.g. 0.99, not 1.0). No explanation.

         Agent response:
         {response}
+        IMPORTANT: You must return a decimal number strictly between 0 and 1. Never return 0, 1, 0.0, or 1.0.
+        Use these exact values:
+        - 0.95: Correct failure mode with reasoning that cites specific numeric values from the logs
         - 0.70: Correct failure mode but reasoning is vague or missing specific numbers
         - 0.30: Wrong label but description matches a related concept
+        - 0.05: Wrong failure mode or no diagnosis submitted
+        Reply with a single decimal number from the list above. No explanation. No other text.
   - id: task_medium
     difficulty: medium
         Agent response:
         {response}
+        IMPORTANT: You must return a decimal number strictly between 0 and 1. Never return 0, 1, 0.0, or 1.0.
+        Use these exact values:
+        - 0.95: Correct failure mode with reasoning citing both log values AND config parameters
         - 0.70: Correct failure mode but reasoning only references logs or config, not both
         - 0.30: Wrong label but description matches a related concept
+        - 0.05: Wrong failure mode or no diagnosis submitted
+        Reply with a single decimal number from the list above. No explanation. No other text.
   - id: task_hard
     difficulty: hard
         Agent response:
         {response}
+        IMPORTANT: You must return a decimal number strictly between 0 and 1. Never return 0, 1, 0.0, or 1.0.
+        Use these exact values:
+        - 0.95: Correct failure mode AND a specific actionable fix addressing the root cause
         - 0.80: Correct failure mode with a reasonable fix that lacks specifics
         - 0.50: Correct failure mode but fix is vague, wrong, or missing
         - 0.20: Wrong failure mode but fix is incidentally relevant
+        - 0.05: Wrong failure mode and no useful fix
+        Reply with a single decimal number from the list above. No explanation. No other text.