chore: restrict stdout to START/STEP/END for eval compliance
Comment out JUDGE, RESULT, SUMMARY, DEBUG, WARN, INFO, OVERALL logs
so the hackathon evaluator only sees the expected line format.
Logs preserved for local use.
- inference.py +11 -11
inference.py

```diff
@@ -159,7 +159,7 @@ def _get_action(client: OpenAI, step: int, obs_summary: str, history: List[str])
         filtered = {k: v for k, v in data.items() if k in WhyDidItFailAction.model_fields}
         return WhyDidItFailAction(**filtered)
     except Exception as exc:
-        print(f" [DEBUG] parse error: {exc}", file=sys.stderr, flush=True)
+        # print(f" [DEBUG] parse error: {exc}", file=sys.stderr, flush=True)
         if step <= 2:
             return WhyDidItFailAction(action_type="inspect_logs", diagnosis=None, suggested_fix=None, reasoning=None)
         return WhyDidItFailAction(action_type="submit_diagnosis", diagnosis="unknown", suggested_fix=None, reasoning=None)
@@ -184,7 +184,7 @@ async def run_episode(
     try:
         result = await env.reset(scenario_key=scenario_key)
     except ConnectionClosedError:
-        print(f" [WARN] scenario={scenario_key} reconnecting WebSocket...", file=sys.stderr, flush=True)
+        # print(f" [WARN] scenario={scenario_key} reconnecting WebSocket...", file=sys.stderr, flush=True)
         env = await _make_env()
         result = await env.reset(scenario_key=scenario_key)
 
@@ -245,10 +245,10 @@ async def run_episode(
     )
     if judge_score is None:
         score = round(keyword_score, 4)
-        print(f" [JUDGE] scenario={scenario_key} keyword={keyword_score:.3f} reasoning=n/a total={score:.3f}", file=sys.stderr, flush=True)
+        # print(f" [JUDGE] scenario={scenario_key} keyword={keyword_score:.3f} reasoning=n/a total={score:.3f}", file=sys.stderr, flush=True)
     else:
         score = round(0.85 * keyword_score + 0.15 * judge_score, 4)
-        print(f" [JUDGE] scenario={scenario_key} keyword={keyword_score:.3f} reasoning={judge_score:.3f} total={score:.3f}", file=sys.stderr, flush=True)
+        # print(f" [JUDGE] scenario={scenario_key} keyword={keyword_score:.3f} reasoning={judge_score:.3f} total={score:.3f}", file=sys.stderr, flush=True)
 
     success = score >= SUCCESS_THRESHOLD
 
@@ -262,7 +262,7 @@ async def run_episode(
 
 async def run_task(task_name: str, scenario_keys: List[str], env: WhyDidItFailEnv, client: OpenAI) -> List[float]:
     if not scenario_keys:
-        print(f" [INFO] task={task_name} — no scenarios defined yet", flush=True)
+        # print(f" [INFO] task={task_name} — no scenarios defined yet", flush=True)
         return []
 
     if USE_LOCAL:
@@ -278,11 +278,11 @@ async def run_task(task_name: str, scenario_keys: List[str], env: WhyDidItFailEn
     for key in scenario_keys:
         res, env = await run_episode(env, client, key, task_name, effective_model)
         results.append(res)
-        print(f"[RESULT] scenario={res['scenario_key']} score={res['score']:.3f} steps={res['steps']} success={str(res['success']).lower()}", flush=True)
+        # print(f"[RESULT] scenario={res['scenario_key']} score={res['score']:.3f} steps={res['steps']} success={str(res['success']).lower()}", flush=True)
 
     avg_score = sum(r["score"] for r in results) / len(results)
     pass_rate = sum(1 for r in results if r["success"]) / len(results)
-    print(f"[SUMMARY] task={task_name} avg_score={avg_score:.3f} pass_rate={pass_rate:.2f}", flush=True)
+    # print(f"[SUMMARY] task={task_name} avg_score={avg_score:.3f} pass_rate={pass_rate:.2f}", flush=True)
     return [r["score"] for r in results]
 
 
@@ -296,14 +296,14 @@ async def main() -> None:
         scores += await run_task("task_medium", MEDIUM_SCENARIOS, env, client)
         scores += await run_task("task_hard", HARD_SCENARIOS, env, client)
         overall = sum(scores) / len(scores) if scores else 0.0
-        print(f" [OVERALL] avg_score={overall:.3f}", file=sys.stderr, flush=True)
-        print(f"[END] score={overall:.3f}", flush=True)
+        # print(f" [OVERALL] avg_score={overall:.3f}", file=sys.stderr, flush=True)
+        # print(f"[END] score={overall:.3f}", flush=True)
     finally:
         try:
             await env.close()
         except Exception as e:
-            print(f" [DEBUG] env.close() error: {e}", file=sys.stderr, flush=True)
-
+            # print(f" [DEBUG] env.close() error: {e}", file=sys.stderr, flush=True)
+            pass
 
 if __name__ == "__main__":
     asyncio.run(main())
```
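One note on the last hunk: the added `pass` keeps the `except` block syntactically valid once its only statement is commented out. If the logs need to come back for local runs, an alternative to un-commenting each call by hand is to gate them behind an environment variable. A minimal sketch, assuming a hypothetical `WDIF_DEBUG` flag and `log` helper that are not part of this commit:

```python
import os
import sys

# Hypothetical opt-in flag: any non-empty value re-enables the diagnostic logs.
DEBUG_LOGS = bool(os.environ.get("WDIF_DEBUG"))

def log(msg: str) -> None:
    """Send a diagnostic line to stderr, but only when WDIF_DEBUG is set.

    stdout is never touched here, so the evaluator's expected
    START/STEP/END lines stay free of debug output either way.
    """
    if DEBUG_LOGS:
        print(msg, file=sys.stderr, flush=True)

# Usage, mirroring one of the commented-out calls above:
# log(f" [SUMMARY] task={task_name} avg_score={avg_score:.3f} pass_rate={pass_rate:.2f}")
```

With that in place, a plain run prints nothing extra, while `WDIF_DEBUG=1 python inference.py` restores the full trace without editing the source again.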