samrat-rm committed on
Commit
87b840b
·
1 Parent(s): 26630c7

chore: restrict stdout to START/STEP/END for eval compliance

Browse files

Comment out JUDGE, RESULT, SUMMARY, DEBUG, WARN, INFO, OVERALL logs
so the hackathon evaluator only sees the expected line format.
Logs preserved for local use.

Files changed (1) hide show
  1. inference.py +11 -11
inference.py CHANGED
@@ -159,7 +159,7 @@ def _get_action(client: OpenAI, step: int, obs_summary: str, history: List[str])
159
  filtered = {k: v for k, v in data.items() if k in WhyDidItFailAction.model_fields}
160
  return WhyDidItFailAction(**filtered)
161
  except Exception as exc:
162
- print(f" [DEBUG] parse error: {exc}", file=sys.stderr, flush=True)
163
  if step <= 2:
164
  return WhyDidItFailAction(action_type="inspect_logs", diagnosis=None, suggested_fix=None,reasoning=None)
165
  return WhyDidItFailAction(action_type="submit_diagnosis", diagnosis="unknown", suggested_fix=None,reasoning=None)
@@ -184,7 +184,7 @@ async def run_episode(
184
  try:
185
  result = await env.reset(scenario_key=scenario_key)
186
  except ConnectionClosedError:
187
- print(f" [WARN] scenario={scenario_key} reconnecting WebSocket...", file=sys.stderr, flush=True)
188
  env = await _make_env()
189
  result = await env.reset(scenario_key=scenario_key)
190
 
@@ -245,10 +245,10 @@ async def run_episode(
245
  )
246
  if judge_score is None:
247
  score = round(keyword_score, 4)
248
- print(f" [JUDGE] scenario={scenario_key} keyword={keyword_score:.3f} reasoning=n/a total={score:.3f}", file=sys.stderr, flush=True)
249
  else:
250
  score = round(0.85 * keyword_score + 0.15 * judge_score, 4)
251
- print(f" [JUDGE] scenario={scenario_key} keyword={keyword_score:.3f} reasoning={judge_score:.3f} total={score:.3f}", file=sys.stderr, flush=True)
252
 
253
  success = score >= SUCCESS_THRESHOLD
254
 
@@ -262,7 +262,7 @@ async def run_episode(
262
 
263
  async def run_task(task_name: str, scenario_keys: List[str], env: WhyDidItFailEnv, client: OpenAI) -> List[float]:
264
  if not scenario_keys:
265
- print(f" [INFO] task={task_name} — no scenarios defined yet", flush=True)
266
  return []
267
 
268
  if USE_LOCAL:
@@ -278,11 +278,11 @@ async def run_task(task_name: str, scenario_keys: List[str], env: WhyDidItFailEn
278
  for key in scenario_keys:
279
  res, env = await run_episode(env, client, key, task_name, effective_model)
280
  results.append(res)
281
- print(f"[RESULT] scenario={res['scenario_key']} score={res['score']:.3f} steps={res['steps']} success={str(res['success']).lower()}", flush=True)
282
 
283
  avg_score = sum(r["score"] for r in results) / len(results)
284
  pass_rate = sum(1 for r in results if r["success"]) / len(results)
285
- print(f"[SUMMARY] task={task_name} avg_score={avg_score:.3f} pass_rate={pass_rate:.2f}", flush=True)
286
  return [r["score"] for r in results]
287
 
288
 
@@ -296,14 +296,14 @@ async def main() -> None:
296
  scores += await run_task("task_medium", MEDIUM_SCENARIOS, env, client)
297
  scores += await run_task("task_hard", HARD_SCENARIOS, env, client)
298
  overall = sum(scores) / len(scores) if scores else 0.0
299
- print(f" [OVERALL] avg_score={overall:.3f}", file=sys.stderr, flush=True)
300
- print(f"[END] score={overall:.3f}", flush=True)
301
  finally:
302
  try:
303
  await env.close()
304
  except Exception as e:
305
- print(f" [DEBUG] env.close() error: {e}", file=sys.stderr, flush=True)
306
-
307
 
308
  if __name__ == "__main__":
309
  asyncio.run(main())
 
159
  filtered = {k: v for k, v in data.items() if k in WhyDidItFailAction.model_fields}
160
  return WhyDidItFailAction(**filtered)
161
  except Exception as exc:
162
+ # print(f" [DEBUG] parse error: {exc}", file=sys.stderr, flush=True)
163
  if step <= 2:
164
  return WhyDidItFailAction(action_type="inspect_logs", diagnosis=None, suggested_fix=None,reasoning=None)
165
  return WhyDidItFailAction(action_type="submit_diagnosis", diagnosis="unknown", suggested_fix=None,reasoning=None)
 
184
  try:
185
  result = await env.reset(scenario_key=scenario_key)
186
  except ConnectionClosedError:
187
+ # print(f" [WARN] scenario={scenario_key} reconnecting WebSocket...", file=sys.stderr, flush=True)
188
  env = await _make_env()
189
  result = await env.reset(scenario_key=scenario_key)
190
 
 
245
  )
246
  if judge_score is None:
247
  score = round(keyword_score, 4)
248
+ # print(f" [JUDGE] scenario={scenario_key} keyword={keyword_score:.3f} reasoning=n/a total={score:.3f}", file=sys.stderr, flush=True)
249
  else:
250
  score = round(0.85 * keyword_score + 0.15 * judge_score, 4)
251
+ # print(f" [JUDGE] scenario={scenario_key} keyword={keyword_score:.3f} reasoning={judge_score:.3f} total={score:.3f}", file=sys.stderr, flush=True)
252
 
253
  success = score >= SUCCESS_THRESHOLD
254
 
 
262
 
263
  async def run_task(task_name: str, scenario_keys: List[str], env: WhyDidItFailEnv, client: OpenAI) -> List[float]:
264
  if not scenario_keys:
265
+ # print(f" [INFO] task={task_name} — no scenarios defined yet", flush=True)
266
  return []
267
 
268
  if USE_LOCAL:
 
278
  for key in scenario_keys:
279
  res, env = await run_episode(env, client, key, task_name, effective_model)
280
  results.append(res)
281
+ # print(f"[RESULT] scenario={res['scenario_key']} score={res['score']:.3f} steps={res['steps']} success={str(res['success']).lower()}", flush=True)
282
 
283
  avg_score = sum(r["score"] for r in results) / len(results)
284
  pass_rate = sum(1 for r in results if r["success"]) / len(results)
285
+ # print(f"[SUMMARY] task={task_name} avg_score={avg_score:.3f} pass_rate={pass_rate:.2f}", flush=True)
286
  return [r["score"] for r in results]
287
 
288
 
 
296
  scores += await run_task("task_medium", MEDIUM_SCENARIOS, env, client)
297
  scores += await run_task("task_hard", HARD_SCENARIOS, env, client)
298
  overall = sum(scores) / len(scores) if scores else 0.0
299
+ # print(f" [OVERALL] avg_score={overall:.3f}", file=sys.stderr, flush=True)
300
+ # print(f"[END] score={overall:.3f}", flush=True)
301
  finally:
302
  try:
303
  await env.close()
304
  except Exception as e:
305
+ # print(f" [DEBUG] env.close() error: {e}", file=sys.stderr, flush=True)
306
+ pass
307
 
308
  if __name__ == "__main__":
309
  asyncio.run(main())