vedkdev committed on
Commit
2d4a907
·
verified ·
1 Parent(s): de8fc0f

Deploy FlakyGym UI + inference updates (minimal upload)

Browse files
Files changed (2) hide show
  1. inference.py +14 -2
  2. inference_debug.py +19 -0
inference.py CHANGED
@@ -115,6 +115,14 @@ def _to_single_line(text: str) -> str:
115
  return " ".join(str(text).split())
116
 
117
 
 
 
 
 
 
 
 
 
118
  def _compliance_log_start(task: str, benchmark: str, model: str) -> None:
119
  print(f"[START] task={task} env={benchmark} model={model}", flush=True)
120
 
@@ -529,10 +537,14 @@ def run_episode(
529
  if not step_error and obs.tool_output and str(obs.tool_output).startswith("ERROR:"):
530
  step_error = str(obs.tool_output)
531
  if step_fallback_reason:
 
 
 
 
532
  if step_error:
533
- step_error = f"{step_error}; llm_fallback:{step_fallback_reason}"
534
  else:
535
- step_error = f"llm_fallback:{step_fallback_reason}"
536
 
537
  if compliance_stdout:
538
  _compliance_log_step(
 
115
  return " ".join(str(text).split())
116
 
117
 
118
def _short_error(text: str, max_chars: int = 220) -> str:
    """Return *text* normalised by ``_to_single_line`` and capped in length.

    When the normalised string is no longer than *max_chars* it is returned
    as-is. Otherwise only its first *max_chars* characters are kept and a
    ``...[truncated N chars]`` marker is appended, where N is the number of
    characters that were cut off.
    """
    flattened = _to_single_line(text)
    overflow = len(flattened) - max_chars
    if overflow > 0:
        # Keep the head of the message and report how much was dropped.
        head = flattened[:max_chars]
        return f"{head}...[truncated {overflow} chars]"
    return flattened
124
+
125
+
126
  def _compliance_log_start(task: str, benchmark: str, model: str) -> None:
127
  print(f"[START] task={task} env={benchmark} model={model}", flush=True)
128
 
 
537
  if not step_error and obs.tool_output and str(obs.tool_output).startswith("ERROR:"):
538
  step_error = str(obs.tool_output)
539
  if step_fallback_reason:
540
+ fallback_detail = ""
541
+ if llm_meta.get("error"):
542
+ fallback_detail = f" detail={_short_error(str(llm_meta['error']))}"
543
+ fallback_suffix = f"llm_fallback:{step_fallback_reason}{fallback_detail}"
544
  if step_error:
545
+ step_error = f"{step_error}; {fallback_suffix}"
546
  else:
547
+ step_error = fallback_suffix
548
 
549
  if compliance_stdout:
550
  _compliance_log_step(
inference_debug.py CHANGED
@@ -113,6 +113,14 @@ def _to_single_line(text: str) -> str:
113
  return " ".join(str(text).split())
114
 
115
 
 
 
 
 
 
 
 
 
116
  def _compliance_log_start(task: str, benchmark: str, model: str) -> None:
117
  print(f"[START] task={task} env={benchmark} model={model}", flush=True)
118
 
@@ -424,6 +432,7 @@ def run_episode(
424
  action: FlakySleuthAction
425
  action_source = "heuristic"
426
  llm_meta: dict[str, Any] = {"attempted": False, "raw_output": "", "error": ""}
 
427
  try:
428
  candidate, llm_meta = llm_action(messages)
429
  if candidate is not None:
@@ -467,6 +476,7 @@ def run_episode(
467
  reason_key = "empty_or_invalid_response"
468
  else:
469
  reason_key = "heuristic_default"
 
470
  fallback_reasons[reason_key] = fallback_reasons.get(reason_key, 0) + 1
471
 
472
  if trace_agent and not compliance_stdout:
@@ -519,6 +529,15 @@ def run_episode(
519
  step_error = str(raw_err)
520
  if not step_error and obs.tool_output and str(obs.tool_output).startswith("ERROR:"):
521
  step_error = str(obs.tool_output)
 
 
 
 
 
 
 
 
 
522
 
523
  if compliance_stdout:
524
  _compliance_log_step(
 
113
  return " ".join(str(text).split())
114
 
115
 
116
def _short_error(text: str, max_chars: int = 220) -> str:
    """Return *text* normalised by ``_to_single_line`` and capped in length.

    When the normalised string is no longer than *max_chars* it is returned
    as-is. Otherwise only its first *max_chars* characters are kept and a
    ``...[truncated N chars]`` marker is appended, where N is the number of
    characters that were cut off.
    """
    flattened = _to_single_line(text)
    overflow = len(flattened) - max_chars
    if overflow > 0:
        # Keep the head of the message and report how much was dropped.
        head = flattened[:max_chars]
        return f"{head}...[truncated {overflow} chars]"
    return flattened
122
+
123
+
124
  def _compliance_log_start(task: str, benchmark: str, model: str) -> None:
125
  print(f"[START] task={task} env={benchmark} model={model}", flush=True)
126
 
 
432
  action: FlakySleuthAction
433
  action_source = "heuristic"
434
  llm_meta: dict[str, Any] = {"attempted": False, "raw_output": "", "error": ""}
435
+ step_fallback_reason: str | None = None
436
  try:
437
  candidate, llm_meta = llm_action(messages)
438
  if candidate is not None:
 
476
  reason_key = "empty_or_invalid_response"
477
  else:
478
  reason_key = "heuristic_default"
479
+ step_fallback_reason = reason_key
480
  fallback_reasons[reason_key] = fallback_reasons.get(reason_key, 0) + 1
481
 
482
  if trace_agent and not compliance_stdout:
 
529
  step_error = str(raw_err)
530
  if not step_error and obs.tool_output and str(obs.tool_output).startswith("ERROR:"):
531
  step_error = str(obs.tool_output)
532
+ if step_fallback_reason:
533
+ fallback_detail = ""
534
+ if llm_meta.get("error"):
535
+ fallback_detail = f" detail={_short_error(str(llm_meta['error']))}"
536
+ fallback_suffix = f"llm_fallback:{step_fallback_reason}{fallback_detail}"
537
+ if step_error:
538
+ step_error = f"{step_error}; {fallback_suffix}"
539
+ else:
540
+ step_error = fallback_suffix
541
 
542
  if compliance_stdout:
543
  _compliance_log_step(