Spaces:
Sleeping
Sleeping
Deploy FlakyGym UI + inference updates (minimal upload)
Browse files
- inference.py +14 -2
- inference_debug.py +19 -0
inference.py
CHANGED
|
@@ -115,6 +115,14 @@ def _to_single_line(text: str) -> str:
|
|
| 115 |
return " ".join(str(text).split())
|
| 116 |
|
| 117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
def _compliance_log_start(task: str, benchmark: str, model: str) -> None:
|
| 119 |
print(f"[START] task={task} env={benchmark} model={model}", flush=True)
|
| 120 |
|
|
@@ -529,10 +537,14 @@ def run_episode(
|
|
| 529 |
if not step_error and obs.tool_output and str(obs.tool_output).startswith("ERROR:"):
|
| 530 |
step_error = str(obs.tool_output)
|
| 531 |
if step_fallback_reason:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 532 |
if step_error:
|
| 533 |
-
step_error = f"{step_error};
|
| 534 |
else:
|
| 535 |
-
step_error =
|
| 536 |
|
| 537 |
if compliance_stdout:
|
| 538 |
_compliance_log_step(
|
|
|
|
| 115 |
return " ".join(str(text).split())
|
| 116 |
|
| 117 |
|
| 118 |
+
def _short_error(text: str, max_chars: int = 220) -> str:
    """Return *text* flattened to one line and capped at *max_chars* characters.

    The text is first normalised with ``_to_single_line``. If the result fits
    within the limit it is returned unchanged; otherwise the first
    ``max_chars`` characters are kept and a ``...[truncated N chars]`` marker
    is appended, where ``N`` is the number of characters dropped. The marker
    is not counted against the limit, so a truncated result is longer than
    ``max_chars``.

    Args:
        text: Message to shorten.
        max_chars: Maximum number of characters of the flattened text to keep.
            Negative values are treated as ``0``.

    Returns:
        The flattened text, truncated and annotated if it exceeded the limit.
    """
    # A negative limit would make the slice below count from the end of the
    # string, keeping most of the text while reporting an inflated count.
    limit = max(max_chars, 0)
    one_line = _to_single_line(text)
    hidden = len(one_line) - limit
    if hidden <= 0:
        return one_line
    return f"{one_line[:limit]}...[truncated {hidden} chars]"
|
| 124 |
+
|
| 125 |
+
|
| 126 |
def _compliance_log_start(task: str, benchmark: str, model: str) -> None:
|
| 127 |
print(f"[START] task={task} env={benchmark} model={model}", flush=True)
|
| 128 |
|
|
|
|
| 537 |
if not step_error and obs.tool_output and str(obs.tool_output).startswith("ERROR:"):
|
| 538 |
step_error = str(obs.tool_output)
|
| 539 |
if step_fallback_reason:
|
| 540 |
+
fallback_detail = ""
|
| 541 |
+
if llm_meta.get("error"):
|
| 542 |
+
fallback_detail = f" detail={_short_error(str(llm_meta['error']))}"
|
| 543 |
+
fallback_suffix = f"llm_fallback:{step_fallback_reason}{fallback_detail}"
|
| 544 |
if step_error:
|
| 545 |
+
step_error = f"{step_error}; {fallback_suffix}"
|
| 546 |
else:
|
| 547 |
+
step_error = fallback_suffix
|
| 548 |
|
| 549 |
if compliance_stdout:
|
| 550 |
_compliance_log_step(
|
inference_debug.py
CHANGED
|
@@ -113,6 +113,14 @@ def _to_single_line(text: str) -> str:
|
|
| 113 |
return " ".join(str(text).split())
|
| 114 |
|
| 115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
def _compliance_log_start(task: str, benchmark: str, model: str) -> None:
|
| 117 |
print(f"[START] task={task} env={benchmark} model={model}", flush=True)
|
| 118 |
|
|
@@ -424,6 +432,7 @@ def run_episode(
|
|
| 424 |
action: FlakySleuthAction
|
| 425 |
action_source = "heuristic"
|
| 426 |
llm_meta: dict[str, Any] = {"attempted": False, "raw_output": "", "error": ""}
|
|
|
|
| 427 |
try:
|
| 428 |
candidate, llm_meta = llm_action(messages)
|
| 429 |
if candidate is not None:
|
|
@@ -467,6 +476,7 @@ def run_episode(
|
|
| 467 |
reason_key = "empty_or_invalid_response"
|
| 468 |
else:
|
| 469 |
reason_key = "heuristic_default"
|
|
|
|
| 470 |
fallback_reasons[reason_key] = fallback_reasons.get(reason_key, 0) + 1
|
| 471 |
|
| 472 |
if trace_agent and not compliance_stdout:
|
|
@@ -519,6 +529,15 @@ def run_episode(
|
|
| 519 |
step_error = str(raw_err)
|
| 520 |
if not step_error and obs.tool_output and str(obs.tool_output).startswith("ERROR:"):
|
| 521 |
step_error = str(obs.tool_output)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 522 |
|
| 523 |
if compliance_stdout:
|
| 524 |
_compliance_log_step(
|
|
|
|
| 113 |
return " ".join(str(text).split())
|
| 114 |
|
| 115 |
|
| 116 |
+
def _short_error(text: str, max_chars: int = 220) -> str:
    """Return *text* flattened to one line and capped at *max_chars* characters.

    The text is first normalised with ``_to_single_line``. If the result fits
    within the limit it is returned unchanged; otherwise the first
    ``max_chars`` characters are kept and a ``...[truncated N chars]`` marker
    is appended, where ``N`` is the number of characters dropped. The marker
    is not counted against the limit, so a truncated result is longer than
    ``max_chars``.

    Args:
        text: Message to shorten.
        max_chars: Maximum number of characters of the flattened text to keep.
            Negative values are treated as ``0``.

    Returns:
        The flattened text, truncated and annotated if it exceeded the limit.
    """
    # A negative limit would make the slice below count from the end of the
    # string, keeping most of the text while reporting an inflated count.
    limit = max(max_chars, 0)
    one_line = _to_single_line(text)
    hidden = len(one_line) - limit
    if hidden <= 0:
        return one_line
    return f"{one_line[:limit]}...[truncated {hidden} chars]"
|
| 122 |
+
|
| 123 |
+
|
| 124 |
def _compliance_log_start(task: str, benchmark: str, model: str) -> None:
|
| 125 |
print(f"[START] task={task} env={benchmark} model={model}", flush=True)
|
| 126 |
|
|
|
|
| 432 |
action: FlakySleuthAction
|
| 433 |
action_source = "heuristic"
|
| 434 |
llm_meta: dict[str, Any] = {"attempted": False, "raw_output": "", "error": ""}
|
| 435 |
+
step_fallback_reason: str | None = None
|
| 436 |
try:
|
| 437 |
candidate, llm_meta = llm_action(messages)
|
| 438 |
if candidate is not None:
|
|
|
|
| 476 |
reason_key = "empty_or_invalid_response"
|
| 477 |
else:
|
| 478 |
reason_key = "heuristic_default"
|
| 479 |
+
step_fallback_reason = reason_key
|
| 480 |
fallback_reasons[reason_key] = fallback_reasons.get(reason_key, 0) + 1
|
| 481 |
|
| 482 |
if trace_agent and not compliance_stdout:
|
|
|
|
| 529 |
step_error = str(raw_err)
|
| 530 |
if not step_error and obs.tool_output and str(obs.tool_output).startswith("ERROR:"):
|
| 531 |
step_error = str(obs.tool_output)
|
| 532 |
+
if step_fallback_reason:
|
| 533 |
+
fallback_detail = ""
|
| 534 |
+
if llm_meta.get("error"):
|
| 535 |
+
fallback_detail = f" detail={_short_error(str(llm_meta['error']))}"
|
| 536 |
+
fallback_suffix = f"llm_fallback:{step_fallback_reason}{fallback_detail}"
|
| 537 |
+
if step_error:
|
| 538 |
+
step_error = f"{step_error}; {fallback_suffix}"
|
| 539 |
+
else:
|
| 540 |
+
step_error = fallback_suffix
|
| 541 |
|
| 542 |
if compliance_stdout:
|
| 543 |
_compliance_log_step(
|