Spaces:

vedkdev
/

FlakyTestSleuthOpenEnvRL

Sleeping

App Files Community

vedkdev committed on Apr 8

Commit

2d4a907

verified ·

1 Parent(s): de8fc0f

Deploy FlakyGym UI + inference updates (minimal upload)

Browse files

Files changed (2) hide show

inference.py +14 -2
inference_debug.py +19 -0

inference.py CHANGED Viewed

@@ -115,6 +115,14 @@ def _to_single_line(text: str) -> str:
     return " ".join(str(text).split())
 def _compliance_log_start(task: str, benchmark: str, model: str) -> None:
     print(f"[START] task={task} env={benchmark} model={model}", flush=True)
@@ -529,10 +537,14 @@ def run_episode(
             if not step_error and obs.tool_output and str(obs.tool_output).startswith("ERROR:"):
                 step_error = str(obs.tool_output)
             if step_fallback_reason:
                 if step_error:
-                    step_error = f"{step_error}; llm_fallback:{step_fallback_reason}"
                 else:
-                    step_error = f"llm_fallback:{step_fallback_reason}"
             if compliance_stdout:
                 _compliance_log_step(

     return " ".join(str(text).split())
+def _short_error(text: str, max_chars: int = 220) -> str:
+    one_line = _to_single_line(text)
+    if len(one_line) <= max_chars:
+        return one_line
+    hidden = len(one_line) - max_chars
+    return f"{one_line[:max_chars]}...[truncated {hidden} chars]"
 def _compliance_log_start(task: str, benchmark: str, model: str) -> None:
     print(f"[START] task={task} env={benchmark} model={model}", flush=True)
             if not step_error and obs.tool_output and str(obs.tool_output).startswith("ERROR:"):
                 step_error = str(obs.tool_output)
             if step_fallback_reason:
+                fallback_detail = ""
+                if llm_meta.get("error"):
+                    fallback_detail = f" detail={_short_error(str(llm_meta['error']))}"
+                fallback_suffix = f"llm_fallback:{step_fallback_reason}{fallback_detail}"
                 if step_error:
+                    step_error = f"{step_error}; {fallback_suffix}"
                 else:
+                    step_error = fallback_suffix
             if compliance_stdout:
                 _compliance_log_step(

inference_debug.py CHANGED Viewed

@@ -113,6 +113,14 @@ def _to_single_line(text: str) -> str:
     return " ".join(str(text).split())
 def _compliance_log_start(task: str, benchmark: str, model: str) -> None:
     print(f"[START] task={task} env={benchmark} model={model}", flush=True)
@@ -424,6 +432,7 @@ def run_episode(
             action: FlakySleuthAction
             action_source = "heuristic"
             llm_meta: dict[str, Any] = {"attempted": False, "raw_output": "", "error": ""}
             try:
                 candidate, llm_meta = llm_action(messages)
                 if candidate is not None:
@@ -467,6 +476,7 @@ def run_episode(
                     reason_key = "empty_or_invalid_response"
                 else:
                     reason_key = "heuristic_default"
                 fallback_reasons[reason_key] = fallback_reasons.get(reason_key, 0) + 1
             if trace_agent and not compliance_stdout:
@@ -519,6 +529,15 @@ def run_episode(
                     step_error = str(raw_err)
             if not step_error and obs.tool_output and str(obs.tool_output).startswith("ERROR:"):
                 step_error = str(obs.tool_output)
             if compliance_stdout:
                 _compliance_log_step(

     return " ".join(str(text).split())
+def _short_error(text: str, max_chars: int = 220) -> str:
+    one_line = _to_single_line(text)
+    if len(one_line) <= max_chars:
+        return one_line
+    hidden = len(one_line) - max_chars
+    return f"{one_line[:max_chars]}...[truncated {hidden} chars]"
 def _compliance_log_start(task: str, benchmark: str, model: str) -> None:
     print(f"[START] task={task} env={benchmark} model={model}", flush=True)
             action: FlakySleuthAction
             action_source = "heuristic"
             llm_meta: dict[str, Any] = {"attempted": False, "raw_output": "", "error": ""}
+            step_fallback_reason: str | None = None
             try:
                 candidate, llm_meta = llm_action(messages)
                 if candidate is not None:
                     reason_key = "empty_or_invalid_response"
                 else:
                     reason_key = "heuristic_default"
+                step_fallback_reason = reason_key
                 fallback_reasons[reason_key] = fallback_reasons.get(reason_key, 0) + 1
             if trace_agent and not compliance_stdout:
                     step_error = str(raw_err)
             if not step_error and obs.tool_output and str(obs.tool_output).startswith("ERROR:"):
                 step_error = str(obs.tool_output)
+            if step_fallback_reason:
+                fallback_detail = ""
+                if llm_meta.get("error"):
+                    fallback_detail = f" detail={_short_error(str(llm_meta['error']))}"
+                fallback_suffix = f"llm_fallback:{step_fallback_reason}{fallback_detail}"
+                if step_error:
+                    step_error = f"{step_error}; {fallback_suffix}"
+                else:
+                    step_error = fallback_suffix
             if compliance_stdout:
                 _compliance_log_step(