.*)$"
)
# Name -> task record index built from ALL_TASKS. Records that are not dicts,
# or whose "name" is missing or falsy, are left out; keys are always strings.
TASK_MAP: dict[str, dict[str, Any]] = {
    str(entry["name"]): entry
    for entry in ALL_TASKS
    if isinstance(entry, dict) and entry.get("name")
}
# Stylesheet text for the Gradio UI. Its selectors match ids and classes that
# the layout code assigns through elem_id / elem_classes (#execute-btn,
# .panel, .code-panel, .token-alert) and the c-* class names paired with
# terminal log lines.
# NOTE(review): no reference to CSS is visible in this part of the file —
# confirm where the stylesheet is actually injected.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;900&family=JetBrains+Mono:wght@400;600;800&display=swap');
:root {
--bg-top: #0a0a0a;
--bg-bottom: #111111;
--panel: #0a0a0a;
--panel-border: rgba(255, 255, 255, 0.15);
--text-main: #f5f5f5;
--text-dim: #999;
--accent: #E60012;
}
.gradio-container {
font-family: 'Inter', sans-serif !important;
background: var(--bg-top);
color: var(--text-main);
padding: 0px !important;
}
#header-wrap {
margin-bottom: 2px;
border: 1px solid var(--panel-border);
background: #000;
border-radius: 0px;
padding: 8px 12px;
text-transform: uppercase;
}
#header-wrap h1 {
margin: 0;
letter-spacing: 1px;
font-weight: 700;
color: #fff;
font-size: 20px;
}
#header-wrap p {
margin: 2px 0 0;
color: var(--text-dim);
font-weight: 500;
font-size: 13px;
}
.panel {
border: 1px solid var(--panel-border);
border-radius: 0px !important;
background: var(--panel) !important;
overflow: hidden;
padding: 0px !important;
}
.panel-title {
padding: 6px 10px;
border-bottom: 1px solid var(--panel-border);
color: var(--text-dim);
font-size: 12px;
letter-spacing: 0.05em;
text-transform: uppercase;
font-weight: bold;
}
#execute-btn {
background: #2b2b2b !important;
color: #fff !important;
border-radius: 0px !important;
font-weight: 700 !important;
font-size: 16px !important;
text-transform: uppercase !important;
border: 2px solid #fff !important;
transition: all 0.2s ease !important;
height: 40px !important;
}
#execute-btn:hover {
background: #801a1a !important;
border-color: #ff4a4a !important;
}
#execute-btn-running {
background: #801a1a !important;
color: #fff !important;
border-radius: 0px !important;
font-weight: 700 !important;
font-size: 16px !important;
text-transform: uppercase !important;
border: 2px solid #ff4a4a !important;
height: 40px !important;
}
.code-panel * {
font-family: 'JetBrains Mono', monospace !important;
}
.terminal-wrap {
height: 45vh;
overflow-y: auto;
padding: 8px;
font-family: 'JetBrains Mono', monospace;
font-size: 13px;
line-height: 1.5;
background: #050505;
border: 1px solid var(--panel-border);
}
.term-line {
white-space: pre-wrap;
word-break: break-word;
}
/* Base Log Colors */
.c-start { color: #fff; font-weight: bold; }
.c-end { color: #fff; font-weight: bold; }
.c-step { color: #39ff14; font-weight: bold; }
.c-thought { color: #5b7a96; font-style: italic; }
.c-error { color: #ff4a4a; }
.c-muted { color: var(--text-dim); }
.metric {
background: #000;
padding: 4px;
}
@keyframes pulse-border {
0% { border-color: #ff4a4a; box-shadow: 0 0 10px #ff4a4a; }
50% { border-color: #2b2b2b; box-shadow: none; }
100% { border-color: #ff4a4a; box-shadow: 0 0 10px #ff4a4a; }
}
.token-alert > div > input {
animation: pulse-border 1.5s infinite;
}
"""
def _code_from_task_name(task_name: str) -> str:
    """Return the ``code`` lines of the named task joined with newlines.

    The name is stripped of surrounding whitespace before it is looked up in
    ``TASK_MAP``. A missing, empty, or unknown name yields a two-line
    placeholder comment instead of task code.
    """
    lookup_key = task_name.strip() if task_name else ""
    entry = TASK_MAP.get(lookup_key)
    if entry:
        return "\n".join(entry.get("code", []))
    return (
        "# Waiting for selection...\n"
        "# Tip: Select a target from the Task Selection Grid\n"
    )
def _normalize_base_url(base_url: str) -> str:
    """Return ``base_url`` as a scheme-qualified URL without trailing slashes.

    Args:
        base_url: Address as entered by the user. May be empty, ``None``,
            padded with whitespace, or missing its scheme.

    Returns:
        ``http://{BACKEND_HOST}:{BACKEND_PORT}`` when nothing usable was
        given. Otherwise the stripped input, prefixed with ``http://`` unless
        it already starts with an http/https scheme (in any letter case),
        with trailing ``/`` characters removed.
    """
    candidate = (base_url or "").strip()
    if not candidate:
        return f"http://{BACKEND_HOST}:{BACKEND_PORT}"
    # URI schemes are case-insensitive, so an input such as "HTTP://host"
    # already carries a scheme and must not be given a second one.
    if not candidate.lower().startswith(("http://", "https://")):
        candidate = f"http://{candidate}"
    return candidate.rstrip("/")
def load_code(task_name: str, env_base_url: str) -> str:
    """Return the code for ``task_name`` or a two-line failure notice.

    The lookup is delegated to ``_code_from_task_name``; when its result
    contains the placeholder marker text, the failure notice is returned
    instead. ``env_base_url`` is accepted but not used by this function.
    """
    source_text = _code_from_task_name(task_name)
    lookup_failed = "Waiting for selection" in source_text
    if lookup_failed:
        return (
            "# Unable to load code for the selected task.\n"
            "# Verify OpenEnv API is reachable."
        )
    return source_text
def _solution_from_task_name(task_name: str) -> str | None:
    """Return the ``solution`` lines of the named task joined with newlines.

    The name is stripped before the ``TASK_MAP`` lookup. ``None`` is returned
    when no task matches; a matching task without a ``solution`` entry yields
    an empty string.
    """
    lookup_key = task_name.strip() if task_name else ""
    entry = TASK_MAP.get(lookup_key)
    return "\n".join(entry.get("solution", [])) if entry else None
# Render (css_class, text) pairs as the HTML shown in the terminal panel.
# Each text is passed through html.escape before it is interpolated; when
# `lines` is empty an idle message is rendered instead.
# NOTE(review): the string literals below break across lines and never use
# css_class, which suggests their markup was lost — confirm against the
# original file before relying on this rendering.
def _terminal_html(lines: list[tuple[str, str]]) -> str:
rendered: list[str] = []
for css_class, text in lines:
# Escape the raw log text so it cannot inject markup into the panel.
safe = html.escape(text)
rendered.append(f"{safe}
")
# Fall back to the idle message when there is nothing to show yet.
content = "\n".join(rendered) if rendered else "Idle. Configure parameters and run agent.
"
return (
""
f"{content}"
"
"
""
)
# Build the badge content describing the active task and its difficulty.
# A falsy task name yields a waiting message. Otherwise the task name is
# HTML-escaped and the difficulty is capitalized (it is not escaped).
# NOTE(review): the first return literal breaks across two lines, which
# suggests surrounding markup was lost — confirm against the original file.
def _update_hud_badge(task_name: str, difficulty: str) -> str:
if not task_name:
return "Waiting for Task Selection...
"
return f"""
Active Task: {html.escape(task_name)} | Difficulty: {difficulty.capitalize()}
"""
# Build the result summary shown after a run: a SUCCESS/FAILED label, the
# score formatted to two decimals, the step count, and the HTML-escaped
# rewards text.
# NOTE(review): `color` is computed from `success` but does not appear in the
# template as shown here; presumably it was used by markup that is missing
# from this view — confirm against the original file.
def _large_metric_html(success: bool, score: float, steps: int, reward: str) -> str:
color = "#39ff14" if success else "#ff4a4a"
status_text = "SUCCESS" if success else "FAILED"
return f"""
{status_text}
Score: {score:.2f} | Steps: {steps}
Rewards: {html.escape(reward)}
"""
def _reader_thread(stream: Any, source: str, out_q: queue.Queue[tuple[str, str | None]]) -> None:
try:
for raw in iter(stream.readline, ""):
out_q.put((source, raw.rstrip("\n")))
finally:
try:
stream.close()
except Exception:
pass
out_q.put((source, None))
def _build_env(
    hf_token: str,
    api_base_url: str,
    model_name: str,
    env_base_url: str,
    task_name: str,
    max_steps: int,
    success_score_threshold: float,
) -> dict[str, str]:
    """Return a copy of ``os.environ`` overlaid with the run settings.

    Each setting is stored under its environment variable name after being
    stripped of surrounding whitespace. A setting that is empty after
    stripping removes that variable from the copy instead. ``env_base_url``
    goes through ``_normalize_base_url`` first; ``max_steps`` and
    ``success_score_threshold`` are converted with ``int`` / ``float`` and
    then to strings.
    """
    child_env = os.environ.copy()
    overrides = {
        "HF_TOKEN": hf_token,
        "API_BASE_URL": api_base_url,
        "MODEL_NAME": model_name,
        "ENV_BASE_URL": _normalize_base_url(env_base_url),
        "TASK_NAME": task_name,
        "MAX_STEPS": str(int(max_steps)),
        "SUCCESS_SCORE_THRESHOLD": str(float(success_score_threshold)),
    }
    for name, raw_value in overrides.items():
        text = (raw_value or "").strip()
        if text:
            child_env[name] = text
        else:
            # A blank setting must not leave an inherited value in place.
            child_env.pop(name, None)
    return child_env
# Radio-change handler that keeps a single task selected across the three
# difficulty grids.
# Returns a 7-tuple: the selected task name, updates for the easy / medium /
# hard radios, the HUD badge content, the code-panel title, and the task code.
# A falsy `selected` returns gr.skip() for all seven outputs (presumably so
# that clearing the other radios to None does not cascade — confirm).
# NOTE(review): the title_content literal breaks across two lines, which
# suggests its markup was lost — confirm against the original file.
def sync_tasks(selected, grid_name):
if not selected:
return (
gr.skip(),
gr.skip(),
gr.skip(),
gr.skip(),
gr.skip(),
gr.skip(),
gr.skip()
)
# Keep the value only in the grid that fired; the other two are cleared.
if grid_name == "easy":
easy_val = selected
med_val = None
hard_val = None
diff = "easy"
elif grid_name == "medium":
easy_val = None
med_val = selected
hard_val = None
diff = "medium"
else:
# Any grid name other than "easy" or "medium" is treated as "hard".
easy_val = None
med_val = None
hard_val = selected
diff = "hard"
code_content = _code_from_task_name(selected)
hud_content = _update_hud_badge(selected, diff)
title_content = "Target Source Code (Buggy)
"
return (
selected,
gr.update(value=easy_val),
gr.update(value=med_val),
gr.update(value=hard_val),
hud_content,
title_content,
code_content
)
def validate_and_start(token):
    """Check that a token was entered and prepare the UI for a run.

    Returns a 3-tuple: an update for the token field, an update for the run
    button, and a flag telling the next step whether to proceed. A missing or
    whitespace-only token tags the field with ``token-alert``, relabels the
    button with an error, and returns ``False``. Otherwise the alert class is
    cleared, the button is switched to its running state and disabled, and
    ``True`` is returned.
    """
    has_token = bool(token and token.strip())
    if has_token:
        return (
            gr.update(elem_classes=[]),
            gr.update(value="RUNNING...", elem_id="execute-btn-running", interactive=False),
            True,
        )
    return (
        gr.update(elem_classes=["token-alert"]),
        gr.update(value="ERROR: Token Required"),
        False,
    )
# Return the initial (terminal, result) panel contents for a new run: a
# single muted boot message rendered by _terminal_html, and a running notice.
# NOTE(review): the second literal breaks across two lines, which suggests
# its markup was lost — confirm against the original file.
def _reset_run_state():
return (
_terminal_html([("c-muted", "Boot sequence initialized...")]),
"Running...
"
)
# Generator that runs the inference script as a subprocess and streams UI
# updates while it executes.
#
# Every yield is a 5-tuple: (code view, terminal HTML, result HTML, run
# button update, code panel title). When `proceed` is false only the run
# button is restored and the generator ends. Otherwise the script at
# INFERENCE_PATH is launched with the environment from _build_env, its
# stdout and stderr are read on two daemon threads feeding one queue, and
# each displayed line triggers a yield with the re-rendered terminal.
#
# NOTE(review): several string literals in this function break across two
# lines, which suggests their markup was lost — confirm against the
# original file.
# NOTE(review): nothing in this function terminates the child process if
# the generator is closed before the process exits — confirm whether that
# is handled elsewhere.
def run_agent(
task_name: str,
hf_token: str,
api_base_url: str,
model_name: str,
env_base_url: str,
max_steps: int,
success_score_threshold: float,
show_thought: bool,
proceed: bool
) -> Generator[tuple[Any, str, str, dict, Any], None, None]:
# Validation failed upstream: re-enable the button and leave the rest alone.
if not proceed:
yield (gr.skip(), gr.skip(), gr.skip(), gr.update(value="INITIATE TRACE RESOLUTION", interactive=True), gr.skip())
return
terminal_lines: list[tuple[str, str]] = []
terminal_lines.append(("c-muted", "Agent initialized... infiltrating target."))
result_html = "Awaiting end...
"
yield gr.skip(), _terminal_html(terminal_lines), result_html, gr.update(), gr.skip()
# Child command; --thought is passed only when the thought trace is requested.
cmd = [sys.executable, str(INFERENCE_PATH)]
if show_thought:
cmd.append("--thought")
env = _build_env(
hf_token, api_base_url, model_name, env_base_url,
task_name, max_steps, success_score_threshold
)
process = subprocess.Popen(
cmd,
cwd=str(ROOT_DIR),
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
bufsize=1,
)
# One reader thread per pipe; each pushes (source, line) items and finally
# (source, None) once its stream is exhausted.
out_q: queue.Queue[tuple[str, str | None]] = queue.Queue()
stdout_thread = threading.Thread(target=_reader_thread, args=(process.stdout, "stdout", out_q), daemon=True)
stderr_thread = threading.Thread(target=_reader_thread, args=(process.stderr, "stderr", out_q), daemon=True)
stdout_thread.start()
stderr_thread.start()
ended_streams: set[str] = set()
thought_mode = False
final_success = False
final_solved_code = None
# Drain the queue until the process has exited and both pipes have
# reported end-of-stream.
while True:
try:
source, line = out_q.get(timeout=0.15)
except queue.Empty:
if process.poll() is not None and ended_streams == {"stdout", "stderr"}:
break
continue
if line is None:
ended_streams.add(source)
if process.poll() is not None and ended_streams == {"stdout", "stderr"}:
break
continue
# stderr: a line equal to "[THOUGHT]" opens thought mode and any other
# fully bracketed line closes it.
if source == "stderr":
if line.strip() == "[THOUGHT]":
thought_mode = True
if show_thought:
terminal_lines.append(("c-thought", "[THOUGHT]"))
elif line.startswith("[") and line.endswith("]"):
thought_mode = False
terminal_lines.append(("c-muted", line))
elif thought_mode:
if show_thought:
terminal_lines.append(("c-thought", line))
else:
# With the thought trace off, only tagged protocol lines are shown.
if not show_thought:
if not any(tag in line for tag in ["[START]", "[STEP]", "[END]"]):
continue
terminal_lines.append(("c-error", line))
else:
# stdout: apply the same tag filter, then classify the line with
# START_RE / STEP_RE / END_RE.
if not show_thought:
if not any(tag in line for tag in ["[START]", "[STEP]", "[END]"]):
continue
start_match = START_RE.match(line)
step_match = STEP_RE.match(line)
end_match = END_RE.match(line)
if start_match:
terminal_lines.append(("c-start", line))
elif step_match:
# A step whose error field is the literal text "null" is styled as success.
err = step_match.group("error")
css = "c-step" if err == "null" else "c-error"
terminal_lines.append((css, line))
elif end_match:
success = end_match.group("success") == "true"
final_steps = int(end_match.group("steps"))
score_value = float(end_match.group("score"))
rewards_raw = end_match.group("rewards").strip()
terminal_lines.append(("c-end", line))
result_html = _large_metric_html(success, score_value, final_steps, rewards_raw or 'none')
if success:
final_success = True
solved = _solution_from_task_name(task_name)
if solved:
final_solved_code = solved
else:
terminal_lines.append(("c-muted", line))
# Keep only the most recent 500 lines.
if len(terminal_lines) > 500:
terminal_lines = terminal_lines[-500:]
yield gr.skip(), _terminal_html(terminal_lines), result_html, gr.update(), gr.skip()
# A non-zero exit code replaces the result panel with a failure summary.
return_code = process.wait(timeout=2)
if return_code != 0:
terminal_lines.append(("c-error", f"Process exited with code {return_code}."))
result_html = _large_metric_html(False, 0.0, 0, f"Error code {return_code}")
if len(terminal_lines) > 500:
terminal_lines = terminal_lines[-500:]
# Swap in the stored solution only when an END_RE match reported success
# and a non-empty solution exists for the task.
code_update = gr.skip()
title_update = gr.skip()
if final_success and final_solved_code is not None:
code_update = final_solved_code
title_update = "Target Source Code (Resolved)
"
yield code_update, _terminal_html(terminal_lines), result_html, gr.update(value="INITIATE TRACE RESOLUTION", elem_id="execute-btn", interactive=True), title_update
# UI layout and event wiring for the TraceFix-RL demo.
# NOTE(review): several gr.HTML arguments below are empty or break across
# two lines, which suggests their markup was lost — confirm against the
# original file.
with gr.Blocks(title="TraceFix-RL") as demo:
gr.HTML(
f"""
"""
)
# Holds the name of the currently selected task across events.
selected_task_state = gr.State(value="")
# Use gr.Sidebar when this Gradio version provides it, otherwise a Column.
if hasattr(gr, "Sidebar"):
sidebar_context = gr.Sidebar()
else:
sidebar_context = gr.Column()
with sidebar_context:
gr.Markdown("### Authentication")
hf_token = gr.Textbox(label="HF Token", type="password", placeholder="hf_xxx", elem_classes=[])
# Defaults for the engine parameters come from environment variables.
with gr.Accordion("Engine Parameters", open=False):
model_name = gr.Textbox(label="Model Name", value=os.getenv("MODEL_NAME", "openai/gpt-oss-20b"))
api_base_url = gr.Textbox(label="API Base URL", value=os.getenv("API_BASE_URL", "https://router.huggingface.co/v1"))
env_base_url = gr.Textbox(label="Env Base URL", value=os.getenv("ENV_BASE_URL", f"http://127.0.0.1:{BACKEND_PORT}"))
max_steps = gr.Number(label="Max Steps", value=int(os.getenv("MAX_STEPS", "50")), precision=0)
success_score_threshold = gr.Number(
label="Success Score Threshold",
value=float(os.getenv("SUCCESS_SCORE_THRESHOLD", "0.98")),
precision=2,
)
show_thought = gr.Checkbox(label="Stream Thought Trace", value=False)
gr.HTML("Task Selection Grid
")
# One radio group per difficulty tier.
with gr.Row(elem_classes=["panel"]):
easy_radio = gr.Radio(choices=EASY_CHOICES, label="Easy Targets", elem_id="easy-radio")
medium_radio = gr.Radio(choices=MEDIUM_CHOICES, label="Medium Targets", elem_id="medium-radio")
hard_radio = gr.Radio(choices=HARD_CHOICES, label="Hard Targets", elem_id="hard-radio")
hud_badge = gr.HTML(_update_hud_badge("", ""))
run_button = gr.Button("INITIATE TRACE RESOLUTION", elem_id="execute-btn", variant="primary")
with gr.Row(equal_height=True):
with gr.Column(scale=1, elem_classes=["panel", "code-panel"]):
code_panel_title = gr.HTML("Target Source Code (Buggy)
")
code_view = gr.Code(
language="python",
interactive=False,
value=_code_from_task_name(""),
)
# Override height via CSS
gr.HTML("")
with gr.Column(scale=1, elem_classes=["panel"]):
gr.HTML("Terminal Trace
")
terminal = gr.HTML(_terminal_html([]))
with gr.Row(elem_classes=["panel"]):
result_block = gr.HTML("Awaiting Execution
")
# Each radio group forwards its selection to sync_tasks together with its
# own grid name; all three share the same seven outputs.
easy_radio.change(lambda x: sync_tasks(x, "easy"), inputs=[easy_radio], outputs=[selected_task_state, easy_radio, medium_radio, hard_radio, hud_badge, code_panel_title, code_view])
medium_radio.change(lambda x: sync_tasks(x, "medium"), inputs=[medium_radio], outputs=[selected_task_state, easy_radio, medium_radio, hard_radio, hud_badge, code_panel_title, code_view])
hard_radio.change(lambda x: sync_tasks(x, "hard"), inputs=[hard_radio], outputs=[selected_task_state, easy_radio, medium_radio, hard_radio, hud_badge, code_panel_title, code_view])
# Run Sequence
# Click chain: validate the token, reset the panels, then stream the agent
# run. run_state receives validate_and_start's third return value and is
# passed to run_agent as `proceed`.
run_state = gr.State(value=True)
validate_step = run_button.click(
validate_and_start,
inputs=[hf_token],
outputs=[hf_token, run_button, run_state],
queue=False
)
# NOTE(review): _reset_run_state takes no inputs, so it does not see
# run_state; it appears to reset the panels even when validation returned
# False — confirm this is intended.
reset_step = validate_step.then(
_reset_run_state,
inputs=[],
outputs=[terminal, result_block],
queue=False,
)
reset_step.then(
run_agent,
inputs=[
selected_task_state,
hf_token,
api_base_url,
model_name,
env_base_url,
max_steps,
success_score_threshold,
show_thought,
run_state
],
outputs=[code_view, terminal, result_block, run_button, code_panel_title],
)