vedkdev committed on
Commit
4a1d6d9
·
verified ·
1 Parent(s): 4c6255d

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. README.md +7 -4
  2. server/app.py +48 -7
  3. server/inference_runner.py +282 -0
  4. server/ui.py +521 -0
README.md CHANGED
@@ -68,6 +68,11 @@ export MODEL_NAME=qwen/qwen3.6-plus:free
68
  python inference.py --dataset-path dataset/py_tasks.csv --episodes-per-task 5
69
  ```
70
 
 
 
 
 
 
71
  ### `inference.py` flags
72
 
73
  | Flag | Type | Default | Description |
@@ -75,10 +80,8 @@ python inference.py --dataset-path dataset/py_tasks.csv --episodes-per-task 5
75
  | `--dataset-path` | `str` | `dataset/py_tasks.csv` | Processed task CSV used by env |
76
  | `--episodes-per-task` | `int` | `5` | Episodes per selected task type |
77
  | `--task-types` | `str` | `classify,root_cause,fix_proposal` | Comma-separated task types |
78
- | `--no-progress` | bool | `False` | Disable progress bars |
79
- | `--trace-agent` | bool | `False` | Print model output, action/tool call, and step results |
80
- | `--trace-prompts` | bool | `False` | Also print prompts sent to the model |
81
- | `--trace-max-chars` | `int` | `2500` | Max chars per traced block |
82
 
83
  Trace to log:
84
  ```bash
 
68
  python inference.py --dataset-path dataset/py_tasks.csv --episodes-per-task 5
69
  ```
70
 
71
+ ### Run Inference From Space UI
72
+
73
+ When deployed, the Space homepage serves a UI at `/` (also `/web`) that starts
74
+ `inference.py` in the background and streams logs live.
75
+
76
  ### `inference.py` flags
77
 
78
  | Flag | Type | Default | Description |
 
80
  | `--dataset-path` | `str` | `dataset/py_tasks.csv` | Processed task CSV used by env |
81
  | `--episodes-per-task` | `int` | `5` | Episodes per selected task type |
82
  | `--task-types` | `str` | `classify,root_cause,fix_proposal` | Comma-separated task types |
83
+ | `--max-steps` | `int` | `20` | Max steps per episode |
84
+ | `--benchmark-name` | `str` | `flakysleuth` | Label printed in `[START]` logs |
 
 
85
 
86
  Trace to log:
87
  ```bash
server/app.py CHANGED
@@ -1,16 +1,20 @@
1
  from __future__ import annotations
2
 
 
3
  from typing import Any
4
 
5
- from fastapi import Body, FastAPI, HTTPException
6
- from fastapi.responses import RedirectResponse
7
- from pydantic import BaseModel, ValidationError
8
 
9
  from env.environment import FlakySleuthEnv
10
  from env.models import FlakySleuthAction, FlakySleuthObservation
 
 
11
 
12
  app = FastAPI(title="FlakySleuth Environment")
13
  env = FlakySleuthEnv()
 
14
 
15
 
16
  class FlakySleuthState(BaseModel):
@@ -22,6 +26,17 @@ class FlakySleuthState(BaseModel):
22
  cumulative_progress: float
23
 
24
 
 
 
 
 
 
 
 
 
 
 
 
25
  @app.post("/reset")
26
  def reset() -> dict[str, Any]:
27
  observation = env.reset()
@@ -74,13 +89,39 @@ def health() -> dict[str, str]:
74
 
75
 
76
  @app.get("/", include_in_schema=False)
77
- def root() -> RedirectResponse:
78
- return RedirectResponse(url="/docs", status_code=307)
79
 
80
 
81
  @app.get("/web", include_in_schema=False)
82
- def web() -> RedirectResponse:
83
- return RedirectResponse(url="/docs", status_code=307)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
 
86
  @app.get("/metadata")
 
1
  from __future__ import annotations
2
 
3
+ from pathlib import Path
4
  from typing import Any
5
 
6
+ from fastapi import Body, FastAPI, HTTPException, Query
7
+ from fastapi.responses import HTMLResponse
8
+ from pydantic import BaseModel, Field, ValidationError
9
 
10
  from env.environment import FlakySleuthEnv
11
  from env.models import FlakySleuthAction, FlakySleuthObservation
12
+ from server.inference_runner import InferenceRunner
13
+ from server.ui import render_home_page
14
 
15
# ASGI application object; the route decorators in this module attach to it.
app = FastAPI(title="FlakySleuth Environment")
# One environment instance created at import time and used by the handlers below.
env = FlakySleuthEnv()
# The runner is rooted two directory levels above this file; InferenceRunner
# treats that directory as the repository root and runs inference.py from it.
inference_runner = InferenceRunner(Path(__file__).resolve().parent.parent)
18
 
19
 
20
  class FlakySleuthState(BaseModel):
 
26
  cumulative_progress: float
27
 
28
 
29
class InferenceRunRequest(BaseModel):
    """Request body accepted by ``POST /web/inference/start``.

    The numeric bounds declared here mirror the range checks that
    ``InferenceRunner.start`` performs on the same fields.
    """

    # Dataset CSV handed to inference.py via --dataset-path.
    dataset_path: str = Field(default="dataset/py_tasks.csv")
    # Episodes per task type; restricted to 1..50.
    episodes_per_task: int = Field(default=1, ge=1, le=50)
    # Comma-separated task types forwarded via --task-types.
    task_types: str = Field(default="classify,root_cause,fix_proposal")
    # Step budget per episode; restricted to 1..100.
    max_steps: int = Field(default=20, ge=1, le=100)
    # Label forwarded via --benchmark-name.
    benchmark_name: str = Field(default="flakysleuth")
    # Optional overrides; when omitted the subprocess keeps the server's
    # own API_BASE_URL / MODEL_NAME / API_KEY environment values.
    api_base_url: str | None = None
    model_name: str | None = None
    api_key: str | None = None
38
+
39
+
40
  @app.post("/reset")
41
  def reset() -> dict[str, Any]:
42
  observation = env.reset()
 
89
 
90
 
91
@app.get("/", include_in_schema=False)
def root() -> HTMLResponse:
    """Serve the inference UI page at the site root."""
    page = render_home_page()
    return HTMLResponse(page)
94
 
95
 
96
@app.get("/web", include_in_schema=False)
def web() -> HTMLResponse:
    """Serve the same inference UI page under ``/web``."""
    page = render_home_page()
    return HTMLResponse(page)
99
+
100
+
101
@app.post("/web/inference/start", include_in_schema=False)
def start_inference(payload: InferenceRunRequest) -> dict[str, Any]:
    """Start a background inference run and return its initial snapshot.

    Errors raised by the runner are mapped to HTTP statuses:
    ``FileNotFoundError`` -> 404, ``ValueError`` -> 422,
    ``RuntimeError`` -> 409. The exception text becomes the response detail.
    """
    run_config = payload.model_dump()
    try:
        return inference_runner.start(run_config)
    except (FileNotFoundError, ValueError, RuntimeError) as exc:
        # The three types are unrelated in the exception hierarchy, so the
        # order of these checks does not affect which status is chosen.
        if isinstance(exc, FileNotFoundError):
            status_code = 404
        elif isinstance(exc, ValueError):
            status_code = 422
        else:
            status_code = 409
        raise HTTPException(status_code=status_code, detail=str(exc)) from exc
112
+
113
+
114
@app.get("/web/inference/status", include_in_schema=False)
def inference_status(tail: int = Query(default=450, ge=20, le=2000)) -> dict[str, Any]:
    """Return the runner's current snapshot.

    ``tail`` is the number of most recent log lines to include (20..2000).
    """
    current = inference_runner.snapshot(tail=tail)
    return current
117
+
118
+
119
@app.post("/web/inference/stop", include_in_schema=False)
def stop_inference() -> dict[str, Any]:
    """Ask the runner to stop and return the resulting snapshot.

    The response is the regular status snapshot (last 450 log lines) with an
    extra ``stopped`` flag carrying the value returned by the runner's
    ``stop()`` call.
    """
    was_stopped = inference_runner.stop()
    return {**inference_runner.snapshot(tail=450), "stopped": was_stopped}
125
 
126
 
127
  @app.get("/metadata")
server/inference_runner.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import subprocess
5
+ import sys
6
+ import threading
7
+ import time
8
+ import uuid
9
+ from dataclasses import dataclass, field
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+
14
@dataclass
class InferenceJob:
    """Mutable record describing one background run of ``inference.py``."""

    # Short identifier assigned when the run is created.
    job_id: str
    # Lifecycle state; the runner sets "starting", "running", "completed",
    # "failed" or "stopped".
    status: str
    # Creation time as a Unix timestamp (time.time()).
    started_at: float
    # Exact argv used to launch the subprocess.
    command: list[str]
    # Effective run settings reported back to the UI.
    config: dict[str, Any]
    # Captured output lines, oldest first.
    logs: list[str] = field(default_factory=list)
    # Exit status of the subprocess; None until it has exited.
    return_code: int | None = None
    # Completion time as a Unix timestamp; None while the run is active.
    finished_at: float | None = None
    # Text of the exception that aborted the runner thread, if any.
    error: str | None = None
    # Set once a stop has been requested for this run.
    stop_requested: bool = False
    # Parsed "[END]" summaries collected from the output.
    summaries: list[dict[str, Any]] = field(default_factory=list)
27
+
28
+
29
class InferenceRunner:
    """Run ``inference.py`` in the background and expose live status.

    At most one run is active at a time. A daemon thread launches the
    subprocess, copies its combined stdout/stderr into the job's log buffer
    and records the final state. All access to the shared job record and the
    process handle is guarded by ``self._lock``.
    """

    # States in which a run is considered in progress.
    _ACTIVE_STATES = frozenset({"starting", "running"})
    # Maximum number of log lines retained per job; oldest lines go first.
    _MAX_LOG_LINES = 3000
    # Seconds to wait for the child after terminate() and again after kill().
    _STOP_TIMEOUT_SECONDS = 8

    def __init__(self, repo_root: Path):
        """Create an idle runner that executes ``inference.py`` in *repo_root*."""
        self._repo_root = repo_root.resolve()
        # Plain (non-reentrant) lock: helpers that take it must never be
        # called while it is already held by the same thread.
        self._lock = threading.Lock()
        self._job: InferenceJob | None = None
        self._proc: subprocess.Popen[str] | None = None

    def start(self, payload: dict[str, Any]) -> dict[str, Any]:
        """Validate *payload*, launch a run and return its first snapshot.

        Raises:
            RuntimeError: a run is already starting or running.
            ValueError: a field is empty or out of range, or the dataset path
                is outside the repository or not a regular file.
            FileNotFoundError: the dataset file does not exist.
        """
        # Fail fast so a busy runner reports "in progress" even when the
        # payload would also fail validation.
        with self._lock:
            self._ensure_idle_locked()

        dataset_rel = str(payload.get("dataset_path", "dataset/py_tasks.csv")).strip()
        episodes = int(payload.get("episodes_per_task", 1))
        max_steps = int(payload.get("max_steps", 20))
        task_types = str(payload.get("task_types", "classify,root_cause,fix_proposal")).strip()
        benchmark_name = str(payload.get("benchmark_name", "flakysleuth")).strip()

        if not dataset_rel:
            raise ValueError("dataset_path must not be empty.")
        if episodes < 1 or episodes > 50:
            raise ValueError("episodes_per_task must be between 1 and 50.")
        if max_steps < 1 or max_steps > 100:
            raise ValueError("max_steps must be between 1 and 100.")
        if not task_types:
            raise ValueError("task_types must not be empty.")
        if not benchmark_name:
            raise ValueError("benchmark_name must not be empty.")

        dataset_path = self._resolve_dataset_path(dataset_rel)
        dataset_arg = os.path.relpath(dataset_path, self._repo_root)
        # argv list (no shell), so user-supplied values are never interpreted
        # by a shell.
        command = [
            sys.executable,
            "inference.py",
            "--dataset-path",
            dataset_arg,
            "--episodes-per-task",
            str(episodes),
            "--task-types",
            task_types,
            "--max-steps",
            str(max_steps),
            "--benchmark-name",
            benchmark_name,
        ]

        job_id = uuid.uuid4().hex[:12]
        job = InferenceJob(
            job_id=job_id,
            status="starting",
            started_at=time.time(),
            command=command,
            config={
                "dataset_path": dataset_arg,
                "episodes_per_task": episodes,
                "task_types": task_types,
                "max_steps": max_steps,
                "benchmark_name": benchmark_name,
                "api_base_url": _clean_optional_text(payload.get("api_base_url")),
                "model_name": _clean_optional_text(payload.get("model_name")),
                # Only record whether a key was supplied, never the key itself.
                "api_key_provided": bool(_clean_optional_text(payload.get("api_key"))),
            },
            # The job is not visible to other threads yet, so the first log
            # lines can be stored without taking the lock.
            logs=[
                f"[UI] Starting run {job_id}",
                f"[UI] Command: {' '.join(command)}",
            ],
        )

        # Re-check and publish in one critical section: two concurrent
        # start() calls must not both register a job.
        with self._lock:
            self._ensure_idle_locked()
            self._job = job

        worker = threading.Thread(
            target=self._run_job,
            args=(job, payload),
            daemon=True,
        )
        worker.start()
        return self.snapshot(tail=300)

    def stop(self) -> bool:
        """Request termination of the active run.

        Returns ``True`` when an active run was asked to stop and ``False``
        when there is nothing to stop. If the subprocess has not been spawned
        yet, the request is recorded and honoured by the worker thread.
        """
        with self._lock:
            job = self._job
            proc = self._proc
            if not job or job.status not in self._ACTIVE_STATES:
                return False
            job.stop_requested = True

        if proc is None:
            # Worker has not spawned the child yet; it checks the flag both
            # before and right after spawning.
            return True

        if proc.poll() is None:
            proc.terminate()
            try:
                proc.wait(timeout=self._STOP_TIMEOUT_SECONDS)
            except subprocess.TimeoutExpired:
                # The child ignored the polite request; force it.
                proc.kill()
                proc.wait(timeout=self._STOP_TIMEOUT_SECONDS)
        return True

    def snapshot(self, tail: int = 300) -> dict[str, Any]:
        """Return a JSON-friendly view of the current job.

        *tail* is the number of most recent log lines to include; it is
        clamped to the range 20..2000. Mutable containers are copied so the
        caller can serialise the result after the lock has been released.
        """
        with self._lock:
            if self._job is None:
                return {
                    "has_job": False,
                    "status": "idle",
                    "logs": [],
                }

            job = self._job
            logs_tail = job.logs[-max(20, min(tail, 2000)) :]
            return {
                "has_job": True,
                "job_id": job.job_id,
                "status": job.status,
                "started_at": job.started_at,
                "finished_at": job.finished_at,
                "return_code": job.return_code,
                "error": job.error,
                "config": dict(job.config),
                "command": list(job.command),
                "summaries": list(job.summaries),
                "logs": logs_tail,
            }

    def _ensure_idle_locked(self) -> None:
        """Raise ``RuntimeError`` if a run is active. Caller must hold the lock."""
        if self._job and self._job.status in self._ACTIVE_STATES:
            raise RuntimeError("An inference run is already in progress.")

    def _run_job(self, job: InferenceJob, payload: dict[str, Any]) -> None:
        """Worker-thread body: spawn the subprocess and track it to completion."""
        env = os.environ.copy()
        overrides = {
            "API_KEY": payload.get("api_key"),
            "API_BASE_URL": payload.get("api_base_url"),
            "MODEL_NAME": payload.get("model_name"),
        }
        for name, raw in overrides.items():
            value = _clean_optional_text(raw)
            if value:
                env[name] = value

        with self._lock:
            cancelled = job.stop_requested
            if cancelled:
                job.status = "stopped"
                job.finished_at = time.time()
            else:
                job.status = "running"
        if cancelled:
            # stop() arrived before anything was spawned.
            self._append_log(job, "[UI] Run stopped by user request.")
            return

        process: subprocess.Popen[str] | None = None
        try:
            process = subprocess.Popen(
                job.command,
                cwd=self._repo_root,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                bufsize=1,
                env=env,
            )
            with self._lock:
                self._proc = process
                cancelled = job.stop_requested
            if cancelled:
                # stop() ran after the first check but before the handle was
                # published, so it could not terminate the child itself.
                process.terminate()

            if process.stdout is None:
                raise RuntimeError("Subprocess stdout pipe is unavailable.")
            for raw_line in process.stdout:
                line = raw_line.rstrip("\n")
                if not line:
                    continue
                self._append_log(job, line)
                summary = _parse_end_line(line)
                if summary:
                    with self._lock:
                        job.summaries.append(summary)

            return_code = process.wait()
            extra_log: str | None = None
            with self._lock:
                job.return_code = return_code
                job.finished_at = time.time()
                if job.stop_requested:
                    job.status = "stopped"
                    extra_log = "[UI] Run stopped by user request."
                elif return_code == 0:
                    job.status = "completed"
                else:
                    job.status = "failed"
                    extra_log = f"[UI] Process exited with code {return_code}."
                self._proc = None
            if extra_log:
                self._append_log(job, extra_log)
        except Exception as exc:
            # Top-level boundary of the worker thread: record the failure on
            # the job instead of letting the thread die silently.
            extra_log = f"[UI] Runner failed: {exc}"
            with self._lock:
                job.error = str(exc)
                job.finished_at = time.time()
                job.status = "failed"
                self._proc = None
            self._append_log(job, extra_log)
        finally:
            if process is not None:
                if process.poll() is None:
                    # Only reachable on the error path: do not leave an
                    # orphaned child running after the job is marked failed.
                    process.kill()
                    process.wait()
                if process.stdout:
                    process.stdout.close()

    def _append_log(self, job: InferenceJob, line: str) -> None:
        """Append *line* to the job log, discarding the oldest lines beyond the cap."""
        with self._lock:
            job.logs.append(line)
            overflow = len(job.logs) - self._MAX_LOG_LINES
            if overflow > 0:
                del job.logs[:overflow]

    def _resolve_dataset_path(self, dataset_path: str) -> Path:
        """Resolve *dataset_path* to an existing file inside the repository.

        Relative paths are interpreted against the repository root.

        Raises:
            ValueError: the path is outside the repository or is not a file.
            FileNotFoundError: the path does not exist.
        """
        candidate = Path(dataset_path)
        if not candidate.is_absolute():
            candidate = self._repo_root / candidate
        candidate = candidate.resolve()

        # Keep data access bounded to the repository.
        root = str(self._repo_root)
        try:
            inside = os.path.commonpath([root, str(candidate)]) == root
        except ValueError:
            # commonpath() raises when the paths cannot be compared
            # (for example different drives on Windows).
            inside = False
        if not inside:
            raise ValueError("dataset_path must point to a file inside the repository.")
        if not candidate.exists():
            raise FileNotFoundError(f"Dataset file not found: {dataset_path}")
        if not candidate.is_file():
            raise ValueError(f"dataset_path is not a file: {dataset_path}")
        return candidate
238
+
239
+
240
+ def _clean_optional_text(value: Any) -> str | None:
241
+ if value is None:
242
+ return None
243
+ text = str(value).strip()
244
+ return text or None
245
+
246
+
247
+ def _parse_end_line(line: str) -> dict[str, Any] | None:
248
+ # Example:
249
+ # [END] success=true steps=3 score=1.00 rewards=0.00,0.20,1.00
250
+ if not line.startswith("[END] "):
251
+ return None
252
+
253
+ payload: dict[str, str] = {}
254
+ for token in line[len("[END] ") :].split(" "):
255
+ if "=" not in token:
256
+ continue
257
+ key, value = token.split("=", 1)
258
+ payload[key.strip()] = value.strip()
259
+
260
+ if "success" not in payload or "steps" not in payload or "score" not in payload:
261
+ return None
262
+
263
+ rewards_raw = payload.get("rewards", "")
264
+ rewards: list[float] = []
265
+ for token in rewards_raw.split(","):
266
+ token = token.strip()
267
+ if not token:
268
+ continue
269
+ try:
270
+ rewards.append(float(token))
271
+ except ValueError:
272
+ continue
273
+
274
+ try:
275
+ return {
276
+ "success": payload["success"].lower() == "true",
277
+ "steps": int(payload["steps"]),
278
+ "score": float(payload["score"]),
279
+ "rewards": rewards,
280
+ }
281
+ except Exception:
282
+ return None
server/ui.py ADDED
@@ -0,0 +1,521 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+
4
def render_home_page() -> str:
    """Return the complete HTML document for the inference UI.

    The page is self-contained apart from web fonts: inline CSS, a run
    configuration form, a status panel and a script that talks to
    ``/web/inference/start``, ``/web/inference/status`` and
    ``/web/inference/stop``. Status is refreshed by polling every 2.2 seconds.

    The template is an ordinary (non-raw) string literal, so the newline
    escape used by the script when joining log lines is written with a
    doubled backslash.
    """
    return """
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>FlakySleuth Run Studio</title>
  <link rel="preconnect" href="https://fonts.googleapis.com" />
  <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
  <link href="https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@500;600;700&family=IBM+Plex+Mono:wght@400;500&display=swap" rel="stylesheet" />
  <style>
    :root {
      --bg-top: #edf7f1;
      --bg-bottom: #d2ead8;
      --ink: #17211d;
      --muted: #4c6359;
      --accent: #0f8b63;
      --accent-2: #e5783b;
      --panel: rgba(255, 255, 255, 0.86);
      --border: rgba(15, 139, 99, 0.22);
      --card-shadow: 0 22px 46px rgba(14, 51, 37, 0.16);
      --display: "Space Grotesk", "Avenir Next", "Segoe UI", sans-serif;
      --mono: "IBM Plex Mono", "SFMono-Regular", Consolas, monospace;
    }

    * {
      box-sizing: border-box;
    }

    body {
      margin: 0;
      color: var(--ink);
      font-family: var(--display);
      min-height: 100vh;
      background:
        radial-gradient(circle at 12% 16%, rgba(230, 120, 59, 0.16), transparent 42%),
        radial-gradient(circle at 86% 12%, rgba(15, 139, 99, 0.2), transparent 40%),
        linear-gradient(164deg, var(--bg-top), var(--bg-bottom));
      animation: backdropFade 700ms ease-out;
    }

    @keyframes backdropFade {
      from { opacity: 0; transform: translateY(4px); }
      to { opacity: 1; transform: translateY(0); }
    }

    .shell {
      max-width: 1100px;
      margin: 24px auto;
      padding: 0 16px 24px;
      display: grid;
      gap: 16px;
    }

    .hero {
      border: 1px solid var(--border);
      background: var(--panel);
      border-radius: 20px;
      box-shadow: var(--card-shadow);
      padding: 22px 22px 18px;
      animation: slideIn 500ms ease-out;
    }

    @keyframes slideIn {
      from { opacity: 0; transform: translateY(12px); }
      to { opacity: 1; transform: translateY(0); }
    }

    .eyebrow {
      display: inline-flex;
      align-items: center;
      gap: 8px;
      font-size: 12px;
      color: var(--muted);
      letter-spacing: 0.08em;
      text-transform: uppercase;
    }

    .dot {
      width: 10px;
      height: 10px;
      border-radius: 50%;
      background: var(--accent);
      box-shadow: 0 0 0 6px rgba(15, 139, 99, 0.15);
    }

    h1 {
      margin: 10px 0 8px;
      font-size: clamp(1.6rem, 2.6vw, 2.35rem);
      line-height: 1.1;
      letter-spacing: -0.02em;
    }

    .hero p {
      margin: 0;
      color: var(--muted);
      max-width: 760px;
      line-height: 1.5;
    }

    .panel-grid {
      display: grid;
      grid-template-columns: 1fr;
      gap: 16px;
    }

    .panel {
      border: 1px solid var(--border);
      background: var(--panel);
      border-radius: 20px;
      box-shadow: var(--card-shadow);
      padding: 18px;
      animation: slideIn 560ms ease-out;
    }

    .panel h2 {
      margin: 0 0 12px;
      font-size: 1.1rem;
      letter-spacing: -0.01em;
    }

    .form-grid {
      display: grid;
      gap: 12px;
      grid-template-columns: repeat(2, minmax(0, 1fr));
    }

    .field {
      display: grid;
      gap: 6px;
    }

    .field.span-2 {
      grid-column: span 2;
    }

    label {
      font-size: 13px;
      color: var(--muted);
    }

    input {
      width: 100%;
      border: 1px solid rgba(18, 88, 63, 0.22);
      border-radius: 10px;
      padding: 10px 11px;
      font: 500 14px/1.2 var(--mono);
      color: var(--ink);
      background: rgba(255, 255, 255, 0.92);
      transition: border-color 180ms ease, box-shadow 180ms ease;
    }

    input:focus {
      outline: none;
      border-color: var(--accent);
      box-shadow: 0 0 0 4px rgba(15, 139, 99, 0.14);
    }

    .actions {
      margin-top: 6px;
      display: flex;
      flex-wrap: wrap;
      gap: 10px;
    }

    button {
      border: 0;
      border-radius: 11px;
      font: 600 14px/1 var(--display);
      padding: 11px 14px;
      cursor: pointer;
      transition: transform 180ms ease, opacity 180ms ease, filter 180ms ease;
    }

    button:hover {
      transform: translateY(-1px);
    }

    button:disabled {
      opacity: 0.55;
      cursor: not-allowed;
      transform: none;
    }

    .btn-run {
      background: var(--accent);
      color: #fff;
    }

    .btn-stop {
      background: #f2b38f;
      color: #431d05;
    }

    .btn-docs {
      background: #dce8e1;
      color: #234437;
      text-decoration: none;
      display: inline-flex;
      align-items: center;
      border-radius: 11px;
      padding: 11px 14px;
      font: 600 14px/1 var(--display);
    }

    .status-row {
      display: grid;
      grid-template-columns: 1fr;
      gap: 10px;
    }

    .pill {
      display: inline-flex;
      align-items: center;
      gap: 8px;
      width: fit-content;
      border-radius: 999px;
      padding: 6px 11px;
      font: 600 13px/1 var(--display);
      background: #e1ece6;
      color: #2d4b3e;
    }

    .pill .status-dot {
      width: 8px;
      height: 8px;
      border-radius: 50%;
      background: #60756a;
    }

    .pill.running .status-dot {
      background: var(--accent);
      box-shadow: 0 0 0 7px rgba(15, 139, 99, 0.12);
    }

    .pill.failed .status-dot {
      background: #af4020;
    }

    .pill.completed .status-dot {
      background: #1d724e;
    }

    .pill.stopped .status-dot {
      background: var(--accent-2);
    }

    .meta {
      display: grid;
      grid-template-columns: repeat(2, minmax(0, 1fr));
      gap: 8px;
      font: 500 12px/1.4 var(--mono);
      color: #2f4f43;
    }

    .meta strong {
      color: #193428;
    }

    .log-wrap {
      margin-top: 8px;
      border-radius: 14px;
      border: 1px solid rgba(20, 66, 50, 0.2);
      overflow: hidden;
      background: #0f1a16;
    }

    .log-head {
      display: flex;
      justify-content: space-between;
      align-items: center;
      color: #b7d5c8;
      font: 500 12px/1 var(--mono);
      padding: 10px 12px;
      border-bottom: 1px solid rgba(170, 208, 193, 0.16);
      background: #13201b;
    }

    pre {
      margin: 0;
      padding: 12px;
      color: #d8f3e7;
      font: 400 12.5px/1.45 var(--mono);
      max-height: 360px;
      overflow: auto;
      white-space: pre-wrap;
      word-break: break-word;
    }

    .help {
      margin-top: 8px;
      color: #37594b;
      font-size: 12px;
      line-height: 1.45;
    }

    @media (max-width: 880px) {
      .form-grid {
        grid-template-columns: 1fr;
      }

      .field.span-2 {
        grid-column: span 1;
      }

      .meta {
        grid-template-columns: 1fr;
      }
    }
  </style>
</head>
<body>
  <main class="shell">
    <section class="hero">
      <span class="eyebrow"><span class="dot"></span>FlakySleuth Space</span>
      <h1>Run Inference From The Browser</h1>
      <p>Launch <code>inference.py</code>, monitor stdout live, and inspect episode summaries without leaving this Space. The API endpoints for OpenEnv stay available at the same time.</p>
    </section>

    <section class="panel-grid">
      <div class="panel">
        <h2>Run Configuration</h2>
        <form id="run-form" class="form-grid">
          <div class="field span-2">
            <label for="dataset_path">Dataset Path</label>
            <input id="dataset_path" name="dataset_path" value="dataset/py_tasks.csv" />
          </div>

          <div class="field">
            <label for="episodes_per_task">Episodes Per Task</label>
            <input id="episodes_per_task" name="episodes_per_task" type="number" min="1" max="50" value="1" />
          </div>

          <div class="field">
            <label for="max_steps">Max Steps</label>
            <input id="max_steps" name="max_steps" type="number" min="1" max="100" value="20" />
          </div>

          <div class="field span-2">
            <label for="task_types">Task Types (comma-separated)</label>
            <input id="task_types" name="task_types" value="classify,root_cause,fix_proposal" />
          </div>

          <div class="field span-2">
            <label for="benchmark_name">Benchmark Label</label>
            <input id="benchmark_name" name="benchmark_name" value="flakysleuth" />
          </div>

          <div class="field span-2">
            <label for="api_base_url">API Base URL (optional)</label>
            <input id="api_base_url" name="api_base_url" placeholder="https://api.openai.com/v1 or provider endpoint" />
          </div>

          <div class="field">
            <label for="model_name">Model Name (optional)</label>
            <input id="model_name" name="model_name" placeholder="gpt-4o-mini, qwen/qwen3.6-plus:free, etc." />
          </div>

          <div class="field">
            <label for="api_key">API Key (optional)</label>
            <input id="api_key" name="api_key" type="password" placeholder="Uses server env vars if empty" />
          </div>
        </form>

        <div class="actions">
          <button id="btn-run" class="btn-run" type="button">Start Inference</button>
          <button id="btn-stop" class="btn-stop" type="button">Stop Run</button>
          <a class="btn-docs" href="/docs" target="_blank" rel="noreferrer">Open API Docs</a>
        </div>
        <p class="help">Tip: if no API key is provided, <code>inference.py</code> falls back to its heuristic agent.</p>
      </div>

      <div class="panel">
        <h2>Run Status</h2>
        <div class="status-row">
          <div id="status-pill" class="pill"><span class="status-dot"></span><span id="status-text">idle</span></div>
          <div class="meta">
            <div><strong>Job ID:</strong> <span id="meta-job-id">-</span></div>
            <div><strong>Return Code:</strong> <span id="meta-return-code">-</span></div>
            <div><strong>Started:</strong> <span id="meta-started">-</span></div>
            <div><strong>Finished:</strong> <span id="meta-finished">-</span></div>
          </div>
          <div class="log-wrap">
            <div class="log-head">
              <span>Live Logs</span>
              <span id="log-count">0 lines</span>
            </div>
            <pre id="log-output">No run started yet.</pre>
          </div>
          <div class="help" id="summary-line"></div>
        </div>
      </div>
    </section>
  </main>

  <script>
    const form = document.getElementById("run-form");
    const runButton = document.getElementById("btn-run");
    const stopButton = document.getElementById("btn-stop");
    const statusPill = document.getElementById("status-pill");
    const statusText = document.getElementById("status-text");
    const jobIdEl = document.getElementById("meta-job-id");
    const returnCodeEl = document.getElementById("meta-return-code");
    const startedEl = document.getElementById("meta-started");
    const finishedEl = document.getElementById("meta-finished");
    const logEl = document.getElementById("log-output");
    const logCountEl = document.getElementById("log-count");
    const summaryEl = document.getElementById("summary-line");

    function readFormPayload() {
      return {
        dataset_path: form.dataset_path.value.trim(),
        episodes_per_task: Number(form.episodes_per_task.value),
        task_types: form.task_types.value.trim(),
        max_steps: Number(form.max_steps.value),
        benchmark_name: form.benchmark_name.value.trim(),
        api_base_url: form.api_base_url.value.trim() || null,
        model_name: form.model_name.value.trim() || null,
        api_key: form.api_key.value.trim() || null,
      };
    }

    function formatTime(epoch) {
      if (!epoch) return "-";
      try {
        return new Date(epoch * 1000).toLocaleString();
      } catch (_) {
        return "-";
      }
    }

    // Turn an error response body into a readable message. "detail" is a
    // string for errors raised by the server code and a list of
    // {loc, msg} objects for request-validation failures.
    function describeError(payload, fallback) {
      const detail = payload ? payload.detail : null;
      if (typeof detail === "string" && detail) return detail;
      if (Array.isArray(detail) && detail.length) {
        return detail
          .map((item) => {
            const where = item && Array.isArray(item.loc) ? item.loc.slice(1).join(".") : "";
            const what = (item && item.msg) || "invalid value";
            return where ? `${where}: ${what}` : what;
          })
          .join("; ");
      }
      return fallback;
    }

    function setStatus(status) {
      const normalized = (status || "idle").toLowerCase();
      statusPill.classList.remove("running", "failed", "completed", "stopped");
      if (["running", "failed", "completed", "stopped"].includes(normalized)) {
        statusPill.classList.add(normalized);
      }
      statusText.textContent = normalized;
      runButton.disabled = normalized === "running" || normalized === "starting";
      stopButton.disabled = !(normalized === "running" || normalized === "starting");
    }

    function renderSummary(summaries) {
      if (!Array.isArray(summaries) || summaries.length === 0) {
        summaryEl.textContent = "";
        return;
      }
      const last = summaries[summaries.length - 1];
      summaryEl.textContent = `Latest episode: success=${last.success} score=${last.score} steps=${last.steps}`;
    }

    function renderStatus(state) {
      setStatus(state.status || "idle");
      jobIdEl.textContent = state.job_id || "-";
      returnCodeEl.textContent = state.return_code === null || state.return_code === undefined ? "-" : String(state.return_code);
      startedEl.textContent = formatTime(state.started_at);
      finishedEl.textContent = formatTime(state.finished_at);

      const logs = Array.isArray(state.logs) ? state.logs : [];
      logCountEl.textContent = `${logs.length} lines`;
      // Follow the tail only while the view is already at (or near) the
      // bottom, so a poll does not yank the view away from earlier output.
      const pinned = logEl.scrollHeight - logEl.scrollTop - logEl.clientHeight < 24;
      logEl.textContent = logs.length ? logs.join("\\n") : "No logs yet.";
      if (pinned) {
        logEl.scrollTop = logEl.scrollHeight;
      }

      renderSummary(state.summaries || []);
    }

    async function fetchStatus() {
      try {
        const response = await fetch("/web/inference/status?tail=450", { method: "GET" });
        if (!response.ok) return;
        const state = await response.json();
        renderStatus(state);
      } catch (_) {}
    }

    async function startRun() {
      runButton.disabled = true;
      let started = false;
      try {
        const response = await fetch("/web/inference/start", {
          method: "POST",
          headers: { "Content-Type": "application/json" },
          body: JSON.stringify(readFormPayload()),
        });

        const payload = await response.json();
        if (!response.ok) {
          alert(describeError(payload, "Could not start inference."));
          return;
        }
        started = true;
        renderStatus(payload);
      } catch (_) {
        alert("Could not start inference. Check logs and try again.");
      } finally {
        form.api_key.value = "";
        if (!started) {
          // Nothing was launched: release the button now, then resync with
          // the server in case a run is in fact already active.
          runButton.disabled = false;
          fetchStatus();
        }
      }
    }

    async function stopRun() {
      stopButton.disabled = true;
      try {
        const response = await fetch("/web/inference/stop", { method: "POST" });
        if (response.ok) {
          renderStatus(await response.json());
          return;
        }
      } catch (_) {}
      // The stop request failed: restore the button and resync with the server.
      stopButton.disabled = false;
      fetchStatus();
    }

    runButton.addEventListener("click", startRun);
    stopButton.addEventListener("click", stopRun);

    fetchStatus();
    window.setInterval(fetchStatus, 2200);
  </script>
</body>
</html>
"""