Zayne Rea Sprague committed on
Commit
b630916
·
1 Parent(s): cdf803d
backend/api/presets.py CHANGED
@@ -8,7 +8,7 @@ from flask import Blueprint, request, jsonify
8
  bp = Blueprint("presets", __name__, url_prefix="/api/presets")
9
 
10
  PRESETS_REPO = "reasoning-degeneration-dev/AGG_VIS_PRESETS"
11
- VALID_TYPES = {"model", "arena", "rlm", "harbor"}
12
  LOCAL_PRESETS_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "presets")
13
 
14
  # In-memory cache: vis_type -> list[dict]
@@ -133,7 +133,7 @@ def create_preset(vis_type):
133
 
134
  if vis_type == "model":
135
  preset["column"] = data.get("column", "model_responses")
136
- elif vis_type == "rlm":
137
  preset["config"] = data.get("config", "rlm_call_traces")
138
 
139
  presets = _get_presets(vis_type)
 
8
  bp = Blueprint("presets", __name__, url_prefix="/api/presets")
9
 
10
  PRESETS_REPO = "reasoning-degeneration-dev/AGG_VIS_PRESETS"
11
+ VALID_TYPES = {"model", "arena", "rlm", "rlm-eval", "harbor"}
12
  LOCAL_PRESETS_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "presets")
13
 
14
  # In-memory cache: vis_type -> list[dict]
 
133
 
134
  if vis_type == "model":
135
  preset["column"] = data.get("column", "model_responses")
136
+ elif vis_type in ("rlm", "rlm-eval"):
137
  preset["config"] = data.get("config", "rlm_call_traces")
138
 
139
  presets = _get_presets(vis_type)
backend/api/rlm_eval_datasets.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import hashlib
3
+ from flask import Blueprint, request, jsonify
4
+ from datasets import load_dataset
5
+
6
+ bp = Blueprint("rlm_eval_datasets", __name__, url_prefix="/api/rlm-eval/datasets")
7
+
8
+ _cache: dict[str, dict] = {}
9
+
10
+
11
def _make_id(repo: str, config: str, split: str) -> str:
    """Return a short, deterministic cache id for a (repo, config, split) triple.

    The id is the first 12 hex characters of the MD5 digest of
    ``"{repo}:{config}:{split}"``.
    """
    key = f"{repo}:{config}:{split}"
    # MD5 is only used to derive a stable identifier, not for security.
    # Declaring that keeps the call working on FIPS-restricted Python builds,
    # where plain ``hashlib.md5(...)`` raises; the digest itself is unchanged.
    return hashlib.md5(key.encode(), usedforsecurity=False).hexdigest()[:12]
14
+
15
+
16
def _build_hierarchy(rows: list[dict]) -> dict:
    """Reconstruct hierarchy from flat rows: examples -> iterations.

    Rows are grouped by ``example_idx``; within an example each row becomes one
    iteration keyed by ``rlm_iter`` (a later row with the same key replaces the
    earlier one). Token and time totals are accumulated per example, and the
    last row (in input order) carrying a truthy ``final_answer`` supplies the
    example's final answer and its 200-character preview.

    Returns:
        ``{"examples": [...]}`` with examples sorted by ``example_idx`` and each
        example's ``iterations`` converted to a list sorted by ``rlm_iter``.
    """
    examples: dict[int, dict] = {}

    for row in rows:
        # ``dict.get`` only applies its default to *missing* keys. A key that is
        # present but null would become a ``None`` dict key and make the
        # ``sorted()`` calls below raise TypeError, so normalise it to 0 too.
        ei = row.get("example_idx")
        if ei is None:
            ei = 0
        ri = row.get("rlm_iter")
        if ri is None:
            ri = 0

        if ei not in examples:
            # Example-level fields are taken from the first row seen for it.
            examples[ei] = {
                "example_idx": ei,
                "question_text": row.get("question_text", ""),
                "eval_correct": row.get("eval_correct"),
                "iterations": {},
                "total_input_tokens": 0,
                "total_output_tokens": 0,
                "total_execution_time": 0.0,
                "final_answer": None,
                "final_answer_preview": "",
            }

        ex = examples[ei]

        # Parse code blocks
        code_blocks = []
        cbj = row.get("code_blocks_json", "")
        if cbj and cbj != "[]":
            try:
                code_blocks = json.loads(cbj) if isinstance(cbj, str) else cbj
            except (json.JSONDecodeError, TypeError):
                code_blocks = []
        # Consumers call ``len()`` on this and iterate it as a list; valid JSON
        # that decodes to something else (a number, an object, ...) is treated
        # like an unparsable value.
        if not isinstance(code_blocks, list):
            code_blocks = []

        iteration = {
            "rlm_iter": ri,
            "prompt": row.get("prompt", ""),
            "response": row.get("response", ""),
            "model": row.get("model", ""),
            "input_tokens": row.get("input_tokens", 0),
            "output_tokens": row.get("output_tokens", 0),
            "execution_time": row.get("execution_time", 0.0),
            "has_code_blocks": row.get("has_code_blocks", False),
            "code_blocks": code_blocks,
            "final_answer": row.get("final_answer"),
            "timestamp": row.get("timestamp", ""),
        }

        ex["iterations"][ri] = iteration
        # ``or 0`` guards against explicit nulls in the numeric columns.
        ex["total_input_tokens"] += iteration["input_tokens"] or 0
        ex["total_output_tokens"] += iteration["output_tokens"] or 0
        ex["total_execution_time"] += iteration["execution_time"] or 0.0

        final_answer = iteration["final_answer"]
        if final_answer:
            ex["final_answer"] = final_answer
            # ``str()`` keeps the preview slice valid for non-string answers.
            ex["final_answer_preview"] = str(final_answer)[:200]

    # Sort and convert dicts to lists
    result = []
    for ei_key in sorted(examples):
        ex = examples[ei_key]
        by_iter = ex["iterations"]
        ex["iterations"] = [by_iter[ri_key] for ri_key in sorted(by_iter)]
        result.append(ex)

    return {"examples": result}
82
+
83
+
84
@bp.route("/load", methods=["POST"])
def load_dataset_endpoint():
    """Load a dataset, cache its example/iteration hierarchy, and describe it.

    Expects a JSON object body with ``repo`` (required) and optional ``config``
    (default ``"rlm_call_traces"``) and ``split`` (default ``"train"``).

    Returns:
        200 with the dataset id, repo, short name, config, split, metadata and
        row/example counts; 400 when ``repo`` is missing or the dataset cannot
        be loaded.
    """
    # ``silent=True`` returns None for a missing or malformed JSON body instead
    # of aborting, so such requests get the same "repo is required" answer
    # rather than an AttributeError (HTTP 500) on ``data.get``.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}

    repo = data.get("repo", "")
    if not isinstance(repo, str) or not repo.strip():
        return jsonify({"error": "repo is required"}), 400
    repo = repo.strip()

    config = data.get("config", "rlm_call_traces")
    split = data.get("split", "train")

    try:
        ds = load_dataset(repo, config, split=split)
    except Exception as e:
        # Boundary handler: any loader failure is reported back to the client.
        return jsonify({"error": f"Failed to load dataset: {e}"}), 400

    ds_id = _make_id(repo, config, split)
    # Iterate the dataset directly instead of indexing row by row.
    rows = list(ds)
    hierarchy = _build_hierarchy(rows)

    # Extract metadata from first row
    first_row = rows[0] if rows else {}
    metadata = {
        "run_id": first_row.get("run_id", ""),
        "method": first_row.get("method", ""),
        "model": first_row.get("model", ""),
    }

    _cache[ds_id] = {
        "repo": repo,
        "config": config,
        "split": split,
        "hierarchy": hierarchy,
        "metadata": metadata,
        "n_rows": len(rows),
    }

    # Last path component of "org/name"; the whole string when there is no "/".
    short_name = repo.rsplit("/", 1)[-1]

    return jsonify({
        "id": ds_id,
        "repo": repo,
        "name": short_name,
        "config": config,
        "split": split,
        "metadata": metadata,
        "n_examples": len(hierarchy["examples"]),
        "n_rows": len(rows),
    })
132
+
133
+
134
@bp.route("/", methods=["GET"])
def list_datasets():
    """Return a JSON list describing every dataset currently held in the cache."""

    def describe(ds_id, info):
        repo = info["repo"]
        return {
            "id": ds_id,
            "repo": repo,
            # Last "/"-separated component; the full repo string if it has no "/".
            "name": repo.rsplit("/", 1)[-1],
            "config": info["config"],
            "split": info["split"],
            "metadata": info["metadata"],
            "n_rows": info["n_rows"],
            "n_examples": len(info["hierarchy"]["examples"]),
        }

    return jsonify([describe(ds_id, info) for ds_id, info in _cache.items()])
149
+
150
+
151
@bp.route("/<ds_id>/overview", methods=["GET"])
def get_overview(ds_id):
    """Level 1: Summary of all examples.

    Responds with the cached dataset metadata plus one summary per example
    (question truncated to 300 characters); 404 if ``ds_id`` is not cached.
    """
    if ds_id not in _cache:
        return jsonify({"error": "Dataset not loaded"}), 404

    cached = _cache[ds_id]
    summaries = [
        {
            "example_idx": example["example_idx"],
            "question_text": (example["question_text"] or "")[:300],
            "eval_correct": example["eval_correct"],
            "n_iterations": len(example["iterations"]),
            "total_input_tokens": example["total_input_tokens"],
            "total_output_tokens": example["total_output_tokens"],
            "total_execution_time": example["total_execution_time"],
            "final_answer_preview": example["final_answer_preview"],
        }
        for example in cached["hierarchy"]["examples"]
    ]

    return jsonify({"metadata": cached["metadata"], "examples": summaries})
177
+
178
+
179
@bp.route("/<ds_id>/example/<int:example_idx>", methods=["GET"])
def get_example_detail(ds_id, example_idx):
    """Level 2: Iteration timeline for one example.

    Responds with the example's question, totals and final answer plus a
    lightweight summary of each iteration (response truncated to 300
    characters). Returns 404 when the dataset is not cached or the example
    index does not exist in it.
    """
    if ds_id not in _cache:
        return jsonify({"error": "Dataset not loaded"}), 404

    examples = _cache[ds_id]["hierarchy"]["examples"]
    ex_data = next((ex for ex in examples if ex["example_idx"] == example_idx), None)
    if ex_data is None:
        return jsonify({"error": f"Example {example_idx} not found"}), 404

    iters = [
        {
            "rlm_iter": it["rlm_iter"],
            "model": it["model"],
            "input_tokens": it["input_tokens"],
            "output_tokens": it["output_tokens"],
            "execution_time": it["execution_time"],
            "has_code_blocks": it["has_code_blocks"],
            "n_code_blocks": len(it["code_blocks"]),
            "response_preview": (it["response"] or "")[:300],
            # Truthiness rather than ``is not None``: ``_build_hierarchy`` only
            # records a final answer when the value is truthy, so an empty
            # string must not mark the iteration as final either.
            "has_final_answer": bool(it["final_answer"]),
            "timestamp": it["timestamp"],
        }
        for it in ex_data["iterations"]
    ]

    return jsonify({
        "example_idx": example_idx,
        "question_text": ex_data["question_text"],
        "eval_correct": ex_data["eval_correct"],
        "total_input_tokens": ex_data["total_input_tokens"],
        "total_output_tokens": ex_data["total_output_tokens"],
        "total_execution_time": ex_data["total_execution_time"],
        "final_answer": ex_data["final_answer"],
        "iterations": iters,
    })
222
+
223
+
224
@bp.route("/<ds_id>/example/<int:example_idx>/iter/<int:rlm_iter>", methods=["GET"])
def get_iter_detail(ds_id, example_idx, rlm_iter):
    """Full detail for a specific RLM iteration within an example.

    Responds with the complete cached iteration record, or 404 when the
    dataset is not cached or no matching example/iteration exists.
    """
    if ds_id not in _cache:
        return jsonify({"error": "Dataset not loaded"}), 404

    # First iteration whose example and iteration indices both match.
    matches = (
        iteration
        for example in _cache[ds_id]["hierarchy"]["examples"]
        if example["example_idx"] == example_idx
        for iteration in example["iterations"]
        if iteration["rlm_iter"] == rlm_iter
    )
    found = next(matches, None)
    if found is None:
        return jsonify({"error": "Iteration not found"}), 404
    return jsonify(found)
241
+
242
+
243
@bp.route("/<ds_id>", methods=["DELETE"])
def unload_dataset(ds_id):
    """Drop a dataset from the cache; responds ``ok`` whether or not it was loaded."""
    _cache.pop(ds_id, None)
    return jsonify({"status": "ok"})
backend/app.py CHANGED
@@ -6,10 +6,11 @@ def create_app():
6
  app = Flask(__name__, static_folder="../frontend/dist", static_url_path="/")
7
  CORS(app)
8
 
9
- from backend.api import model_datasets, arena_datasets, rlm_datasets, harbor_datasets, presets
10
  app.register_blueprint(model_datasets.bp)
11
  app.register_blueprint(arena_datasets.bp)
12
  app.register_blueprint(rlm_datasets.bp)
 
13
  app.register_blueprint(harbor_datasets.bp)
14
  app.register_blueprint(presets.bp)
15
 
 
6
  app = Flask(__name__, static_folder="../frontend/dist", static_url_path="/")
7
  CORS(app)
8
 
9
+ from backend.api import model_datasets, arena_datasets, rlm_datasets, rlm_eval_datasets, harbor_datasets, presets
10
  app.register_blueprint(model_datasets.bp)
11
  app.register_blueprint(arena_datasets.bp)
12
  app.register_blueprint(rlm_datasets.bp)
13
+ app.register_blueprint(rlm_eval_datasets.bp)
14
  app.register_blueprint(harbor_datasets.bp)
15
  app.register_blueprint(presets.bp)
16
 
frontend/src/App.tsx CHANGED
@@ -2,15 +2,17 @@ import { useState, lazy, Suspense } from "react";
2
 
3
  const ModelApp = lazy(() => import("./model/ModelApp"));
4
  const ArenaApp = lazy(() => import("./arena/ArenaApp"));
 
5
  const RlmApp = lazy(() => import("./rlm/RlmApp"));
6
  const HarborApp = lazy(() => import("./harbor/HarborApp"));
7
 
8
- type TabId = "model" | "arena" | "rlm" | "harbor";
9
 
10
  const TABS: { id: TabId; label: string; color: string; activeClass: string }[] = [
11
  { id: "model", label: "Model Trace", color: "blue", activeClass: "border-blue-500 text-blue-400" },
12
  { id: "arena", label: "Arena", color: "purple", activeClass: "border-purple-500 text-purple-400" },
13
- { id: "rlm", label: "RLM", color: "orange", activeClass: "border-orange-500 text-orange-400" },
 
14
  { id: "harbor", label: "Harbor", color: "teal", activeClass: "border-teal-500 text-teal-400" },
15
  ];
16
 
@@ -56,6 +58,11 @@ export default function App() {
56
  <ArenaApp />
57
  </div>
58
  )}
 
 
 
 
 
59
  {activeTab === "rlm" && (
60
  <div className="theme-rlm h-full">
61
  <RlmApp />
 
2
 
3
  const ModelApp = lazy(() => import("./model/ModelApp"));
4
  const ArenaApp = lazy(() => import("./arena/ArenaApp"));
5
+ const RlmEvalApp = lazy(() => import("./rlm-eval/RlmEvalApp"));
6
  const RlmApp = lazy(() => import("./rlm/RlmApp"));
7
  const HarborApp = lazy(() => import("./harbor/HarborApp"));
8
 
9
+ type TabId = "model" | "arena" | "rlm-eval" | "rlm" | "harbor";
10
 
11
  const TABS: { id: TabId; label: string; color: string; activeClass: string }[] = [
12
  { id: "model", label: "Model Trace", color: "blue", activeClass: "border-blue-500 text-blue-400" },
13
  { id: "arena", label: "Arena", color: "purple", activeClass: "border-purple-500 text-purple-400" },
14
+ { id: "rlm-eval", label: "RLM", color: "emerald", activeClass: "border-emerald-500 text-emerald-400" },
15
+ { id: "rlm", label: "RLM+GEPA", color: "orange", activeClass: "border-orange-500 text-orange-400" },
16
  { id: "harbor", label: "Harbor", color: "teal", activeClass: "border-teal-500 text-teal-400" },
17
  ];
18
 
 
58
  <ArenaApp />
59
  </div>
60
  )}
61
+ {activeTab === "rlm-eval" && (
62
+ <div className="theme-rlm-eval h-full">
63
+ <RlmEvalApp />
64
+ </div>
65
+ )}
66
  {activeTab === "rlm" && (
67
  <div className="theme-rlm h-full">
68
  <RlmApp />
frontend/src/rlm-eval/RlmEvalApp.tsx ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useEffect } from "react";
2
+ import { useAppState } from "./store";
3
+ import Sidebar from "./components/Sidebar";
4
+ import Panel from "./components/Panel";
5
+
6
/**
 * Top-level view for the "rlm-eval" tab: dataset sidebar, error banner,
 * compare toggle and one or two navigation panels (A, and B in comparison mode).
 */
function RlmEvalApp() {
  const state = useAppState();

  const handleSelectDataset = (id: string) => {
    state.navigatePanel("A", { datasetId: id, level: 1 });
  };

  // Keyboard shortcuts
  useEffect(() => {
    const handler = (e: KeyboardEvent) => {
      // Leave browser/OS chords alone: without this, Ctrl+C / Cmd+C (copy)
      // would also toggle comparison mode because `e.key` is still "c".
      if (e.ctrlKey || e.metaKey || e.altKey) return;

      // Do not steal keystrokes from anything the user is typing into.
      const target = e.target;
      if (
        target instanceof HTMLInputElement ||
        target instanceof HTMLTextAreaElement ||
        target instanceof HTMLSelectElement ||
        (target instanceof HTMLElement && target.isContentEditable)
      )
        return;

      switch (e.key) {
        case "Escape":
          state.goUp("A");
          break;
        case "c":
          state.toggleComparison();
          break;
      }
    };
    window.addEventListener("keydown", handler);
    return () => window.removeEventListener("keydown", handler);
  }, [state.goUp, state.toggleComparison]);

  return (
    <div className="flex h-full bg-gray-950 text-gray-100">
      {/* Sidebar */}
      <Sidebar
        datasets={state.datasets}
        presets={state.presets}
        setPresets={state.setPresets}
        loading={state.loading}
        onAddDataset={state.addDataset}
        onRemoveDataset={state.removeDataset}
        onToggleDataset={state.toggleDataset}
        onSelectDataset={handleSelectDataset}
        onUpdateDatasetPresetName={state.updateDatasetPresetName}
        onClearDatasetPreset={state.clearDatasetPreset}
      />

      {/* Main content */}
      <div className="flex-1 flex flex-col overflow-hidden">
        {/* Error banner */}
        {state.error && (
          <div className="bg-red-900 border-b border-red-700 px-4 py-2 text-sm text-red-200 flex justify-between">
            <span>{state.error}</span>
            <button onClick={() => state.setError(null)} className="text-red-300 hover:text-red-100">
              x
            </button>
          </div>
        )}

        {/* Toolbar */}
        <div className="flex items-center justify-between px-4 py-2 border-b border-gray-700 bg-gray-900">
          <div className="text-sm text-gray-400">
            {state.activeDatasets.length} dataset{state.activeDatasets.length !== 1 ? "s" : ""} loaded
          </div>
          <button
            className={`text-sm px-3 py-1 rounded ${
              state.comparisonMode
                ? "bg-emerald-600 text-white"
                : "bg-gray-800 text-gray-300 hover:bg-gray-700"
            }`}
            onClick={state.toggleComparison}
          >
            {state.comparisonMode ? "Exit Compare" : "Compare"}
          </button>
        </div>

        {/* Panels */}
        <div className="flex-1 flex gap-2 p-2 overflow-hidden">
          {state.panelA ? (
            <div className={state.comparisonMode ? "w-1/2" : "w-full"}>
              <Panel
                nav={state.panelA}
                dataset={state.datasets.find((d) => d.id === state.panelA?.datasetId)}
                panelLabel={state.comparisonMode ? "A" : undefined}
                onNavigate={(nav) => state.navigatePanel("A", nav)}
                onGoUp={() => state.goUp("A")}
                fetchOverview={state.fetchOverview}
                fetchExampleDetail={state.fetchExampleDetail}
                fetchIterDetail={state.fetchIterDetail}
              />
            </div>
          ) : (
            <div className="flex-1 flex items-center justify-center text-gray-500">
              <div className="text-center">
                <p className="text-lg mb-2">No dataset loaded</p>
                <p className="text-sm">Add a dataset from the sidebar to get started</p>
              </div>
            </div>
          )}

          {/* Panel B is only rendered while comparing */}
          {state.comparisonMode && state.panelB && (
            <div className="w-1/2">
              <Panel
                nav={state.panelB}
                dataset={state.datasets.find((d) => d.id === state.panelB?.datasetId)}
                panelLabel="B"
                datasets={state.datasets}
                onNavigate={(nav) => state.navigatePanel("B", nav)}
                onGoUp={() => state.goUp("B")}
                onSwitchDataset={(id) => state.navigatePanel("B", { datasetId: id, level: 1 })}
                fetchOverview={state.fetchOverview}
                fetchExampleDetail={state.fetchExampleDetail}
                fetchIterDetail={state.fetchIterDetail}
              />
            </div>
          )}
        </div>

        {/* Keyboard hints */}
        <div className="flex items-center gap-4 px-4 py-1 border-t border-gray-800 text-xs text-gray-600">
          <span>Esc: Go up</span>
          <span>C: Toggle compare</span>
        </div>
      </div>
    </div>
  );
}
132
+
133
+ export default RlmEvalApp;
frontend/src/rlm-eval/api.ts ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ const BASE = "/api/rlm-eval";
2
+
3
/**
 * Fetch `${BASE}${url}` and resolve with the parsed JSON body.
 *
 * A JSON `Content-Type` header is sent unless the caller provides its own.
 * On a non-2xx status this throws an `Error` whose message is the `error`
 * field of the response body when available, otherwise `HTTP <status>`.
 */
async function fetchJson<T>(url: string, init?: RequestInit): Promise<T> {
  // Merge headers instead of letting `...init` replace the whole object, so
  // caller-supplied headers no longer silently drop the JSON content type.
  const headers = new Headers(init?.headers);
  if (!headers.has("Content-Type")) {
    headers.set("Content-Type", "application/json");
  }

  const res = await fetch(`${BASE}${url}`, { ...init, headers });
  if (!res.ok) {
    // The error body may be empty, non-JSON, or JSON `null`.
    const body = await res.json().catch(() => ({}));
    throw new Error(body?.error || `HTTP ${res.status}`);
  }
  return res.json();
}
14
+
15
+ const PRESETS_BASE = "/api/presets/rlm-eval";
16
/**
 * Fetch `${PRESETS_BASE}${url}` and resolve with the parsed JSON body.
 *
 * A JSON `Content-Type` header is sent unless the caller provides its own.
 * On a non-2xx status this throws an `Error` whose message is the `error`
 * field of the response body when available, otherwise `HTTP <status>`.
 */
async function fetchPresetsJson<T>(url: string, init?: RequestInit): Promise<T> {
  // Merge headers instead of letting `...init` replace the whole object, so
  // caller-supplied headers no longer silently drop the JSON content type.
  const headers = new Headers(init?.headers);
  if (!headers.has("Content-Type")) {
    headers.set("Content-Type", "application/json");
  }

  const res = await fetch(`${PRESETS_BASE}${url}`, { ...init, headers });
  if (!res.ok) {
    // The error body may be empty, non-JSON, or JSON `null`.
    const body = await res.json().catch(() => ({}));
    throw new Error(body?.error || `HTTP ${res.status}`);
  }
  return res.json();
}
23
+
24
/** Escape a value so it is safe to use as a single URL path segment. */
const pathSegment = (value: string | number): string => encodeURIComponent(String(value));

/** Client for the RLM eval dataset endpoints and the "rlm-eval" preset endpoints. */
export const api = {
  /** POST /datasets/load; `config` and `split` default to "rlm_call_traces" / "train". */
  loadDataset: (repo: string, config?: string, split?: string) =>
    fetchJson<{
      id: string;
      repo: string;
      name: string;
      config: string;
      split: string;
      metadata: Record<string, unknown>;
      n_examples: number;
      n_rows: number;
    }>("/datasets/load", {
      method: "POST",
      body: JSON.stringify({
        repo,
        config: config || "rlm_call_traces",
        split: split || "train",
      }),
    }),

  /** GET the per-example summary list for a loaded dataset. */
  getOverview: (dsId: string) =>
    fetchJson<Record<string, unknown>>(`/datasets/${pathSegment(dsId)}/overview`),

  /** GET the iteration timeline for one example. */
  getExampleDetail: (dsId: string, exampleIdx: number) =>
    fetchJson<Record<string, unknown>>(
      `/datasets/${pathSegment(dsId)}/example/${pathSegment(exampleIdx)}`
    ),

  /** GET the full record of one iteration of one example. */
  getIterDetail: (dsId: string, exampleIdx: number, rlmIter: number) =>
    fetchJson<Record<string, unknown>>(
      `/datasets/${pathSegment(dsId)}/example/${pathSegment(exampleIdx)}/iter/${pathSegment(rlmIter)}`
    ),

  /** DELETE a loaded dataset. */
  unloadDataset: (dsId: string) =>
    fetchJson<{ status: string }>(`/datasets/${pathSegment(dsId)}`, { method: "DELETE" }),

  listPresets: () => fetchPresetsJson<Record<string, unknown>[]>(""),

  createPreset: (preset: { name: string; repo: string; config: string; split: string }) =>
    fetchPresetsJson<Record<string, unknown>>("", {
      method: "POST",
      body: JSON.stringify(preset),
    }),

  // Preset ids are escaped so characters such as "?", "#" or spaces cannot
  // change which URL is requested.
  updatePreset: (id: string, data: { name: string }) =>
    fetchPresetsJson<Record<string, unknown>>(`/${pathSegment(id)}`, {
      method: "PUT",
      body: JSON.stringify(data),
    }),

  deletePreset: (id: string) =>
    fetchPresetsJson<{ status: string }>(`/${pathSegment(id)}`, { method: "DELETE" }),
};
frontend/src/rlm-eval/components/Breadcrumb.tsx ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import type { PanelNav, DatasetInfo } from "../types";
2
+
3
interface BreadcrumbProps {
  nav: PanelNav;
  dataset: DatasetInfo | undefined;
  onNavigate: (nav: PanelNav) => void;
}

/** One breadcrumb entry: its label and the navigation target it links to. */
type Crumb = { label: string; nav: PanelNav };

/**
 * Breadcrumb trail for a panel: the dataset name (when a dataset is given),
 * followed by "Example N" at level 2 or deeper. Every crumb except the last
 * is a button that navigates to that crumb's target.
 */
export default function Breadcrumb({ nav, dataset, onNavigate }: BreadcrumbProps) {
  const { datasetId, level, exampleIdx } = nav;

  const crumbs: Crumb[] = [];
  if (dataset) {
    crumbs.push({ label: dataset.name, nav: { datasetId, level: 1 } });
  }
  if (level >= 2 && exampleIdx !== undefined) {
    crumbs.push({
      label: `Example ${exampleIdx}`,
      nav: { datasetId, level: 2, exampleIdx },
    });
  }

  const lastIndex = crumbs.length - 1;

  return (
    <div className="flex items-center gap-1 text-sm">
      {crumbs.map((crumb, index) => {
        const isCurrent = index >= lastIndex;
        return (
          <span key={index} className="flex items-center gap-1">
            {index > 0 && <span className="text-gray-500">/</span>}
            {isCurrent ? (
              <span className="text-gray-200">{crumb.label}</span>
            ) : (
              <button
                className="text-emerald-400 hover:text-emerald-300"
                onClick={() => onNavigate(crumb.nav)}
              >
                {crumb.label}
              </button>
            )}
          </span>
        );
      })}
    </div>
  );
}
frontend/src/rlm-eval/components/DatasetSelector.tsx ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import type { DatasetInfo } from "../types";
2
+
3
interface DatasetSelectorProps {
  datasets: DatasetInfo[];
  currentId: string;
  onSelect: (id: string) => void;
}

/** Dropdown listing the given datasets by name; reports the chosen dataset id. */
export default function DatasetSelector(props: DatasetSelectorProps) {
  const { datasets, currentId, onSelect } = props;

  const options = datasets.map(({ id, name }) => (
    <option key={id} value={id}>
      {name}
    </option>
  ));

  return (
    <select
      className="bg-gray-800 text-gray-200 text-xs rounded px-2 py-0.5 border border-gray-600 focus:border-emerald-500 outline-none"
      value={currentId}
      onChange={(event) => {
        onSelect(event.target.value);
      }}
    >
      {options}
    </select>
  );
}
frontend/src/rlm-eval/components/ExampleDetailLevel.tsx ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useEffect, useState } from "react";
2
+ import type { ExampleDetailData, RlmIterDetail } from "../types";
3
+ import IterationDetail from "./IterationDetail";
4
+
5
interface ExampleDetailLevelProps {
  datasetId: string;
  exampleIdx: number;
  fetchExampleDetail: (dsId: string, exampleIdx: number) => Promise<ExampleDetailData>;
  fetchIterDetail: (dsId: string, exampleIdx: number, rlmIter: number) => Promise<RlmIterDetail>;
}

/**
 * Level-2 view for one example: question, aggregate stats, a horizontal
 * timeline of iteration cards and, for the card the user expanded, the full
 * iteration detail fetched on demand.
 */
export default function ExampleDetailLevel({
  datasetId,
  exampleIdx,
  fetchExampleDetail,
  fetchIterDetail,
}: ExampleDetailLevelProps) {
  const [data, setData] = useState<ExampleDetailData | null>(null);
  const [expandedIter, setExpandedIter] = useState<number | null>(null);
  const [iterDetail, setIterDetail] = useState<RlmIterDetail | null>(null);

  // An expanded iteration belongs to the example it was opened on; collapse
  // it when the component is pointed at a different example.
  useEffect(() => {
    setExpandedIter(null);
  }, [datasetId, exampleIdx]);

  useEffect(() => {
    // Ignore a response that arrives after the inputs changed or the component
    // unmounted, so a slow earlier request cannot overwrite newer data.
    let cancelled = false;
    fetchExampleDetail(datasetId, exampleIdx)
      .then((detail) => {
        if (!cancelled) setData(detail);
      })
      .catch(() => {});
    return () => {
      cancelled = true;
    };
  }, [datasetId, exampleIdx, fetchExampleDetail]);

  useEffect(() => {
    // Clear first: otherwise the previously expanded iteration's detail stays
    // on screen while the new one loads, or permanently if the fetch fails.
    setIterDetail(null);
    if (expandedIter === null) {
      return;
    }
    let cancelled = false;
    fetchIterDetail(datasetId, exampleIdx, expandedIter)
      .then((detail) => {
        if (!cancelled) setIterDetail(detail);
      })
      .catch(() => {});
    return () => {
      cancelled = true;
    };
  }, [datasetId, exampleIdx, expandedIter, fetchIterDetail]);

  if (!data) return <div className="p-4 text-gray-400">Loading example detail...</div>;

  return (
    <div className="p-4 space-y-4 overflow-y-auto">
      {/* Question text */}
      <div className="bg-gray-800 border border-gray-700 rounded-lg p-4">
        <div className="flex items-center gap-2 mb-2">
          <div className="text-xs font-semibold text-emerald-400">Question</div>
          {data.eval_correct === true && (
            <span className="text-emerald-400 text-xs font-bold">&#10003; Correct</span>
          )}
          {data.eval_correct === false && (
            <span className="text-red-400 text-xs font-bold">&#10007; Incorrect</span>
          )}
        </div>
        <div className="text-sm text-gray-200 whitespace-pre-wrap max-h-40 overflow-y-auto">
          {data.question_text}
        </div>
      </div>

      {/* Stats row */}
      <div className="flex gap-4 text-xs text-gray-400">
        <span>
          Total tokens:{" "}
          <span className="text-gray-200">
            {((data.total_input_tokens + data.total_output_tokens) / 1000).toFixed(1)}k
          </span>
        </span>
        <span>
          Time: <span className="text-gray-200">{(data.total_execution_time ?? 0).toFixed(1)}s</span>
        </span>
        <span>
          Iterations: <span className="text-gray-200">{data.iterations.length}</span>
        </span>
      </div>

      {/* Iteration timeline */}
      <div>
        <div className="text-xs font-semibold text-gray-400 mb-2">Iteration Timeline</div>
        <div className="flex gap-2 overflow-x-auto pb-2">
          {data.iterations.map((it) => (
            <div
              key={it.rlm_iter}
              className={`flex-shrink-0 w-56 bg-gray-800 border rounded-lg p-3 cursor-pointer transition-colors hover:border-emerald-500 ${
                expandedIter === it.rlm_iter
                  ? "border-emerald-500 ring-1 ring-emerald-500"
                  : it.has_final_answer
                  ? "border-emerald-600"
                  : "border-gray-700"
              }`}
              onClick={() =>
                // Clicking the expanded card again collapses it.
                setExpandedIter(expandedIter === it.rlm_iter ? null : it.rlm_iter)
              }
            >
              <div className="flex items-center justify-between mb-2">
                <span className="bg-gray-700 text-gray-200 text-xs font-mono px-2 py-0.5 rounded">
                  iter {it.rlm_iter}
                </span>
                <div className="flex gap-1">
                  {it.has_code_blocks && (
                    <span className="bg-emerald-900 text-emerald-300 text-xs px-1.5 py-0.5 rounded">
                      {it.n_code_blocks} code
                    </span>
                  )}
                  {it.has_final_answer && (
                    <span className="bg-amber-900 text-amber-300 text-xs px-1.5 py-0.5 rounded">
                      FINAL
                    </span>
                  )}
                </div>
              </div>

              <div className="flex justify-between text-xs text-gray-500 mb-2">
                <span>{((it.input_tokens + it.output_tokens) / 1000).toFixed(1)}k tok</span>
                {/* Per-iteration timing can be null in the payload; show 0.0s instead of throwing. */}
                <span>{(it.execution_time ?? 0).toFixed(1)}s</span>
              </div>

              <div className="text-xs text-gray-400 line-clamp-3 leading-relaxed">
                {it.response_preview || "(empty)"}
              </div>
            </div>
          ))}
        </div>
      </div>

      {/* Expanded iteration detail */}
      {expandedIter !== null && iterDetail && (
        <IterationDetail data={iterDetail} />
      )}

      {/* Final answer if present */}
      {data.final_answer && (
        <div className="bg-emerald-950 border border-emerald-700 rounded-lg p-4">
          <div className="text-xs font-semibold text-emerald-400 mb-2">Final Answer</div>
          <div className="text-sm text-gray-200 whitespace-pre-wrap max-h-60 overflow-y-auto">
            {data.final_answer}
          </div>
        </div>
      )}
    </div>
  );
}
frontend/src/rlm-eval/components/IterationDetail.tsx ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState } from "react";
2
+ import type { RlmIterDetail } from "../types";
3
+
4
interface IterationDetailProps {
  data: RlmIterDetail;
}

/**
 * Turn the stored prompt into a list of chat messages.
 *
 * When the prompt is a JSON array, each element becomes one message; anything
 * else is shown as a single "raw" message. `content` is always returned as a
 * string so the renderer can measure and slice it safely.
 */
function parsePromptMessages(promptStr: string): { role: string; content: string }[] {
  // The payload can carry a null prompt at runtime despite the declared type.
  const raw = promptStr ?? "";
  try {
    const parsed: unknown = JSON.parse(raw);
    if (Array.isArray(parsed)) {
      return parsed.map((msg) => ({
        role: typeof msg?.role === "string" ? msg.role : "raw",
        // Non-string content (e.g. structured parts) is shown as JSON text
        // rather than crashing on `.length` / `.slice`.
        content:
          typeof msg?.content === "string"
            ? msg.content
            : JSON.stringify(msg?.content ?? msg, null, 2) ?? "",
      }));
    }
  } catch { /* not JSON */ }
  return [{ role: "raw", content: raw }];
}

// Border/background classes per message role; unknown roles fall back to "raw".
const roleColors: Record<string, string> = {
  system: "border-violet-500 bg-violet-950",
  user: "border-emerald-500 bg-emerald-950",
  assistant: "border-sky-500 bg-sky-950",
  raw: "border-gray-500 bg-gray-900",
};

/**
 * Full view of one iteration: stats, collapsible prompt messages, the
 * response, any code blocks with their stdout, and the final answer if set.
 */
export default function IterationDetail({ data }: IterationDetailProps) {
  const [promptExpanded, setPromptExpanded] = useState(false);
  const messages = parsePromptMessages(data.prompt);

  return (
    <div className="space-y-4 border border-gray-700 rounded-lg p-4 bg-gray-900">
      {/* Stats */}
      <div className="flex gap-4 text-xs text-gray-400">
        <span>Model: <span className="text-gray-200">{data.model}</span></span>
        <span>In: <span className="text-emerald-300">{(data.input_tokens / 1000).toFixed(1)}k</span></span>
        <span>Out: <span className="text-emerald-300">{(data.output_tokens / 1000).toFixed(1)}k</span></span>
        {/* Timing can be null in the payload; show 0.0s instead of throwing. */}
        <span>Time: <span className="text-gray-200">{(data.execution_time ?? 0).toFixed(1)}s</span></span>
      </div>

      {/* Prompt section (collapsible) */}
      <div>
        <button
          className="flex items-center gap-2 text-sm font-semibold text-gray-300 hover:text-gray-100 mb-2"
          onClick={() => setPromptExpanded(!promptExpanded)}
        >
          <span className={`transform transition-transform ${promptExpanded ? "rotate-90" : ""}`}>
            &#9654;
          </span>
          Prompt ({messages.length} messages)
        </button>
        {promptExpanded && (
          <div className="space-y-2 ml-4">
            {messages.map((msg, i) => (
              <div
                key={i}
                className={`border-l-2 rounded-r-lg px-3 py-2 ${roleColors[msg.role] || roleColors.raw}`}
              >
                <div className="text-xs font-semibold text-gray-400 mb-1 uppercase">{msg.role}</div>
                <div className="text-sm text-gray-200 whitespace-pre-wrap max-h-96 overflow-y-auto">
                  {/* Very long messages are cut at 8000 characters to keep rendering light. */}
                  {msg.content.length > 8000 ? msg.content.slice(0, 8000) + "\n...(truncated)" : msg.content}
                </div>
              </div>
            ))}
          </div>
        )}
      </div>

      {/* Response */}
      <div>
        <div className="text-sm font-semibold text-gray-300 mb-2">Response</div>
        <div className="bg-gray-800 border border-gray-700 rounded-lg p-3">
          <div className="text-sm text-gray-200 whitespace-pre-wrap max-h-96 overflow-y-auto font-mono">
            {data.response}
          </div>
        </div>
      </div>

      {/* Code Blocks */}
      {data.code_blocks.length > 0 && (
        <div>
          <div className="text-sm font-semibold text-gray-300 mb-2">
            Code Blocks ({data.code_blocks.length})
          </div>
          <div className="space-y-3">
            {data.code_blocks.map((cb, i) => (
              <div key={i} className="border border-gray-700 rounded-lg overflow-hidden">
                <div className="bg-gray-800 px-3 py-1.5 text-xs text-gray-400 border-b border-gray-700 flex items-center gap-2">
                  <span className="text-emerald-400 font-mono">python</span>
                  <span>Block {i + 1}</span>
                </div>
                <pre className="bg-gray-900 p-3 text-sm text-gray-200 overflow-x-auto font-mono leading-relaxed">
                  {cb.code}
                </pre>
                {cb.stdout && (
                  <div className="border-t border-gray-700">
                    <div className="bg-gray-800 px-3 py-1 text-xs text-gray-400">stdout</div>
                    <pre className="bg-emerald-950 p-3 text-sm text-emerald-200 overflow-x-auto font-mono">
                      {cb.stdout}
                    </pre>
                  </div>
                )}
              </div>
            ))}
          </div>
        </div>
      )}

      {/* Final Answer */}
      {data.final_answer && (
        <div className="bg-emerald-950 border border-emerald-700 rounded-lg p-4">
          <div className="text-xs font-semibold text-emerald-400 mb-2">Final Answer</div>
          <div className="text-sm text-gray-200 whitespace-pre-wrap max-h-96 overflow-y-auto">
            {data.final_answer}
          </div>
        </div>
      )}
    </div>
  );
}
frontend/src/rlm-eval/components/OverviewLevel.tsx ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useEffect, useState } from "react";
2
+ import type { OverviewData, PanelNav } from "../types";
3
+
4
/** Props accepted by the `OverviewLevel` component. */
interface OverviewLevelProps {
  /** Id of the dataset whose overview is fetched and rendered. */
  datasetId: string;
  /** Async loader called with `datasetId`; its resolved value is what gets displayed. */
  fetchOverview: (dsId: string) => Promise<OverviewData>;
  /** Invoked with a level-2 navigation target when an example card is clicked. */
  onDrillDown: (nav: PanelNav) => void;
}
9
+
10
/**
 * Level-1 panel view: experiment metadata, correctness tallies and one
 * clickable card per example of the selected dataset.
 *
 * The fetched overview is stored together with the dataset id it was requested
 * for, so that:
 *  - a response that arrives after `datasetId` changed (or after unmount) is
 *    dropped instead of overwriting the current view, and
 *  - data belonging to the previous dataset is never rendered under the new
 *    `datasetId` (which would let a card click combine the new dataset id with
 *    an example index from the old one).
 *
 * A rejected fetch shows an error message instead of an endless loading state.
 */
export default function OverviewLevel({ datasetId, fetchOverview, onDrillDown }: OverviewLevelProps) {
  const [loaded, setLoaded] = useState<{ datasetId: string; overview: OverviewData } | null>(null);
  const [failedDatasetId, setFailedDatasetId] = useState<string | null>(null);

  useEffect(() => {
    // Flipped by the cleanup below; guards against out-of-order responses.
    let cancelled = false;

    fetchOverview(datasetId)
      .then((overview) => {
        if (cancelled) return;
        // Reuse the previous state object when nothing changed so that a
        // re-run caused only by a new `fetchOverview` identity does not
        // trigger an extra render.
        setLoaded((prev) =>
          prev && prev.datasetId === datasetId && prev.overview === overview
            ? prev
            : { datasetId, overview }
        );
        setFailedDatasetId(null);
      })
      .catch(() => {
        if (!cancelled) setFailedDatasetId(datasetId);
      });

    return () => {
      cancelled = true;
    };
  }, [datasetId, fetchOverview]);

  // Only trust the stored overview if it belongs to the dataset being shown.
  const data = loaded && loaded.datasetId === datasetId ? loaded.overview : null;

  if (!data) {
    if (failedDatasetId === datasetId) {
      return <div className="p-4 text-red-400">Failed to load overview.</div>;
    }
    return <div className="p-4 text-gray-400">Loading overview...</div>;
  }

  const correctCount = data.examples.filter((ex) => ex.eval_correct === true).length;
  const incorrectCount = data.examples.filter((ex) => ex.eval_correct === false).length;
  const unknownCount = data.examples.filter((ex) => ex.eval_correct === null || ex.eval_correct === undefined).length;

  return (
    <div className="p-4 space-y-3">
      {/* Experiment metadata */}
      <div className="flex gap-3 text-xs text-gray-400">
        <span>Model: <span className="text-gray-200">{data.metadata.model}</span></span>
        <span>Method: <span className="text-gray-200">{data.metadata.method}</span></span>
        <span>Run: <span className="text-gray-200">{data.metadata.run_id}</span></span>
      </div>

      {/* Summary stats */}
      <div className="flex gap-3 text-xs text-gray-400">
        <span>{data.examples.length} examples</span>
        {correctCount > 0 && (
          <span className="text-emerald-400">{correctCount} correct</span>
        )}
        {incorrectCount > 0 && (
          <span className="text-red-400">{incorrectCount} incorrect</span>
        )}
        {unknownCount > 0 && (
          <span className="text-gray-500">{unknownCount} unknown</span>
        )}
      </div>

      {/* Example cards */}
      <div className="space-y-2">
        {data.examples.map((ex) => (
          <div
            key={ex.example_idx}
            className="bg-gray-800 border border-gray-700 rounded-lg p-4 hover:border-emerald-500 cursor-pointer transition-colors"
            onClick={() =>
              onDrillDown({
                datasetId,
                level: 2,
                exampleIdx: ex.example_idx,
              })
            }
          >
            <div className="flex items-center justify-between mb-2">
              <div className="flex items-center gap-3">
                <span className="bg-emerald-600 text-white text-xs font-bold px-2 py-0.5 rounded-full">
                  Ex {ex.example_idx}
                </span>
                <span className="bg-gray-700 text-gray-300 text-xs px-2 py-0.5 rounded">
                  {ex.n_iterations} iter{ex.n_iterations !== 1 ? "s" : ""}
                </span>
                {ex.eval_correct === true && (
                  <span className="text-emerald-400 text-sm font-bold" title="Correct">
                    &#10003;
                  </span>
                )}
                {ex.eval_correct === false && (
                  <span className="text-red-400 text-sm font-bold" title="Incorrect">
                    &#10007;
                  </span>
                )}
              </div>
              <span className="text-xs text-gray-400">
                {ex.total_execution_time.toFixed(1)}s
              </span>
            </div>

            {/* Question preview */}
            <div className="text-sm text-gray-300 line-clamp-2 mb-2 leading-relaxed">
              {ex.question_text || "(no question text)"}
            </div>

            <div className="flex gap-4 text-xs text-gray-400">
              <span>
                {((ex.total_input_tokens + ex.total_output_tokens) / 1000).toFixed(1)}k tokens
              </span>
            </div>

            {ex.final_answer_preview && (
              <div className="mt-2 text-xs text-gray-500 truncate">
                Answer: {ex.final_answer_preview}
              </div>
            )}
          </div>
        ))}
      </div>
    </div>
  );
}
frontend/src/rlm-eval/components/Panel.tsx ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import type { PanelNav, DatasetInfo, OverviewData, ExampleDetailData, RlmIterDetail } from "../types";
2
+ import Breadcrumb from "./Breadcrumb";
3
+ import OverviewLevel from "./OverviewLevel";
4
+ import ExampleDetailLevel from "./ExampleDetailLevel";
5
+ import DatasetSelector from "./DatasetSelector";
6
+
7
+ interface PanelProps {
8
+ nav: PanelNav;
9
+ dataset: DatasetInfo | undefined;
10
+ panelLabel?: string;
11
+ datasets?: DatasetInfo[];
12
+ onNavigate: (nav: PanelNav) => void;
13
+ onGoUp: () => void;
14
+ onSwitchDataset?: (id: string) => void;
15
+ fetchOverview: (dsId: string) => Promise<OverviewData>;
16
+ fetchExampleDetail: (dsId: string, exampleIdx: number) => Promise<ExampleDetailData>;
17
+ fetchIterDetail: (
18
+ dsId: string,
19
+ exampleIdx: number,
20
+ rlmIter: number
21
+ ) => Promise<RlmIterDetail>;
22
+ }
23
+
24
/**
 * One navigation panel: a header (back arrow, optional label badge, breadcrumb,
 * and a dataset selector for the panel labelled "B") above a scrollable body
 * that shows either the overview (level 1) or an example's detail (level 2).
 */
export default function Panel(props: PanelProps) {
  const {
    nav,
    dataset,
    panelLabel,
    datasets,
    onNavigate,
    onGoUp,
    onSwitchDataset,
    fetchOverview,
    fetchExampleDetail,
    fetchIterDetail,
  } = props;

  // Back arrow is only offered below the top level.
  const backButton =
    nav.level > 1 ? (
      <button
        className="text-gray-400 hover:text-gray-200 text-sm"
        onClick={onGoUp}
        title="Go up"
      >
        &#8592;
      </button>
    ) : null;

  const labelBadge = panelLabel ? (
    <span className="text-xs bg-gray-700 text-gray-300 px-1.5 py-0.5 rounded">
      {panelLabel}
    </span>
  ) : null;

  // The selector needs the label "B", a dataset list and a switch handler.
  const datasetSwitcher =
    panelLabel === "B" && datasets && onSwitchDataset ? (
      <div className="ml-auto">
        <DatasetSelector
          datasets={datasets}
          currentId={nav.datasetId}
          onSelect={(id) => onSwitchDataset(id)}
        />
      </div>
    ) : null;

  // Level 2 without an example index renders an empty body.
  const content =
    nav.level === 1 ? (
      <OverviewLevel
        datasetId={nav.datasetId}
        fetchOverview={fetchOverview}
        onDrillDown={onNavigate}
      />
    ) : nav.level === 2 && nav.exampleIdx !== undefined ? (
      <ExampleDetailLevel
        datasetId={nav.datasetId}
        exampleIdx={nav.exampleIdx}
        fetchExampleDetail={fetchExampleDetail}
        fetchIterDetail={fetchIterDetail}
      />
    ) : null;

  return (
    <div className="flex flex-col h-full border border-gray-700 rounded-lg bg-gray-900 overflow-hidden">
      {/* Panel header */}
      <div className="flex items-center gap-2 px-3 py-2 border-b border-gray-700 bg-gray-800">
        {backButton}
        {labelBadge}
        <Breadcrumb nav={nav} dataset={dataset} onNavigate={onNavigate} />
        {datasetSwitcher}
      </div>

      {/* Panel content */}
      <div className="flex-1 overflow-y-auto">{content}</div>
    </div>
  );
}
frontend/src/rlm-eval/components/Sidebar.tsx ADDED
@@ -0,0 +1,388 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState } from "react";
2
+ import type { DatasetInfo, Preset } from "../types";
3
+ import { api } from "../api";
4
+
5
/** Props accepted by the `Sidebar` component. */
interface SidebarProps {
  /** Datasets currently loaded; one row is rendered per entry. */
  datasets: DatasetInfo[];
  /** Saved presets rendered as clickable chips. */
  presets: Preset[];
  /** Replaces the whole preset list (takes a value, not an updater function). */
  setPresets: (p: Preset[]) => void;
  /** Loading flags looked up by the trimmed repo string typed into the add form. */
  loading: Record<string, boolean>;
  /** Requests loading a dataset, optionally tagged with the preset it came from. */
  onAddDataset: (repo: string, config?: string, split?: string, presetId?: string, presetName?: string) => void;
  /** Requests unloading of the dataset with the given id. */
  onRemoveDataset: (id: string) => void;
  /** Called when a row's checkbox changes. */
  onToggleDataset: (id: string) => void;
  /** Called when a dataset row is clicked. */
  onSelectDataset: (id: string) => void;
  /** Stores a (new) preset name on a loaded dataset. */
  onUpdateDatasetPresetName: (dsId: string, name: string) => void;
  /** Removes the preset association from a loaded dataset. */
  onClearDatasetPreset: (dsId: string) => void;
}
17
+
18
/**
 * Left-hand sidebar of the RLM Eval visualizer.
 *
 * Sections, top to bottom: title, saved presets (searchable once there are more
 * than six), loaded datasets (toggle / bookmark-as-preset / remove), an edit
 * panel for the preset attached to the clicked dataset, and the "add dataset"
 * form.
 *
 * Changes compared with the previous revision:
 *  - Deleting a preset now clears the preset association on every loaded
 *    dataset that references it, not only on the dataset passed explicitly.
 *    Previously, deleting from the preset chip list left loaded datasets
 *    pointing at a preset id that no longer exists, so the edit panel could
 *    still try to rename it.
 *  - Preset API failures are logged instead of being silently discarded.
 *
 * NOTE(review): saving a dataset as a preset only records the preset *name* on
 * the dataset (via `onUpdateDatasetPresetName`); the new preset's id is not
 * propagated because no prop exists for it, so the row is not treated as
 * preset-backed until it is reloaded from the preset.
 */
export default function Sidebar({
  datasets,
  presets,
  setPresets,
  loading,
  onAddDataset,
  onRemoveDataset,
  onToggleDataset,
  onSelectDataset,
  onUpdateDatasetPresetName,
  onClearDatasetPreset,
}: SidebarProps) {
  // "Add dataset" form state
  const [showAddForm, setShowAddForm] = useState(false);
  const [repo, setRepo] = useState("");
  const [config, setConfig] = useState("rlm_call_traces");
  const [split, setSplit] = useState("train");
  const [presetSearch, setPresetSearch] = useState("");

  // Inline preset saving
  const [savingPresetForId, setSavingPresetForId] = useState<string | null>(null);
  const [presetName, setPresetName] = useState("");

  // Preset editing panel
  const [editingDatasetId, setEditingDatasetId] = useState<string | null>(null);
  const [editPresetName, setEditPresetName] = useState("");

  /** Submits the add form; ignores an empty repo. */
  const handleAdd = () => {
    if (!repo.trim()) return;
    onAddDataset(repo.trim(), config, split);
    setRepo("");
    setShowAddForm(false);
  };

  /** Loads the dataset described by a preset, tagging it with the preset's id and name. */
  const handleLoadPreset = (p: Preset) => {
    onAddDataset(p.repo, p.config, p.split || "train", p.id, p.name);
  };

  /** Creates a preset from a loaded dataset using the inline name input. */
  const handleSavePresetForRepo = async (ds: DatasetInfo) => {
    if (!presetName.trim()) return;
    try {
      const preset = (await api.createPreset({
        name: presetName.trim(),
        repo: ds.repo,
        config: ds.config,
        split: ds.split,
      })) as unknown as Preset;
      setPresets([...presets, preset]);
      onUpdateDatasetPresetName(ds.id, presetName.trim());
    } catch (err) {
      // Best effort: the inline input still closes, but the failure is visible in the console.
      console.error("Failed to save preset", err);
    }
    setPresetName("");
    setSavingPresetForId(null);
  };

  /** Renames an existing preset and mirrors the new name onto the dataset row. */
  const handleUpdatePreset = async (presetId: string, dsId: string) => {
    if (!editPresetName.trim()) return;
    try {
      await api.updatePreset(presetId, { name: editPresetName.trim() });
      setPresets(
        presets.map((p) => (p.id === presetId ? { ...p, name: editPresetName.trim() } : p))
      );
      onUpdateDatasetPresetName(dsId, editPresetName.trim());
    } catch (err) {
      console.error("Failed to update preset", err);
    }
    setEditingDatasetId(null);
  };

  /**
   * Deletes a preset and detaches it from loaded datasets.
   *
   * The preset is removed from the local list even if the API call fails
   * (unchanged best-effort behavior); the failure is logged.
   */
  const handleDeletePreset = async (id: string, dsId?: string) => {
    await api.deletePreset(id).catch((err) => {
      console.error("Failed to delete preset", err);
    });
    setPresets(presets.filter((p) => p.id !== id));

    // Every loaded dataset that referenced the preset must forget it,
    // plus the explicitly named dataset (kept for backward compatibility).
    const staleDatasetIds = new Set(
      datasets.filter((d) => d.presetId === id).map((d) => d.id)
    );
    if (dsId) staleDatasetIds.add(dsId);
    staleDatasetIds.forEach((staleId) => onClearDatasetPreset(staleId));

    setEditingDatasetId(null);
  };

  // Case-insensitive match on preset name or repo; empty search shows everything.
  const filteredPresets = presetSearch
    ? presets.filter(
        (p) =>
          p.name.toLowerCase().includes(presetSearch.toLowerCase()) ||
          p.repo.toLowerCase().includes(presetSearch.toLowerCase())
      )
    : presets;

  return (
    <div className="w-64 min-w-64 bg-gray-900 border-r border-gray-700 flex flex-col h-full overflow-hidden">
      {/* Header */}
      <div className="p-3 border-b border-gray-700">
        <h1 className="text-sm font-bold tracking-wide text-gray-200">RLM Eval Visualizer</h1>
      </div>

      {/* Presets section */}
      <div className="p-3 border-b border-gray-700">
        <div className="text-xs font-semibold text-gray-400 uppercase tracking-wider mb-2">
          Presets
        </div>
        {presets.length === 0 ? (
          <p className="text-xs text-gray-500 italic">No presets saved</p>
        ) : (
          <>
            {presets.length > 6 && (
              <input
                type="text"
                value={presetSearch}
                onChange={(e) => setPresetSearch(e.target.value)}
                placeholder="Search presets..."
                className="w-full px-2 py-1 mb-2 text-xs bg-gray-800 border border-gray-600 rounded text-gray-200 placeholder-gray-500 focus:border-emerald-500 focus:outline-none"
              />
            )}
            <div className="flex flex-wrap gap-1 max-h-32 overflow-y-auto">
              {filteredPresets.map((p) => (
                <div key={p.id} className="group relative">
                  <button
                    onClick={() => handleLoadPreset(p)}
                    className="px-2 py-1 text-xs bg-gray-800 hover:bg-gray-700 rounded border border-gray-600 text-gray-300 transition-colors"
                    title={`${p.repo} (${p.config}, ${p.split ?? "train"})`}
                  >
                    {p.name}
                  </button>
                  <div className="hidden group-hover:flex absolute top-full left-0 mt-1 z-10 gap-1">
                    <button
                      onClick={() => handleDeletePreset(p.id)}
                      className="px-1.5 py-0.5 text-[10px] bg-red-900 hover:bg-red-800 rounded text-red-300"
                    >
                      Delete
                    </button>
                  </div>
                </div>
              ))}
            </div>
          </>
        )}
      </div>

      {/* Loaded datasets */}
      <div className="flex-1 overflow-y-auto p-3">
        <div className="text-xs font-semibold text-gray-400 uppercase tracking-wider mb-2">
          Loaded Datasets
        </div>
        {datasets.length === 0 ? (
          <p className="text-xs text-gray-500 italic">No datasets loaded</p>
        ) : (
          <div className="space-y-1">
            {datasets.map((ds) => (
              <div key={ds.id}>
                <div
                  onClick={() => {
                    // Preset-backed rows also toggle the preset edit panel.
                    if (ds.presetId) {
                      setEditingDatasetId(editingDatasetId === ds.id ? null : ds.id);
                      setEditPresetName(ds.presetName || "");
                      setShowAddForm(false);
                    }
                    onSelectDataset(ds.id);
                  }}
                  className={`flex items-center gap-2 px-2 py-1.5 rounded text-sm transition-colors cursor-pointer ${
                    ds.active ? "bg-gray-800" : "bg-gray-900 opacity-60"
                  } ${editingDatasetId === ds.id ? "ring-1 ring-emerald-500" : "hover:bg-gray-800"}`}
                >
                  <input
                    type="checkbox"
                    checked={ds.active}
                    onChange={() => onToggleDataset(ds.id)}
                    onClick={(e) => e.stopPropagation()}
                    className="accent-emerald-500 shrink-0"
                  />
                  <div className="flex-1 min-w-0">
                    <div
                      className="text-xs font-medium text-gray-200 truncate"
                      title={ds.presetName ? `${ds.presetName}\n${ds.repo}` : ds.repo}
                    >
                      {ds.presetName || ds.name}
                    </div>
                    <div className="text-[10px] text-gray-500">
                      {ds.metadata.model} | {ds.n_examples} examples
                    </div>
                  </div>
                  {/* Save as preset bookmark */}
                  <button
                    onClick={(e) => {
                      e.stopPropagation();
                      setSavingPresetForId(savingPresetForId === ds.id ? null : ds.id);
                      setPresetName(ds.presetName || ds.name);
                    }}
                    className={`transition-colors shrink-0 ${
                      savingPresetForId === ds.id
                        ? "text-emerald-400"
                        : ds.presetId
                          ? "text-emerald-500"
                          : "text-gray-600 hover:text-emerald-400"
                    }`}
                    title={ds.presetId ? "Saved as preset" : "Save as preset"}
                  >
                    <svg
                      className="w-3.5 h-3.5"
                      fill={ds.presetId ? "currentColor" : "none"}
                      viewBox="0 0 24 24"
                      stroke="currentColor"
                    >
                      <path
                        strokeLinecap="round"
                        strokeLinejoin="round"
                        strokeWidth={2}
                        d="M5 5a2 2 0 012-2h10a2 2 0 012 2v16l-7-3.5L5 21V5z"
                      />
                    </svg>
                  </button>
                  {/* Remove */}
                  <button
                    onClick={(e) => {
                      e.stopPropagation();
                      onRemoveDataset(ds.id);
                    }}
                    className="text-gray-600 hover:text-red-400 transition-colors shrink-0"
                    title="Remove"
                  >
                    <svg
                      className="w-3.5 h-3.5"
                      fill="none"
                      viewBox="0 0 24 24"
                      stroke="currentColor"
                    >
                      <path
                        strokeLinecap="round"
                        strokeLinejoin="round"
                        strokeWidth={2}
                        d="M6 18L18 6M6 6l12 12"
                      />
                    </svg>
                  </button>
                </div>

                {/* Inline preset name input */}
                {savingPresetForId === ds.id && (
                  <div className="flex gap-1 mt-1 ml-6">
                    <input
                      type="text"
                      value={presetName}
                      onChange={(e) => setPresetName(e.target.value)}
                      onKeyDown={(e) => {
                        if (e.key === "Enter") handleSavePresetForRepo(ds);
                        if (e.key === "Escape") setSavingPresetForId(null);
                      }}
                      placeholder="Preset name..."
                      className="flex-1 px-2 py-1 text-xs bg-gray-800 border border-gray-600 rounded text-gray-200 placeholder-gray-500 focus:border-emerald-500 focus:outline-none"
                      autoFocus
                    />
                    <button
                      onClick={() => handleSavePresetForRepo(ds)}
                      className="px-2 py-1 text-xs bg-emerald-600 hover:bg-emerald-500 rounded text-white"
                    >
                      Save
                    </button>
                  </div>
                )}
              </div>
            ))}
          </div>
        )}
      </div>

      {/* Preset edit panel */}
      {editingDatasetId &&
        (() => {
          const editDs = datasets.find((d) => d.id === editingDatasetId);
          // Nothing to edit if the dataset is gone or no longer preset-backed.
          if (!editDs?.presetId) return null;
          return (
            <div className="p-3 border-t border-gray-700 space-y-2">
              <div className="text-[10px] text-gray-500 uppercase font-semibold tracking-wider">
                Edit Preset
              </div>
              <input
                type="text"
                value={editPresetName}
                onChange={(e) => setEditPresetName(e.target.value)}
                onKeyDown={(e) => {
                  if (e.key === "Enter" && editPresetName.trim()) {
                    handleUpdatePreset(editDs.presetId!, editDs.id);
                  }
                  if (e.key === "Escape") setEditingDatasetId(null);
                }}
                placeholder="Preset name..."
                className="w-full px-2 py-1 text-xs bg-gray-800 border border-gray-600 rounded text-gray-200 placeholder-gray-500 focus:border-emerald-500 focus:outline-none"
                autoFocus
              />
              <div className="flex gap-2">
                <button
                  onClick={() => handleUpdatePreset(editDs.presetId!, editDs.id)}
                  disabled={!editPresetName.trim()}
                  className="flex-1 px-2 py-1 text-xs bg-emerald-600 hover:bg-emerald-500 disabled:bg-gray-700 disabled:text-gray-500 rounded text-white transition-colors"
                >
                  Save
                </button>
                <button
                  onClick={() => handleDeletePreset(editDs.presetId!, editDs.id)}
                  className="px-2 py-1 text-xs bg-red-900 hover:bg-red-800 rounded text-red-300 transition-colors"
                >
                  Delete
                </button>
                <button
                  onClick={() => setEditingDatasetId(null)}
                  className="px-2 py-1 text-xs bg-gray-700 hover:bg-gray-600 rounded text-gray-300 transition-colors"
                >
                  Cancel
                </button>
              </div>
            </div>
          );
        })()}

      {/* Add Dataset Form */}
      <div className="p-3 border-t border-gray-700">
        {showAddForm ? (
          <div className="space-y-2">
            <input
              className="w-full bg-gray-800 text-sm text-gray-200 rounded px-2 py-1.5 border border-gray-600 focus:border-emerald-500 outline-none"
              placeholder="org/repo-name"
              value={repo}
              onChange={(e) => setRepo(e.target.value)}
              onKeyDown={(e) => e.key === "Enter" && handleAdd()}
              autoFocus
            />
            <div className="flex gap-2">
              <input
                className="flex-1 bg-gray-800 text-xs text-gray-200 rounded px-2 py-1 border border-gray-600 focus:border-emerald-500 outline-none"
                placeholder="Config"
                value={config}
                onChange={(e) => setConfig(e.target.value)}
              />
              <input
                className="w-16 bg-gray-800 text-xs text-gray-200 rounded px-2 py-1 border border-gray-600 focus:border-emerald-500 outline-none"
                placeholder="Split"
                value={split}
                onChange={(e) => setSplit(e.target.value)}
              />
            </div>
            <div className="flex gap-2">
              <button
                className="flex-1 px-2 py-1.5 text-sm bg-emerald-600 hover:bg-emerald-500 disabled:bg-gray-700 disabled:text-gray-500 rounded text-white transition-colors"
                onClick={handleAdd}
                disabled={!repo.trim() || !!loading[repo.trim()]}
              >
                {loading[repo.trim()] ? "Loading..." : "Load"}
              </button>
              <button
                className="px-3 py-1.5 text-sm bg-gray-700 hover:bg-gray-600 rounded text-gray-300 transition-colors"
                onClick={() => setShowAddForm(false)}
              >
                Cancel
              </button>
            </div>
          </div>
        ) : (
          <button
            className="w-full px-3 py-2 text-sm bg-emerald-600 hover:bg-emerald-500 rounded text-white font-medium transition-colors"
            onClick={() => {
              // Opening the form closes the preset editor and resets the fields.
              setEditingDatasetId(null);
              setShowAddForm(true);
              setRepo("");
              setConfig("rlm_call_traces");
              setSplit("train");
            }}
          >
            + Add Dataset
          </button>
        )}
      </div>
    </div>
  );
}
frontend/src/rlm-eval/store.ts ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { useState, useCallback, useEffect, useMemo } from "react";
2
+ import type {
3
+ DatasetInfo,
4
+ Preset,
5
+ PanelNav,
6
+ OverviewData,
7
+ ExampleDetailData,
8
+ RlmIterDetail,
9
+ } from "./types";
10
+ import { api } from "./api";
11
+
12
/**
 * Returns a copy of `cache` without the entries that belong to dataset `dsId`.
 * Cache keys are either the dataset id itself or start with `"<dsId>:"`.
 */
function dropDatasetEntries<T>(cache: Record<string, T>, dsId: string): Record<string, T> {
  const prefix = `${dsId}:`;
  const kept: Record<string, T> = {};
  for (const [key, value] of Object.entries(cache)) {
    if (key !== dsId && !key.startsWith(prefix)) kept[key] = value;
  }
  return kept;
}

/**
 * Central state hook of the RLM Eval visualizer.
 *
 * Owns the loaded datasets, presets, the navigation state of panels A and B,
 * the comparison-mode flag, and three response caches (overview, example
 * detail, iteration detail) keyed by `dsId`, `dsId:exampleIdx` and
 * `dsId:exampleIdx:rlmIter` respectively.
 *
 * Changes compared with the previous revision:
 *  - `toggleComparison` no longer calls `setPanelB` from inside the
 *    `setComparisonMode` updater; React requires updater functions to be pure
 *    and may invoke them more than once.
 *  - `removeDataset` drops the removed dataset's cache entries, so loading the
 *    same dataset again refetches instead of serving data cached before it
 *    was unloaded.
 */
export function useAppState() {
  const [datasets, setDatasets] = useState<DatasetInfo[]>([]);
  const [presets, setPresets] = useState<Preset[]>([]);
  const [error, setError] = useState<string | null>(null);
  // Loading flags keyed by the repo string passed to `addDataset`.
  const [loading, setLoading] = useState<Record<string, boolean>>({});

  // Dual panel navigation
  const [panelA, setPanelA] = useState<PanelNav | null>(null);
  const [panelB, setPanelB] = useState<PanelNav | null>(null);
  const [comparisonMode, setComparisonMode] = useState(false);

  // Data caches
  const [overviewCache, setOverviewCache] = useState<Record<string, OverviewData>>({});
  const [exampleDetailCache, setExampleDetailCache] = useState<Record<string, ExampleDetailData>>({});
  const [iterDetailCache, setIterDetailCache] = useState<Record<string, RlmIterDetail>>({});

  // Load presets on mount (failures are ignored; the list just stays empty)
  useEffect(() => {
    api.listPresets().then((data) => setPresets(data as unknown as Preset[])).catch(() => {});
  }, []);

  const activeDatasets = useMemo(() => datasets.filter((d) => d.active), [datasets]);

  // Data fetching helpers: return the cached value when present, otherwise
  // fetch, store in the cache, and return the fresh value.
  const fetchOverview = useCallback(async (dsId: string) => {
    if (overviewCache[dsId]) return overviewCache[dsId];
    const data = (await api.getOverview(dsId)) as unknown as OverviewData;
    setOverviewCache((prev) => ({ ...prev, [dsId]: data }));
    return data;
  }, [overviewCache]);

  const fetchExampleDetail = useCallback(
    async (dsId: string, exampleIdx: number) => {
      const key = `${dsId}:${exampleIdx}`;
      if (exampleDetailCache[key]) return exampleDetailCache[key];
      const data = (await api.getExampleDetail(dsId, exampleIdx)) as unknown as ExampleDetailData;
      setExampleDetailCache((prev) => ({ ...prev, [key]: data }));
      return data;
    },
    [exampleDetailCache]
  );

  const fetchIterDetail = useCallback(
    async (dsId: string, exampleIdx: number, rlmIter: number) => {
      const key = `${dsId}:${exampleIdx}:${rlmIter}`;
      if (iterDetailCache[key]) return iterDetailCache[key];
      const data = (await api.getIterDetail(dsId, exampleIdx, rlmIter)) as unknown as RlmIterDetail;
      setIterDetailCache((prev) => ({ ...prev, [key]: data }));
      return data;
    },
    [iterDetailCache]
  );

  // Dataset operations
  const addDataset = useCallback(
    async (repo: string, config?: string, split?: string, presetId?: string, presetName?: string) => {
      setLoading((prev) => ({ ...prev, [repo]: true }));
      setError(null);
      try {
        const result = await api.loadDataset(repo, config, split);
        const dsInfo: DatasetInfo = {
          id: result.id,
          repo: result.repo,
          name: result.name,
          config: result.config,
          split: result.split,
          metadata: result.metadata as unknown as DatasetInfo["metadata"],
          n_examples: result.n_examples,
          n_rows: result.n_rows,
          active: true,
          presetId,
          presetName,
        };

        // A dataset that is already loaded (same id) is left untouched.
        setDatasets((prev) => {
          if (prev.some((d) => d.id === dsInfo.id)) return prev;
          return [...prev, dsInfo];
        });

        // Auto-set panel A if not set
        setPanelA((prev) => prev || { datasetId: dsInfo.id, level: 1 });
      } catch (e: unknown) {
        setError(e instanceof Error ? e.message : "Failed to load dataset");
      } finally {
        setLoading((prev) => ({ ...prev, [repo]: false }));
      }
    },
    []
  );

  const removeDataset = useCallback(async (id: string) => {
    // Best effort: the dataset is removed locally even if the unload call fails.
    await api.unloadDataset(id).catch(() => {});
    setDatasets((prev) => prev.filter((d) => d.id !== id));
    setPanelA((prev) => (prev?.datasetId === id ? null : prev));
    setPanelB((prev) => (prev?.datasetId === id ? null : prev));
    // Forget everything cached for this dataset so a later reload refetches.
    setOverviewCache((prev) => dropDatasetEntries(prev, id));
    setExampleDetailCache((prev) => dropDatasetEntries(prev, id));
    setIterDetailCache((prev) => dropDatasetEntries(prev, id));
  }, []);

  const toggleDataset = useCallback((id: string) => {
    setDatasets((prev) => prev.map((d) => (d.id === id ? { ...d, active: !d.active } : d)));
  }, []);

  // Navigation
  const navigatePanel = useCallback(
    (panel: "A" | "B", nav: PanelNav) => {
      if (panel === "A") setPanelA(nav);
      else setPanelB(nav);
    },
    []
  );

  /** Moves a panel from the example level back to the overview level. */
  const goUp = useCallback((panel: "A" | "B") => {
    const setter = panel === "A" ? setPanelA : setPanelB;
    setter((prev) => {
      if (!prev) return prev;
      if (prev.level === 2) return { ...prev, level: 1, exampleIdx: undefined };
      return prev;
    });
  }, []);

  const updateDatasetPresetName = useCallback((dsId: string, name: string) => {
    setDatasets((prev) => prev.map((d) => (d.id === dsId ? { ...d, presetName: name } : d)));
  }, []);

  const clearDatasetPreset = useCallback((dsId: string) => {
    setDatasets((prev) =>
      prev.map((d) => (d.id === dsId ? { ...d, presetId: undefined, presetName: undefined } : d))
    );
  }, []);

  /**
   * Flips comparison mode. Entering it seeds panel B with a copy of panel A
   * (when panel A is set); leaving it clears panel B.
   */
  const toggleComparison = useCallback(() => {
    const entering = !comparisonMode;
    setComparisonMode(entering);
    if (entering) {
      // Entering comparison: initialize panel B same as A
      if (panelA) setPanelB({ ...panelA });
    } else {
      setPanelB(null);
    }
  }, [comparisonMode, panelA]);

  return {
    datasets,
    presets,
    setPresets,
    error,
    setError,
    loading,
    activeDatasets,
    panelA,
    panelB,
    comparisonMode,
    addDataset,
    removeDataset,
    toggleDataset,
    updateDatasetPresetName,
    clearDatasetPreset,
    navigatePanel,
    goUp,
    toggleComparison,
    fetchOverview,
    fetchExampleDetail,
    fetchIterDetail,
    overviewCache,
    exampleDetailCache,
    iterDetailCache,
  };
}
frontend/src/rlm-eval/types.ts ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/** Experiment-level labels shown in the overview header (model, method, run). */
export interface ExperimentMetadata {
  run_id: string;
  method: string;
  model: string;
}
6
+
7
/** A dataset loaded into the app, as kept in the store's `datasets` list. */
export interface DatasetInfo {
  id: string;
  repo: string;
  name: string;
  config: string;
  split: string;
  metadata: ExperimentMetadata;
  n_examples: number;
  n_rows: number;
  /** UI flag toggled by the sidebar checkbox. */
  active: boolean;
  /** Id of the preset this dataset was loaded from, if any. */
  presetId?: string;
  /** Display name of the associated preset; shown instead of `name` when set. */
  presetName?: string;
}
20
+
21
/** One example as listed on the overview level (one card per entry). */
export interface ExampleSummary {
  example_idx: number;
  question_text: string;
  /** `true` / `false` when a verdict exists; `null` is counted as "unknown". */
  eval_correct: boolean | null;
  n_iterations: number;
  total_input_tokens: number;
  total_output_tokens: number;
  /** Rendered with an "s" suffix on the overview card. */
  total_execution_time: number;
  final_answer_preview: string;
}
31
+
32
/** Payload rendered by the overview level: experiment labels plus all example summaries. */
export interface OverviewData {
  metadata: ExperimentMetadata;
  examples: ExampleSummary[];
}
36
+
37
/** Condensed description of one RLM iteration, as listed inside an example's detail. */
export interface RlmIterSummary {
  rlm_iter: number;
  model: string;
  input_tokens: number;
  output_tokens: number;
  execution_time: number;
  has_code_blocks: boolean;
  n_code_blocks: number;
  response_preview: string;
  has_final_answer: boolean;
  timestamp: string;
}
49
+
50
/** Full detail of a single example, including a summary of each of its iterations. */
export interface ExampleDetailData {
  example_idx: number;
  question_text: string;
  eval_correct: boolean | null;
  total_input_tokens: number;
  total_output_tokens: number;
  total_execution_time: number;
  final_answer: string | null;
  iterations: RlmIterSummary[];
}
60
+
61
/** A code block attached to an iteration, with its captured stdout when present. */
export interface CodeBlock {
  code: string;
  stdout?: string;
}
65
+
66
/** Full detail of one RLM iteration: prompt, response, code blocks and final answer. */
export interface RlmIterDetail {
  rlm_iter: number;
  prompt: string;
  response: string;
  model: string;
  input_tokens: number;
  output_tokens: number;
  execution_time: number;
  has_code_blocks: boolean;
  code_blocks: CodeBlock[];
  final_answer: string | null;
  timestamp: string;
}
79
+
80
/** A saved dataset reference that can be loaded again from the sidebar. */
export interface Preset {
  id: string;
  name: string;
  repo: string;
  config: string;
  /** Optional; the sidebar falls back to "train" when it is missing. */
  split?: string;
}
87
+
88
/** Navigation state of one panel. */
export interface PanelNav {
  datasetId: string;
  /** 1 = dataset overview, 2 = detail of a single example. */
  level: 1 | 2;
  /** Example shown at level 2; the store resets it to `undefined` when going back up to level 1. */
  exampleIdx?: number;
}