Zayne Rea Sprague committed on
Commit ·
b630916
1
Parent(s): cdf803d
new tab
Browse files- backend/api/presets.py +2 -2
- backend/api/rlm_eval_datasets.py +247 -0
- backend/app.py +2 -1
- frontend/src/App.tsx +9 -2
- frontend/src/rlm-eval/RlmEvalApp.tsx +133 -0
- frontend/src/rlm-eval/api.ts +74 -0
- frontend/src/rlm-eval/components/Breadcrumb.tsx +45 -0
- frontend/src/rlm-eval/components/DatasetSelector.tsx +23 -0
- frontend/src/rlm-eval/components/ExampleDetailLevel.tsx +137 -0
- frontend/src/rlm-eval/components/IterationDetail.tsx +116 -0
- frontend/src/rlm-eval/components/OverviewLevel.tsx +105 -0
- frontend/src/rlm-eval/components/Panel.tsx +86 -0
- frontend/src/rlm-eval/components/Sidebar.tsx +388 -0
- frontend/src/rlm-eval/store.ts +179 -0
- frontend/src/rlm-eval/types.ts +92 -0
backend/api/presets.py
CHANGED
|
@@ -8,7 +8,7 @@ from flask import Blueprint, request, jsonify
|
|
| 8 |
bp = Blueprint("presets", __name__, url_prefix="/api/presets")
|
| 9 |
|
| 10 |
PRESETS_REPO = "reasoning-degeneration-dev/AGG_VIS_PRESETS"
|
| 11 |
-
VALID_TYPES = {"model", "arena", "rlm", "harbor"}
|
| 12 |
LOCAL_PRESETS_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "presets")
|
| 13 |
|
| 14 |
# In-memory cache: vis_type -> list[dict]
|
|
@@ -133,7 +133,7 @@ def create_preset(vis_type):
|
|
| 133 |
|
| 134 |
if vis_type == "model":
|
| 135 |
preset["column"] = data.get("column", "model_responses")
|
| 136 |
-
elif vis_type
|
| 137 |
preset["config"] = data.get("config", "rlm_call_traces")
|
| 138 |
|
| 139 |
presets = _get_presets(vis_type)
|
|
|
|
| 8 |
bp = Blueprint("presets", __name__, url_prefix="/api/presets")
|
| 9 |
|
| 10 |
PRESETS_REPO = "reasoning-degeneration-dev/AGG_VIS_PRESETS"
|
| 11 |
+
VALID_TYPES = {"model", "arena", "rlm", "rlm-eval", "harbor"}
|
| 12 |
LOCAL_PRESETS_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "presets")
|
| 13 |
|
| 14 |
# In-memory cache: vis_type -> list[dict]
|
|
|
|
| 133 |
|
| 134 |
if vis_type == "model":
|
| 135 |
preset["column"] = data.get("column", "model_responses")
|
| 136 |
+
elif vis_type in ("rlm", "rlm-eval"):
|
| 137 |
preset["config"] = data.get("config", "rlm_call_traces")
|
| 138 |
|
| 139 |
presets = _get_presets(vis_type)
|
backend/api/rlm_eval_datasets.py
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import hashlib
|
| 3 |
+
from flask import Blueprint, request, jsonify
|
| 4 |
+
from datasets import load_dataset
|
| 5 |
+
|
| 6 |
+
bp = Blueprint("rlm_eval_datasets", __name__, url_prefix="/api/rlm-eval/datasets")
|
| 7 |
+
|
| 8 |
+
_cache: dict[str, dict] = {}
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def _make_id(repo: str, config: str, split: str) -> str:
    """Return a short, deterministic identifier for a (repo, config, split) triple.

    The identifier is the first 12 hex characters of the MD5 digest of
    ``"{repo}:{config}:{split}"``.

    MD5 serves only as a fingerprint here, not as a security primitive, so the
    hash is created with ``usedforsecurity=False``. Without that flag
    ``hashlib.md5`` can raise ``ValueError`` on FIPS-restricted Python builds.
    The resulting digest is unchanged.
    """
    key = f"{repo}:{config}:{split}"
    digest = hashlib.md5(key.encode(), usedforsecurity=False)
    return digest.hexdigest()[:12]
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def _parse_code_blocks(raw) -> list:
    """Decode a row's ``code_blocks_json`` value into a list of code blocks.

    ``raw`` may be a JSON string or an already-decoded value. Empty values,
    malformed JSON, and payloads that do not decode to a list (for example the
    JSON literal ``null``) all yield ``[]``, so callers can rely on ``len()``.
    """
    if not raw or raw == "[]":
        return []
    if isinstance(raw, str):
        try:
            raw = json.loads(raw)
        except (json.JSONDecodeError, TypeError):
            return []
    return raw if isinstance(raw, list) else []


def _build_hierarchy(rows: list[dict]) -> dict:
    """Reconstruct hierarchy from flat rows: examples -> iterations.

    Rows are grouped by ``example_idx`` (default 0). Within an example, each
    row becomes one iteration keyed by ``rlm_iter`` (default 0); a later row
    with the same ``rlm_iter`` replaces the earlier one. Per-example token and
    time totals are accumulated across every row seen, with missing or null
    counters treated as zero.

    ``question_text`` and ``eval_correct`` are taken from the first row seen
    for an example. ``final_answer`` is taken from the last row, in input
    order, that carries a truthy ``final_answer``.

    Returns:
        ``{"examples": [...]}`` with examples sorted by ``example_idx`` and
        each example's ``iterations`` converted to a list sorted by
        ``rlm_iter``.
    """
    examples: dict[int, dict] = {}

    for row in rows:
        ei = row.get("example_idx", 0)
        ri = row.get("rlm_iter", 0)

        if ei not in examples:
            examples[ei] = {
                "example_idx": ei,
                "question_text": row.get("question_text", ""),
                "eval_correct": row.get("eval_correct"),
                "iterations": {},
                "total_input_tokens": 0,
                "total_output_tokens": 0,
                "total_execution_time": 0.0,
                "final_answer": None,
                "final_answer_preview": "",
            }
        ex = examples[ei]

        iteration = {
            "rlm_iter": ri,
            "prompt": row.get("prompt", ""),
            "response": row.get("response", ""),
            "model": row.get("model", ""),
            "input_tokens": row.get("input_tokens", 0),
            "output_tokens": row.get("output_tokens", 0),
            "execution_time": row.get("execution_time", 0.0),
            "has_code_blocks": row.get("has_code_blocks", False),
            "code_blocks": _parse_code_blocks(row.get("code_blocks_json", "")),
            "final_answer": row.get("final_answer"),
            "timestamp": row.get("timestamp", ""),
        }

        ex["iterations"][ri] = iteration
        # ``or`` guards against explicit nulls stored in the dataset columns.
        ex["total_input_tokens"] += iteration["input_tokens"] or 0
        ex["total_output_tokens"] += iteration["output_tokens"] or 0
        ex["total_execution_time"] += iteration["execution_time"] or 0.0

        final_answer = iteration["final_answer"]
        if final_answer:
            ex["final_answer"] = final_answer
            # str() keeps the preview safe when the answer is not a string.
            ex["final_answer_preview"] = str(final_answer)[:200]

    # Sort and convert the per-example iteration dicts to ordered lists.
    result = []
    for ei_key in sorted(examples):
        ex = examples[ei_key]
        ex["iterations"] = [ex["iterations"][ri_key] for ri_key in sorted(ex["iterations"])]
        result.append(ex)

    return {"examples": result}
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
@bp.route("/load", methods=["POST"])
def load_dataset_endpoint():
    """Load a dataset, build its example hierarchy, and cache the result.

    Reads a JSON object from the request body with:
      * ``repo``   -- required dataset repository id,
      * ``config`` -- optional, defaults to ``"rlm_call_traces"``,
      * ``split``  -- optional, defaults to ``"train"``.

    Responds with a JSON summary (id, repo, name, config, split, metadata,
    n_examples, n_rows). Responds 400 with an ``error`` message when ``repo``
    is missing or the dataset cannot be loaded.
    """
    # silent=True returns None for a missing or unparsable body, so a bad
    # request falls through to the "repo is required" response below.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict):
        payload = {}

    # ``or ""`` covers an explicit JSON null; str() covers non-string values.
    repo = str(payload.get("repo") or "").strip()
    if not repo:
        return jsonify({"error": "repo is required"}), 400

    config = payload.get("config", "rlm_call_traces")
    split = payload.get("split", "train")

    try:
        ds = load_dataset(repo, config, split=split)
    except Exception as e:  # request boundary: report any loader failure to the client
        return jsonify({"error": f"Failed to load dataset: {e}"}), 400

    ds_id = _make_id(repo, config, split)
    rows = list(ds)
    hierarchy = _build_hierarchy(rows)

    # Extract run-level metadata from the first row.
    first_row = rows[0] if rows else {}
    metadata = {
        "run_id": first_row.get("run_id", ""),
        "method": first_row.get("method", ""),
        "model": first_row.get("model", ""),
    }

    _cache[ds_id] = {
        "repo": repo,
        "config": config,
        "split": split,
        "hierarchy": hierarchy,
        "metadata": metadata,
        "n_rows": len(rows),
    }

    # rsplit returns the whole string when there is no "/" separator.
    short_name = repo.rsplit("/", 1)[-1]

    return jsonify({
        "id": ds_id,
        "repo": repo,
        "name": short_name,
        "config": config,
        "split": split,
        "metadata": metadata,
        "n_examples": len(hierarchy["examples"]),
        "n_rows": len(rows),
    })
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
@bp.route("/", methods=["GET"])
def list_datasets():
    """Return a JSON array with one summary object per cached dataset."""
    summaries = [
        {
            "id": ds_id,
            "repo": entry["repo"],
            "name": entry["repo"].rsplit("/", 1)[-1] if "/" in entry["repo"] else entry["repo"],
            "config": entry["config"],
            "split": entry["split"],
            "metadata": entry["metadata"],
            "n_rows": entry["n_rows"],
            "n_examples": len(entry["hierarchy"]["examples"]),
        }
        for ds_id, entry in _cache.items()
    ]
    return jsonify(summaries)
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
@bp.route("/<ds_id>/overview", methods=["GET"])
def get_overview(ds_id):
    """Level 1: Summary of all examples.

    Responds 404 when ``ds_id`` is not in the cache. Otherwise returns the
    dataset metadata plus one compact summary per example, with the question
    text truncated to 300 characters.
    """
    if ds_id not in _cache:
        return jsonify({"error": "Dataset not loaded"}), 404

    entry = _cache[ds_id]
    example_summaries = [
        {
            "example_idx": example["example_idx"],
            "question_text": (example["question_text"] or "")[:300],
            "eval_correct": example["eval_correct"],
            "n_iterations": len(example["iterations"]),
            "total_input_tokens": example["total_input_tokens"],
            "total_output_tokens": example["total_output_tokens"],
            "total_execution_time": example["total_execution_time"],
            "final_answer_preview": example["final_answer_preview"],
        }
        for example in entry["hierarchy"]["examples"]
    ]

    return jsonify({
        "metadata": entry["metadata"],
        "examples": example_summaries,
    })
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
@bp.route("/<ds_id>/example/<int:example_idx>", methods=["GET"])
def get_example_detail(ds_id, example_idx):
    """Level 2: Iteration timeline for one example.

    Responds 404 when the dataset is not cached or no example has the given
    ``example_idx``. Otherwise returns the example's question, correctness
    flag, totals, final answer, and a lightweight entry per iteration with
    the response truncated to a 300-character preview.
    """
    if ds_id not in _cache:
        return jsonify({"error": "Dataset not loaded"}), 404

    examples = _cache[ds_id]["hierarchy"]["examples"]
    ex_data = next((ex for ex in examples if ex["example_idx"] == example_idx), None)
    if ex_data is None:
        return jsonify({"error": f"Example {example_idx} not found"}), 404

    iters = [
        {
            "rlm_iter": it["rlm_iter"],
            "model": it["model"],
            "input_tokens": it["input_tokens"],
            "output_tokens": it["output_tokens"],
            "execution_time": it["execution_time"],
            "has_code_blocks": it["has_code_blocks"],
            # ``or []`` keeps len() safe if a stored value is None.
            "n_code_blocks": len(it["code_blocks"] or []),
            "response_preview": (it["response"] or "")[:300],
            # Truthiness matches the rule _build_hierarchy uses when it picks
            # an example's final answer, so "" does not count as an answer.
            "has_final_answer": bool(it["final_answer"]),
            "timestamp": it["timestamp"],
        }
        for it in ex_data["iterations"]
    ]

    return jsonify({
        "example_idx": example_idx,
        "question_text": ex_data["question_text"],
        "eval_correct": ex_data["eval_correct"],
        "total_input_tokens": ex_data["total_input_tokens"],
        "total_output_tokens": ex_data["total_output_tokens"],
        "total_execution_time": ex_data["total_execution_time"],
        "final_answer": ex_data["final_answer"],
        "iterations": iters,
    })
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
@bp.route("/<ds_id>/example/<int:example_idx>/iter/<int:rlm_iter>", methods=["GET"])
def get_iter_detail(ds_id, example_idx, rlm_iter):
    """Full detail for a specific RLM iteration within an example.

    Responds 404 when the dataset is not cached, or when no iteration matches
    both ``example_idx`` and ``rlm_iter``.
    """
    if ds_id not in _cache:
        return jsonify({"error": "Dataset not loaded"}), 404

    examples = _cache[ds_id]["hierarchy"]["examples"]
    # First iteration, in stored order, matching both indices.
    found = next(
        (
            iteration
            for example in examples
            if example["example_idx"] == example_idx
            for iteration in example["iterations"]
            if iteration["rlm_iter"] == rlm_iter
        ),
        None,
    )
    if found is None:
        return jsonify({"error": "Iteration not found"}), 404
    return jsonify(found)
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
@bp.route("/<ds_id>", methods=["DELETE"])
def unload_dataset(ds_id):
    """Drop a dataset from the cache; reports ``ok`` whether or not it was present."""
    _cache.pop(ds_id, None)
    return jsonify({"status": "ok"})
|
backend/app.py
CHANGED
|
@@ -6,10 +6,11 @@ def create_app():
|
|
| 6 |
app = Flask(__name__, static_folder="../frontend/dist", static_url_path="/")
|
| 7 |
CORS(app)
|
| 8 |
|
| 9 |
-
from backend.api import model_datasets, arena_datasets, rlm_datasets, harbor_datasets, presets
|
| 10 |
app.register_blueprint(model_datasets.bp)
|
| 11 |
app.register_blueprint(arena_datasets.bp)
|
| 12 |
app.register_blueprint(rlm_datasets.bp)
|
|
|
|
| 13 |
app.register_blueprint(harbor_datasets.bp)
|
| 14 |
app.register_blueprint(presets.bp)
|
| 15 |
|
|
|
|
| 6 |
app = Flask(__name__, static_folder="../frontend/dist", static_url_path="/")
|
| 7 |
CORS(app)
|
| 8 |
|
| 9 |
+
from backend.api import model_datasets, arena_datasets, rlm_datasets, rlm_eval_datasets, harbor_datasets, presets
|
| 10 |
app.register_blueprint(model_datasets.bp)
|
| 11 |
app.register_blueprint(arena_datasets.bp)
|
| 12 |
app.register_blueprint(rlm_datasets.bp)
|
| 13 |
+
app.register_blueprint(rlm_eval_datasets.bp)
|
| 14 |
app.register_blueprint(harbor_datasets.bp)
|
| 15 |
app.register_blueprint(presets.bp)
|
| 16 |
|
frontend/src/App.tsx
CHANGED
|
@@ -2,15 +2,17 @@ import { useState, lazy, Suspense } from "react";
|
|
| 2 |
|
| 3 |
const ModelApp = lazy(() => import("./model/ModelApp"));
|
| 4 |
const ArenaApp = lazy(() => import("./arena/ArenaApp"));
|
|
|
|
| 5 |
const RlmApp = lazy(() => import("./rlm/RlmApp"));
|
| 6 |
const HarborApp = lazy(() => import("./harbor/HarborApp"));
|
| 7 |
|
| 8 |
-
type TabId = "model" | "arena" | "rlm" | "harbor";
|
| 9 |
|
| 10 |
const TABS: { id: TabId; label: string; color: string; activeClass: string }[] = [
|
| 11 |
{ id: "model", label: "Model Trace", color: "blue", activeClass: "border-blue-500 text-blue-400" },
|
| 12 |
{ id: "arena", label: "Arena", color: "purple", activeClass: "border-purple-500 text-purple-400" },
|
| 13 |
-
{ id: "rlm", label: "RLM", color: "
|
|
|
|
| 14 |
{ id: "harbor", label: "Harbor", color: "teal", activeClass: "border-teal-500 text-teal-400" },
|
| 15 |
];
|
| 16 |
|
|
@@ -56,6 +58,11 @@ export default function App() {
|
|
| 56 |
<ArenaApp />
|
| 57 |
</div>
|
| 58 |
)}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
{activeTab === "rlm" && (
|
| 60 |
<div className="theme-rlm h-full">
|
| 61 |
<RlmApp />
|
|
|
|
| 2 |
|
| 3 |
const ModelApp = lazy(() => import("./model/ModelApp"));
|
| 4 |
const ArenaApp = lazy(() => import("./arena/ArenaApp"));
|
| 5 |
+
const RlmEvalApp = lazy(() => import("./rlm-eval/RlmEvalApp"));
|
| 6 |
const RlmApp = lazy(() => import("./rlm/RlmApp"));
|
| 7 |
const HarborApp = lazy(() => import("./harbor/HarborApp"));
|
| 8 |
|
| 9 |
+
type TabId = "model" | "arena" | "rlm-eval" | "rlm" | "harbor";
|
| 10 |
|
| 11 |
const TABS: { id: TabId; label: string; color: string; activeClass: string }[] = [
|
| 12 |
{ id: "model", label: "Model Trace", color: "blue", activeClass: "border-blue-500 text-blue-400" },
|
| 13 |
{ id: "arena", label: "Arena", color: "purple", activeClass: "border-purple-500 text-purple-400" },
|
| 14 |
+
{ id: "rlm-eval", label: "RLM", color: "emerald", activeClass: "border-emerald-500 text-emerald-400" },
|
| 15 |
+
{ id: "rlm", label: "RLM+GEPA", color: "orange", activeClass: "border-orange-500 text-orange-400" },
|
| 16 |
{ id: "harbor", label: "Harbor", color: "teal", activeClass: "border-teal-500 text-teal-400" },
|
| 17 |
];
|
| 18 |
|
|
|
|
| 58 |
<ArenaApp />
|
| 59 |
</div>
|
| 60 |
)}
|
| 61 |
+
{activeTab === "rlm-eval" && (
|
| 62 |
+
<div className="theme-rlm-eval h-full">
|
| 63 |
+
<RlmEvalApp />
|
| 64 |
+
</div>
|
| 65 |
+
)}
|
| 66 |
{activeTab === "rlm" && (
|
| 67 |
<div className="theme-rlm h-full">
|
| 68 |
<RlmApp />
|
frontend/src/rlm-eval/RlmEvalApp.tsx
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useEffect } from "react";
|
| 2 |
+
import { useAppState } from "./store";
|
| 3 |
+
import Sidebar from "./components/Sidebar";
|
| 4 |
+
import Panel from "./components/Panel";
|
| 5 |
+
|
| 6 |
+
/**
 * Root component of the RLM eval tab: dataset sidebar, optional error banner,
 * toolbar with the compare toggle, one or two navigation panels, and a footer
 * listing the keyboard shortcuts.
 */
function RlmEvalApp() {
  const state = useAppState();

  const handleSelectDataset = (id: string) => {
    state.navigatePanel("A", { datasetId: id, level: 1 });
  };

  // Keyboard shortcuts
  useEffect(() => {
    const handler = (e: KeyboardEvent) => {
      // Leave modifier chords (Ctrl+C, Cmd+C, Alt+...) to the browser so that
      // copying text does not toggle comparison mode.
      if (e.ctrlKey || e.metaKey || e.altKey) return;

      // Do not steal keystrokes from form controls.
      if (
        e.target instanceof HTMLInputElement ||
        e.target instanceof HTMLTextAreaElement ||
        e.target instanceof HTMLSelectElement
      )
        return;

      switch (e.key) {
        case "Escape":
          state.goUp("A");
          break;
        // Accept both cases so the shortcut works with Shift or CapsLock on.
        case "c":
        case "C":
          state.toggleComparison();
          break;
      }
    };
    window.addEventListener("keydown", handler);
    return () => window.removeEventListener("keydown", handler);
  }, [state.goUp, state.toggleComparison]);

  return (
    <div className="flex h-full bg-gray-950 text-gray-100">
      {/* Sidebar */}
      <Sidebar
        datasets={state.datasets}
        presets={state.presets}
        setPresets={state.setPresets}
        loading={state.loading}
        onAddDataset={state.addDataset}
        onRemoveDataset={state.removeDataset}
        onToggleDataset={state.toggleDataset}
        onSelectDataset={handleSelectDataset}
        onUpdateDatasetPresetName={state.updateDatasetPresetName}
        onClearDatasetPreset={state.clearDatasetPreset}
      />

      {/* Main content */}
      <div className="flex-1 flex flex-col overflow-hidden">
        {/* Error banner */}
        {state.error && (
          <div className="bg-red-900 border-b border-red-700 px-4 py-2 text-sm text-red-200 flex justify-between">
            <span>{state.error}</span>
            <button onClick={() => state.setError(null)} className="text-red-300 hover:text-red-100">
              x
            </button>
          </div>
        )}

        {/* Toolbar */}
        <div className="flex items-center justify-between px-4 py-2 border-b border-gray-700 bg-gray-900">
          <div className="text-sm text-gray-400">
            {state.activeDatasets.length} dataset{state.activeDatasets.length !== 1 ? "s" : ""} loaded
          </div>
          <button
            className={`text-sm px-3 py-1 rounded ${
              state.comparisonMode
                ? "bg-emerald-600 text-white"
                : "bg-gray-800 text-gray-300 hover:bg-gray-700"
            }`}
            onClick={state.toggleComparison}
          >
            {state.comparisonMode ? "Exit Compare" : "Compare"}
          </button>
        </div>

        {/* Panels */}
        <div className="flex-1 flex gap-2 p-2 overflow-hidden">
          {state.panelA ? (
            <div className={state.comparisonMode ? "w-1/2" : "w-full"}>
              <Panel
                nav={state.panelA}
                dataset={state.datasets.find((d) => d.id === state.panelA?.datasetId)}
                panelLabel={state.comparisonMode ? "A" : undefined}
                onNavigate={(nav) => state.navigatePanel("A", nav)}
                onGoUp={() => state.goUp("A")}
                fetchOverview={state.fetchOverview}
                fetchExampleDetail={state.fetchExampleDetail}
                fetchIterDetail={state.fetchIterDetail}
              />
            </div>
          ) : (
            <div className="flex-1 flex items-center justify-center text-gray-500">
              <div className="text-center">
                <p className="text-lg mb-2">No dataset loaded</p>
                <p className="text-sm">Add a dataset from the sidebar to get started</p>
              </div>
            </div>
          )}

          {state.comparisonMode && state.panelB && (
            <div className="w-1/2">
              <Panel
                nav={state.panelB}
                dataset={state.datasets.find((d) => d.id === state.panelB?.datasetId)}
                panelLabel="B"
                datasets={state.datasets}
                onNavigate={(nav) => state.navigatePanel("B", nav)}
                onGoUp={() => state.goUp("B")}
                onSwitchDataset={(id) => state.navigatePanel("B", { datasetId: id, level: 1 })}
                fetchOverview={state.fetchOverview}
                fetchExampleDetail={state.fetchExampleDetail}
                fetchIterDetail={state.fetchIterDetail}
              />
            </div>
          )}
        </div>

        {/* Keyboard hints */}
        <div className="flex items-center gap-4 px-4 py-1 border-t border-gray-800 text-xs text-gray-600">
          <span>Esc: Go up</span>
          <span>C: Toggle compare</span>
        </div>
      </div>
    </div>
  );
}
|
| 132 |
+
|
| 133 |
+
export default RlmEvalApp;
|
frontend/src/rlm-eval/api.ts
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
const BASE = "/api/rlm-eval";
|
| 2 |
+
|
| 3 |
+
/**
 * Fetch `${BASE}${url}` and return the parsed JSON body.
 *
 * Caller-supplied headers are merged with, rather than replacing, the default
 * JSON `Content-Type`. On a non-2xx response, throws an `Error` whose message
 * is the body's `error` field when present, otherwise `HTTP <status>`.
 */
async function fetchJson<T>(url: string, init?: RequestInit): Promise<T> {
  const headers = new Headers(init?.headers);
  if (!headers.has("Content-Type")) {
    headers.set("Content-Type", "application/json");
  }

  const res = await fetch(`${BASE}${url}`, { ...init, headers });
  if (!res.ok) {
    // The error body may not be JSON; fall back to the status code.
    const body = await res.json().catch(() => ({}));
    throw new Error(body.error || `HTTP ${res.status}`);
  }
  return res.json();
}
|
| 14 |
+
|
| 15 |
+
const PRESETS_BASE = "/api/presets/rlm-eval";
|
| 16 |
+
/**
 * Fetch `${PRESETS_BASE}${url}` and return the parsed JSON body.
 *
 * Caller-supplied headers are merged with, rather than replacing, the default
 * JSON `Content-Type`. On a non-2xx response, throws an `Error` whose message
 * is the body's `error` field when present, otherwise `HTTP <status>`.
 */
async function fetchPresetsJson<T>(url: string, init?: RequestInit): Promise<T> {
  const headers = new Headers(init?.headers);
  if (!headers.has("Content-Type")) {
    headers.set("Content-Type", "application/json");
  }

  const res = await fetch(`${PRESETS_BASE}${url}`, { ...init, headers });
  if (!res.ok) {
    // The error body may not be JSON; fall back to the status code.
    const body = await res.json().catch(() => ({}));
    throw new Error(body.error || `HTTP ${res.status}`);
  }
  return res.json();
}
|
| 23 |
+
|
| 24 |
+
/**
 * Typed client for the RLM eval dataset endpoints and the rlm-eval presets
 * endpoints. Every identifier interpolated into a URL path is passed through
 * `encodeURIComponent`, so ids containing reserved characters cannot alter
 * the request path.
 */
export const api = {
  /** Load a dataset on the backend; config and split fall back to the defaults below. */
  loadDataset: (repo: string, config?: string, split?: string) =>
    fetchJson<{
      id: string;
      repo: string;
      name: string;
      config: string;
      split: string;
      metadata: Record<string, unknown>;
      n_examples: number;
      n_rows: number;
    }>("/datasets/load", {
      method: "POST",
      body: JSON.stringify({
        repo,
        config: config || "rlm_call_traces",
        split: split || "train",
      }),
    }),

  /** Summary of all examples in a loaded dataset. */
  getOverview: (dsId: string) =>
    fetchJson<Record<string, unknown>>(`/datasets/${encodeURIComponent(dsId)}/overview`),

  /** Iteration timeline for one example. */
  getExampleDetail: (dsId: string, exampleIdx: number) =>
    fetchJson<Record<string, unknown>>(
      `/datasets/${encodeURIComponent(dsId)}/example/${exampleIdx}`
    ),

  /** Full detail for one iteration of one example. */
  getIterDetail: (dsId: string, exampleIdx: number, rlmIter: number) =>
    fetchJson<Record<string, unknown>>(
      `/datasets/${encodeURIComponent(dsId)}/example/${exampleIdx}/iter/${rlmIter}`
    ),

  /** Remove a dataset from the backend cache. */
  unloadDataset: (dsId: string) =>
    fetchJson<{ status: string }>(`/datasets/${encodeURIComponent(dsId)}`, { method: "DELETE" }),

  listPresets: () => fetchPresetsJson<Record<string, unknown>[]>(""),

  createPreset: (preset: { name: string; repo: string; config: string; split: string }) =>
    fetchPresetsJson<Record<string, unknown>>("", {
      method: "POST",
      body: JSON.stringify(preset),
    }),

  updatePreset: (id: string, data: { name: string }) =>
    fetchPresetsJson<Record<string, unknown>>(`/${encodeURIComponent(id)}`, {
      method: "PUT",
      body: JSON.stringify(data),
    }),

  deletePreset: (id: string) =>
    fetchPresetsJson<{ status: string }>(`/${encodeURIComponent(id)}`, { method: "DELETE" }),
};
|
frontend/src/rlm-eval/components/Breadcrumb.tsx
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import type { PanelNav, DatasetInfo } from "../types";
|
| 2 |
+
|
| 3 |
+
interface BreadcrumbProps {
  /** Current navigation position of the panel. */
  nav: PanelNav;
  /** Dataset the panel is showing, if it could be resolved. */
  dataset: DatasetInfo | undefined;
  /** Invoked with the target position when a crumb is clicked. */
  onNavigate: (nav: PanelNav) => void;
}

/**
 * Renders the panel's navigation trail: the dataset name, followed by
 * "Example N" at level 2 or deeper. Every crumb except the last is a button
 * that navigates to that crumb's position.
 */
export default function Breadcrumb({ nav, dataset, onNavigate }: BreadcrumbProps) {
  const { datasetId, level, exampleIdx } = nav;
  const crumbs: { label: string; nav: PanelNav }[] = [];

  if (dataset) {
    crumbs.push({ label: dataset.name, nav: { datasetId, level: 1 } });
  }
  if (level >= 2 && exampleIdx !== undefined) {
    crumbs.push({
      label: `Example ${exampleIdx}`,
      nav: { datasetId, level: 2, exampleIdx },
    });
  }

  const lastIndex = crumbs.length - 1;

  return (
    <div className="flex items-center gap-1 text-sm">
      {crumbs.map((crumb, index) => {
        const isLast = index >= lastIndex;
        return (
          <span key={index} className="flex items-center gap-1">
            {index > 0 ? <span className="text-gray-500">/</span> : null}
            {isLast ? (
              <span className="text-gray-200">{crumb.label}</span>
            ) : (
              <button
                className="text-emerald-400 hover:text-emerald-300"
                onClick={() => onNavigate(crumb.nav)}
              >
                {crumb.label}
              </button>
            )}
          </span>
        );
      })}
    </div>
  );
}
|
frontend/src/rlm-eval/components/DatasetSelector.tsx
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import type { DatasetInfo } from "../types";
|
| 2 |
+
|
| 3 |
+
interface DatasetSelectorProps {
  /** Datasets offered as options. */
  datasets: DatasetInfo[];
  /** Id of the option shown as selected. */
  currentId: string;
  /** Invoked with the chosen dataset id when the selection changes. */
  onSelect: (id: string) => void;
}

/** Compact dropdown that lists datasets by name and reports the chosen id. */
export default function DatasetSelector({ datasets, currentId, onSelect }: DatasetSelectorProps) {
  const options = datasets.map(({ id, name }) => (
    <option key={id} value={id}>
      {name}
    </option>
  ));

  return (
    <select
      className="bg-gray-800 text-gray-200 text-xs rounded px-2 py-0.5 border border-gray-600 focus:border-emerald-500 outline-none"
      value={currentId}
      onChange={(event) => onSelect(event.target.value)}
    >
      {options}
    </select>
  );
}
|
frontend/src/rlm-eval/components/ExampleDetailLevel.tsx
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useEffect, useState } from "react";
|
| 2 |
+
import type { ExampleDetailData, RlmIterDetail } from "../types";
|
| 3 |
+
import IterationDetail from "./IterationDetail";
|
| 4 |
+
|
| 5 |
+
interface ExampleDetailLevelProps {
|
| 6 |
+
datasetId: string;
|
| 7 |
+
exampleIdx: number;
|
| 8 |
+
fetchExampleDetail: (dsId: string, exampleIdx: number) => Promise<ExampleDetailData>;
|
| 9 |
+
fetchIterDetail: (dsId: string, exampleIdx: number, rlmIter: number) => Promise<RlmIterDetail>;
|
| 10 |
+
}
|
| 11 |
+
|
| 12 |
+
/**
 * Detail view for one example: question text, aggregate stats, a horizontal
 * timeline of iterations, and (when a timeline card is selected) the full
 * detail of that iteration.
 *
 * Data is loaded through the `fetchExampleDetail` / `fetchIterDetail` props.
 * Each load is guarded by a cancellation flag so that a response belonging to
 * a previous `datasetId` / `exampleIdx` / iteration can never overwrite the
 * state of the current one, and failures are surfaced instead of leaving the
 * view stuck on its loading placeholder.
 */
export default function ExampleDetailLevel({
  datasetId,
  exampleIdx,
  fetchExampleDetail,
  fetchIterDetail,
}: ExampleDetailLevelProps) {
  const [data, setData] = useState<ExampleDetailData | null>(null);
  const [dataError, setDataError] = useState<string | null>(null);
  const [expandedIter, setExpandedIter] = useState<number | null>(null);
  const [iterDetail, setIterDetail] = useState<RlmIterDetail | null>(null);
  const [iterError, setIterError] = useState<string | null>(null);

  // Load the example summary whenever the target example changes.
  useEffect(() => {
    let cancelled = false;
    // Drop everything that belonged to the previously shown example so it is
    // not rendered under the new example's identity while the request runs.
    setData(null);
    setDataError(null);
    setExpandedIter(null);
    fetchExampleDetail(datasetId, exampleIdx)
      .then((detail) => {
        if (!cancelled) setData(detail);
      })
      .catch((err: unknown) => {
        if (!cancelled) setDataError(err instanceof Error ? err.message : String(err));
      });
    return () => {
      cancelled = true;
    };
  }, [datasetId, exampleIdx, fetchExampleDetail]);

  // Load the selected iteration, if any.
  useEffect(() => {
    let cancelled = false;
    // Clear first so the previously expanded iteration is never displayed
    // as if it were the newly selected one.
    setIterDetail(null);
    setIterError(null);
    if (expandedIter === null) return;
    fetchIterDetail(datasetId, exampleIdx, expandedIter)
      .then((detail) => {
        if (!cancelled) setIterDetail(detail);
      })
      .catch((err: unknown) => {
        if (!cancelled) setIterError(err instanceof Error ? err.message : String(err));
      });
    return () => {
      cancelled = true;
    };
  }, [datasetId, exampleIdx, expandedIter, fetchIterDetail]);

  // Selecting the already-expanded card collapses it.
  const toggleIter = (rlmIter: number) =>
    setExpandedIter((current) => (current === rlmIter ? null : rlmIter));

  if (dataError !== null) {
    return <div className="p-4 text-red-400">Failed to load example detail: {dataError}</div>;
  }
  if (!data) return <div className="p-4 text-gray-400">Loading example detail...</div>;

  return (
    <div className="p-4 space-y-4 overflow-y-auto">
      {/* Question text */}
      <div className="bg-gray-800 border border-gray-700 rounded-lg p-4">
        <div className="flex items-center gap-2 mb-2">
          <div className="text-xs font-semibold text-emerald-400">Question</div>
          {data.eval_correct === true && (
            <span className="text-emerald-400 text-xs font-bold">✓ Correct</span>
          )}
          {data.eval_correct === false && (
            <span className="text-red-400 text-xs font-bold">✗ Incorrect</span>
          )}
        </div>
        <div className="text-sm text-gray-200 whitespace-pre-wrap max-h-40 overflow-y-auto">
          {data.question_text}
        </div>
      </div>

      {/* Stats row */}
      <div className="flex gap-4 text-xs text-gray-400">
        <span>
          Total tokens:{" "}
          <span className="text-gray-200">
            {((data.total_input_tokens + data.total_output_tokens) / 1000).toFixed(1)}k
          </span>
        </span>
        <span>
          Time: <span className="text-gray-200">{data.total_execution_time.toFixed(1)}s</span>
        </span>
        <span>
          Iterations: <span className="text-gray-200">{data.iterations.length}</span>
        </span>
      </div>

      {/* Iteration timeline */}
      <div>
        <div className="text-xs font-semibold text-gray-400 mb-2">Iteration Timeline</div>
        <div className="flex gap-2 overflow-x-auto pb-2">
          {data.iterations.map((it) => (
            <div
              key={it.rlm_iter}
              role="button"
              tabIndex={0}
              aria-pressed={expandedIter === it.rlm_iter}
              className={`flex-shrink-0 w-56 bg-gray-800 border rounded-lg p-3 cursor-pointer transition-colors hover:border-emerald-500 ${
                expandedIter === it.rlm_iter
                  ? "border-emerald-500 ring-1 ring-emerald-500"
                  : it.has_final_answer
                  ? "border-emerald-600"
                  : "border-gray-700"
              }`}
              onClick={() => toggleIter(it.rlm_iter)}
              onKeyDown={(e) => {
                // Make the card operable from the keyboard like a button.
                if (e.key === "Enter" || e.key === " ") {
                  e.preventDefault();
                  toggleIter(it.rlm_iter);
                }
              }}
            >
              <div className="flex items-center justify-between mb-2">
                <span className="bg-gray-700 text-gray-200 text-xs font-mono px-2 py-0.5 rounded">
                  iter {it.rlm_iter}
                </span>
                <div className="flex gap-1">
                  {it.has_code_blocks && (
                    <span className="bg-emerald-900 text-emerald-300 text-xs px-1.5 py-0.5 rounded">
                      {it.n_code_blocks} code
                    </span>
                  )}
                  {it.has_final_answer && (
                    <span className="bg-amber-900 text-amber-300 text-xs px-1.5 py-0.5 rounded">
                      FINAL
                    </span>
                  )}
                </div>
              </div>

              <div className="flex justify-between text-xs text-gray-500 mb-2">
                <span>{((it.input_tokens + it.output_tokens) / 1000).toFixed(1)}k tok</span>
                <span>{it.execution_time.toFixed(1)}s</span>
              </div>

              <div className="text-xs text-gray-400 line-clamp-3 leading-relaxed">
                {it.response_preview || "(empty)"}
              </div>
            </div>
          ))}
        </div>
      </div>

      {/* Expanded iteration detail */}
      {expandedIter !== null && iterError !== null && (
        <div className="text-xs text-red-400">
          Failed to load iteration {expandedIter}: {iterError}
        </div>
      )}
      {expandedIter !== null && iterError === null && !iterDetail && (
        <div className="text-xs text-gray-400">Loading iteration {expandedIter}...</div>
      )}
      {expandedIter !== null && iterDetail && (
        <IterationDetail data={iterDetail} />
      )}

      {/* Final answer if present */}
      {data.final_answer && (
        <div className="bg-emerald-950 border border-emerald-700 rounded-lg p-4">
          <div className="text-xs font-semibold text-emerald-400 mb-2">Final Answer</div>
          <div className="text-sm text-gray-200 whitespace-pre-wrap max-h-60 overflow-y-auto">
            {data.final_answer}
          </div>
        </div>
      )}
    </div>
  );
}
|
frontend/src/rlm-eval/components/IterationDetail.tsx
ADDED
|
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState } from "react";
|
| 2 |
+
import type { RlmIterDetail } from "../types";
|
| 3 |
+
|
| 4 |
+
/** Props accepted by the iteration-detail view. */
interface IterationDetailProps {
  /** Fully loaded record of the iteration to render. */
  data: RlmIterDetail;
}
|
| 7 |
+
|
| 8 |
+
/**
 * Turn a stored prompt string into a list of chat messages for display.
 *
 * When `promptStr` is a JSON array, every entry is normalized to
 * `{ role, content }` with both fields guaranteed to be strings, so the
 * renderer can safely call string methods on `content`:
 *   - a non-string `role` falls back to "raw";
 *   - a missing or null `content` becomes "";
 *   - any other non-string `content` (e.g. structured content parts) is
 *     pretty-printed as JSON;
 *   - entries that are not objects are wrapped as "raw" messages.
 *
 * Anything that is not a JSON array (plain text, a JSON object, invalid
 * JSON) is returned as a single "raw" message containing the original text.
 */
function parsePromptMessages(promptStr: string): { role: string; content: string }[] {
  // Defensive: the declared type is string, but the value arrives from an API payload.
  const text = typeof promptStr === "string" ? promptStr : String(promptStr ?? "");
  const asRaw = [{ role: "raw", content: text }];

  let parsed: unknown;
  try {
    parsed = JSON.parse(text);
  } catch {
    return asRaw; // not JSON
  }
  if (!Array.isArray(parsed)) return asRaw;

  return parsed.map((entry: unknown) => {
    if (entry !== null && typeof entry === "object") {
      const { role, content } = entry as { role?: unknown; content?: unknown };
      let body: string;
      if (typeof content === "string") {
        body = content;
      } else if (content === undefined || content === null) {
        body = "";
      } else {
        body = JSON.stringify(content, null, 2);
      }
      return { role: typeof role === "string" ? role : "raw", content: body };
    }
    return {
      role: "raw",
      content: typeof entry === "string" ? entry : JSON.stringify(entry),
    };
  });
}
|
| 15 |
+
|
| 16 |
+
/**
 * Tailwind border/background classes per message role.
 * The "raw" entry doubles as the fallback for roles not listed here.
 */
const roleColors: Record<string, string> = {
  raw: "border-gray-500 bg-gray-900",
  system: "border-violet-500 bg-violet-950",
  user: "border-emerald-500 bg-emerald-950",
  assistant: "border-sky-500 bg-sky-950",
};
|
| 22 |
+
|
| 23 |
+
/**
 * Renders one iteration: a stats row, a collapsible list of prompt messages,
 * the raw response, any code blocks with their stdout, and the final answer
 * when one is present.
 */
export default function IterationDetail({ data }: IterationDetailProps) {
  const [promptExpanded, setPromptExpanded] = useState(false);
  const messages = parsePromptMessages(data.prompt);

  // Token counts are displayed in thousands with one decimal.
  const inTokensK = (data.input_tokens / 1000).toFixed(1);
  const outTokensK = (data.output_tokens / 1000).toFixed(1);
  const chevronClass = `transform transition-transform ${promptExpanded ? "rotate-90" : ""}`;

  const promptMessages = promptExpanded ? (
    <div className="space-y-2 ml-4">
      {messages.map((msg, i) => {
        // Very long messages are cut at 8000 characters for display.
        const shown =
          msg.content.length > 8000 ? msg.content.slice(0, 8000) + "\n...(truncated)" : msg.content;
        return (
          <div
            key={i}
            className={`border-l-2 rounded-r-lg px-3 py-2 ${roleColors[msg.role] || roleColors.raw}`}
          >
            <div className="text-xs font-semibold text-gray-400 mb-1 uppercase">{msg.role}</div>
            <div className="text-sm text-gray-200 whitespace-pre-wrap max-h-96 overflow-y-auto">
              {shown}
            </div>
          </div>
        );
      })}
    </div>
  ) : null;

  const codeBlocks =
    data.code_blocks.length > 0 ? (
      <div>
        <div className="text-sm font-semibold text-gray-300 mb-2">
          Code Blocks ({data.code_blocks.length})
        </div>
        <div className="space-y-3">
          {data.code_blocks.map((cb, i) => (
            <div key={i} className="border border-gray-700 rounded-lg overflow-hidden">
              <div className="bg-gray-800 px-3 py-1.5 text-xs text-gray-400 border-b border-gray-700 flex items-center gap-2">
                <span className="text-emerald-400 font-mono">python</span>
                <span>Block {i + 1}</span>
              </div>
              <pre className="bg-gray-900 p-3 text-sm text-gray-200 overflow-x-auto font-mono leading-relaxed">
                {cb.code}
              </pre>
              {cb.stdout ? (
                <div className="border-t border-gray-700">
                  <div className="bg-gray-800 px-3 py-1 text-xs text-gray-400">stdout</div>
                  <pre className="bg-emerald-950 p-3 text-sm text-emerald-200 overflow-x-auto font-mono">
                    {cb.stdout}
                  </pre>
                </div>
              ) : null}
            </div>
          ))}
        </div>
      </div>
    ) : null;

  const finalAnswer = data.final_answer ? (
    <div className="bg-emerald-950 border border-emerald-700 rounded-lg p-4">
      <div className="text-xs font-semibold text-emerald-400 mb-2">Final Answer</div>
      <div className="text-sm text-gray-200 whitespace-pre-wrap max-h-96 overflow-y-auto">
        {data.final_answer}
      </div>
    </div>
  ) : null;

  return (
    <div className="space-y-4 border border-gray-700 rounded-lg p-4 bg-gray-900">
      {/* Stats */}
      <div className="flex gap-4 text-xs text-gray-400">
        <span>Model: <span className="text-gray-200">{data.model}</span></span>
        <span>In: <span className="text-emerald-300">{inTokensK}k</span></span>
        <span>Out: <span className="text-emerald-300">{outTokensK}k</span></span>
        <span>Time: <span className="text-gray-200">{data.execution_time.toFixed(1)}s</span></span>
      </div>

      {/* Prompt section (collapsible) */}
      <div>
        <button
          className="flex items-center gap-2 text-sm font-semibold text-gray-300 hover:text-gray-100 mb-2"
          onClick={() => setPromptExpanded(!promptExpanded)}
        >
          <span className={chevronClass}>
            ▶
          </span>
          Prompt ({messages.length} messages)
        </button>
        {promptMessages}
      </div>

      {/* Response */}
      <div>
        <div className="text-sm font-semibold text-gray-300 mb-2">Response</div>
        <div className="bg-gray-800 border border-gray-700 rounded-lg p-3">
          <div className="text-sm text-gray-200 whitespace-pre-wrap max-h-96 overflow-y-auto font-mono">
            {data.response}
          </div>
        </div>
      </div>

      {/* Code Blocks */}
      {codeBlocks}

      {/* Final Answer */}
      {finalAnswer}
    </div>
  );
}
|
frontend/src/rlm-eval/components/OverviewLevel.tsx
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useEffect, useState } from "react";
|
| 2 |
+
import type { OverviewData, PanelNav } from "../types";
|
| 3 |
+
|
| 4 |
+
/** Props accepted by the dataset overview view. */
interface OverviewLevelProps {
  /** Id of the dataset whose overview is shown; passed to `fetchOverview`. */
  datasetId: string;
  /** Loads the overview (metadata plus per-example summaries) for a dataset. */
  fetchOverview: (dsId: string) => Promise<OverviewData>;
  /** Invoked with the navigation target when an example card is activated. */
  onDrillDown: (nav: PanelNav) => void;
}
|
| 9 |
+
|
| 10 |
+
/**
 * Overview of one dataset: experiment metadata, correct/incorrect/unknown
 * tallies, and one card per example. Activating a card calls `onDrillDown`
 * with a level-2 navigation target for that example.
 *
 * The overview is loaded through `fetchOverview`. The load is guarded by a
 * cancellation flag so a late response for a previous dataset cannot replace
 * the current one, and failures are shown instead of leaving the view on its
 * loading placeholder.
 */
export default function OverviewLevel({ datasetId, fetchOverview, onDrillDown }: OverviewLevelProps) {
  const [data, setData] = useState<OverviewData | null>(null);
  const [error, setError] = useState<string | null>(null);

  useEffect(() => {
    let cancelled = false;
    // Do not keep showing the previous dataset's examples under the new id.
    setData(null);
    setError(null);
    fetchOverview(datasetId)
      .then((overview) => {
        if (!cancelled) setData(overview);
      })
      .catch((err: unknown) => {
        if (!cancelled) setError(err instanceof Error ? err.message : String(err));
      });
    return () => {
      cancelled = true;
    };
  }, [datasetId, fetchOverview]);

  if (error !== null) {
    return <div className="p-4 text-red-400">Failed to load overview: {error}</div>;
  }
  if (!data) return <div className="p-4 text-gray-400">Loading overview...</div>;

  // Tally evaluation outcomes in a single pass over the examples.
  let correctCount = 0;
  let incorrectCount = 0;
  let unknownCount = 0;
  for (const ex of data.examples) {
    if (ex.eval_correct === true) correctCount += 1;
    else if (ex.eval_correct === false) incorrectCount += 1;
    else if (ex.eval_correct === null || ex.eval_correct === undefined) unknownCount += 1;
  }

  return (
    <div className="p-4 space-y-3">
      {/* Experiment metadata */}
      <div className="flex gap-3 text-xs text-gray-400">
        <span>Model: <span className="text-gray-200">{data.metadata.model}</span></span>
        <span>Method: <span className="text-gray-200">{data.metadata.method}</span></span>
        <span>Run: <span className="text-gray-200">{data.metadata.run_id}</span></span>
      </div>

      {/* Summary stats */}
      <div className="flex gap-3 text-xs text-gray-400">
        <span>{data.examples.length} examples</span>
        {correctCount > 0 && (
          <span className="text-emerald-400">{correctCount} correct</span>
        )}
        {incorrectCount > 0 && (
          <span className="text-red-400">{incorrectCount} incorrect</span>
        )}
        {unknownCount > 0 && (
          <span className="text-gray-500">{unknownCount} unknown</span>
        )}
      </div>

      {/* Example cards */}
      <div className="space-y-2">
        {data.examples.map((ex) => {
          const open = () =>
            onDrillDown({
              datasetId,
              level: 2,
              exampleIdx: ex.example_idx,
            });
          return (
            <div
              key={ex.example_idx}
              role="button"
              tabIndex={0}
              className="bg-gray-800 border border-gray-700 rounded-lg p-4 hover:border-emerald-500 cursor-pointer transition-colors"
              onClick={open}
              onKeyDown={(e) => {
                // Make the card operable from the keyboard like a button.
                if (e.key === "Enter" || e.key === " ") {
                  e.preventDefault();
                  open();
                }
              }}
            >
              <div className="flex items-center justify-between mb-2">
                <div className="flex items-center gap-3">
                  <span className="bg-emerald-600 text-white text-xs font-bold px-2 py-0.5 rounded-full">
                    Ex {ex.example_idx}
                  </span>
                  <span className="bg-gray-700 text-gray-300 text-xs px-2 py-0.5 rounded">
                    {ex.n_iterations} iter{ex.n_iterations !== 1 ? "s" : ""}
                  </span>
                  {ex.eval_correct === true && (
                    <span className="text-emerald-400 text-sm font-bold" title="Correct">
                      ✓
                    </span>
                  )}
                  {ex.eval_correct === false && (
                    <span className="text-red-400 text-sm font-bold" title="Incorrect">
                      ✗
                    </span>
                  )}
                </div>
                <span className="text-xs text-gray-400">
                  {ex.total_execution_time.toFixed(1)}s
                </span>
              </div>

              {/* Question preview */}
              <div className="text-sm text-gray-300 line-clamp-2 mb-2 leading-relaxed">
                {ex.question_text || "(no question text)"}
              </div>

              <div className="flex gap-4 text-xs text-gray-400">
                <span>
                  {((ex.total_input_tokens + ex.total_output_tokens) / 1000).toFixed(1)}k tokens
                </span>
              </div>

              {ex.final_answer_preview && (
                <div className="mt-2 text-xs text-gray-500 truncate">
                  Answer: {ex.final_answer_preview}
                </div>
              )}
            </div>
          );
        })}
      </div>
    </div>
  );
}
|
frontend/src/rlm-eval/components/Panel.tsx
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import type { PanelNav, DatasetInfo, OverviewData, ExampleDetailData, RlmIterDetail } from "../types";
|
| 2 |
+
import Breadcrumb from "./Breadcrumb";
|
| 3 |
+
import OverviewLevel from "./OverviewLevel";
|
| 4 |
+
import ExampleDetailLevel from "./ExampleDetailLevel";
|
| 5 |
+
import DatasetSelector from "./DatasetSelector";
|
| 6 |
+
|
| 7 |
+
/** Props accepted by a single visualizer panel. */
interface PanelProps {
  /** Current navigation state; `level` selects overview (1) or example detail (2). */
  nav: PanelNav;
  /** Dataset record handed to the breadcrumb; may be undefined. */
  dataset: DatasetInfo | undefined;
  /** Optional badge text shown in the header; the value "B" also enables the dataset selector. */
  panelLabel?: string;
  /** Datasets offered by the selector (only used together with `onSwitchDataset`). */
  datasets?: DatasetInfo[];
  /** Called with a new navigation target from the breadcrumb or an overview card. */
  onNavigate: (nav: PanelNav) => void;
  /** Called when the header back button is pressed. */
  onGoUp: () => void;
  /** Called with the chosen dataset id when the selector changes. */
  onSwitchDataset?: (id: string) => void;
  /** Data loader forwarded to the overview level. */
  fetchOverview: (dsId: string) => Promise<OverviewData>;
  /** Data loader forwarded to the example-detail level. */
  fetchExampleDetail: (dsId: string, exampleIdx: number) => Promise<ExampleDetailData>;
  /** Data loader forwarded to the example-detail level. */
  fetchIterDetail: (
    dsId: string,
    exampleIdx: number,
    rlmIter: number
  ) => Promise<RlmIterDetail>;
}
|
| 23 |
+
|
| 24 |
+
/**
 * One visualizer panel: a header (back button, optional label badge,
 * breadcrumb, and - for the panel labelled "B" - a dataset selector) above a
 * scrollable body that shows either the dataset overview or a single
 * example's detail, depending on `nav.level`.
 */
export default function Panel({
  nav,
  dataset,
  panelLabel,
  datasets,
  onNavigate,
  onGoUp,
  onSwitchDataset,
  fetchOverview,
  fetchExampleDetail,
  fetchIterDetail,
}: PanelProps) {
  const backButton =
    nav.level > 1 ? (
      <button
        className="text-gray-400 hover:text-gray-200 text-sm"
        onClick={onGoUp}
        title="Go up"
      >
        ←
      </button>
    ) : null;

  const labelBadge = panelLabel ? (
    <span className="text-xs bg-gray-700 text-gray-300 px-1.5 py-0.5 rounded">
      {panelLabel}
    </span>
  ) : null;

  // Only the panel labelled "B" offers switching between datasets.
  const selector =
    panelLabel === "B" && datasets && onSwitchDataset ? (
      <div className="ml-auto">
        <DatasetSelector
          datasets={datasets}
          currentId={nav.datasetId}
          onSelect={(id) => onSwitchDataset(id)}
        />
      </div>
    ) : null;

  let body = null;
  if (nav.level === 1) {
    body = (
      <OverviewLevel
        datasetId={nav.datasetId}
        fetchOverview={fetchOverview}
        onDrillDown={onNavigate}
      />
    );
  } else if (nav.level === 2 && nav.exampleIdx !== undefined) {
    body = (
      <ExampleDetailLevel
        datasetId={nav.datasetId}
        exampleIdx={nav.exampleIdx}
        fetchExampleDetail={fetchExampleDetail}
        fetchIterDetail={fetchIterDetail}
      />
    );
  }

  return (
    <div className="flex flex-col h-full border border-gray-700 rounded-lg bg-gray-900 overflow-hidden">
      {/* Panel header */}
      <div className="flex items-center gap-2 px-3 py-2 border-b border-gray-700 bg-gray-800">
        {backButton}
        {labelBadge}
        <Breadcrumb nav={nav} dataset={dataset} onNavigate={onNavigate} />
        {selector}
      </div>

      {/* Panel content */}
      <div className="flex-1 overflow-y-auto">{body}</div>
    </div>
  );
}
|
frontend/src/rlm-eval/components/Sidebar.tsx
ADDED
|
@@ -0,0 +1,388 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState } from "react";
|
| 2 |
+
import type { DatasetInfo, Preset } from "../types";
|
| 3 |
+
import { api } from "../api";
|
| 4 |
+
|
| 5 |
+
/** Props accepted by the sidebar. */
interface SidebarProps {
  /** Datasets currently loaded; rendered as the "Loaded Datasets" list. */
  datasets: DatasetInfo[];
  /** Saved presets; rendered as quick-load buttons. */
  presets: Preset[];
  /** Replaces the preset list after a preset is created, renamed or deleted. */
  setPresets: (p: Preset[]) => void;
  /** Per-key loading flags. NOTE(review): key meaning is not visible in this component's shown code - confirm with the store. */
  loading: Record<string, boolean>;
  /** Requests loading a dataset; the preset id/name are supplied when loading from a preset. */
  onAddDataset: (repo: string, config?: string, split?: string, presetId?: string, presetName?: string) => void;
  /** Called when a dataset's remove button is pressed. */
  onRemoveDataset: (id: string) => void;
  /** Called when a dataset's checkbox changes. */
  onToggleDataset: (id: string) => void;
  /** Called when a dataset row is clicked. */
  onSelectDataset: (id: string) => void;
  /** Called after a preset for the dataset was saved or renamed, with the new name. */
  onUpdateDatasetPresetName: (dsId: string, name: string) => void;
  /** Called after the preset attached to the dataset was deleted. */
  onClearDatasetPreset: (dsId: string) => void;
}
|
| 17 |
+
|
| 18 |
+
export default function Sidebar({
|
| 19 |
+
datasets,
|
| 20 |
+
presets,
|
| 21 |
+
setPresets,
|
| 22 |
+
loading,
|
| 23 |
+
onAddDataset,
|
| 24 |
+
onRemoveDataset,
|
| 25 |
+
onToggleDataset,
|
| 26 |
+
onSelectDataset,
|
| 27 |
+
onUpdateDatasetPresetName,
|
| 28 |
+
onClearDatasetPreset,
|
| 29 |
+
}: SidebarProps) {
|
| 30 |
+
const [showAddForm, setShowAddForm] = useState(false);
|
| 31 |
+
const [repo, setRepo] = useState("");
|
| 32 |
+
const [config, setConfig] = useState("rlm_call_traces");
|
| 33 |
+
const [split, setSplit] = useState("train");
|
| 34 |
+
const [presetSearch, setPresetSearch] = useState("");
|
| 35 |
+
|
| 36 |
+
// Inline preset saving
|
| 37 |
+
const [savingPresetForId, setSavingPresetForId] = useState<string | null>(null);
|
| 38 |
+
const [presetName, setPresetName] = useState("");
|
| 39 |
+
|
| 40 |
+
// Preset editing panel
|
| 41 |
+
const [editingDatasetId, setEditingDatasetId] = useState<string | null>(null);
|
| 42 |
+
const [editPresetName, setEditPresetName] = useState("");
|
| 43 |
+
|
| 44 |
+
const handleAdd = () => {
|
| 45 |
+
if (!repo.trim()) return;
|
| 46 |
+
onAddDataset(repo.trim(), config, split);
|
| 47 |
+
setRepo("");
|
| 48 |
+
setShowAddForm(false);
|
| 49 |
+
};
|
| 50 |
+
|
| 51 |
+
const handleLoadPreset = (p: Preset) => {
|
| 52 |
+
onAddDataset(p.repo, p.config, p.split || "train", p.id, p.name);
|
| 53 |
+
};
|
| 54 |
+
|
| 55 |
+
const handleSavePresetForRepo = async (ds: DatasetInfo) => {
|
| 56 |
+
if (!presetName.trim()) return;
|
| 57 |
+
try {
|
| 58 |
+
const preset = (await api.createPreset({
|
| 59 |
+
name: presetName.trim(),
|
| 60 |
+
repo: ds.repo,
|
| 61 |
+
config: ds.config,
|
| 62 |
+
split: ds.split,
|
| 63 |
+
})) as unknown as Preset;
|
| 64 |
+
setPresets([...presets, preset]);
|
| 65 |
+
onUpdateDatasetPresetName(ds.id, presetName.trim());
|
| 66 |
+
} catch {
|
| 67 |
+
/* ignore */
|
| 68 |
+
}
|
| 69 |
+
setPresetName("");
|
| 70 |
+
setSavingPresetForId(null);
|
| 71 |
+
};
|
| 72 |
+
|
| 73 |
+
const handleUpdatePreset = async (presetId: string, dsId: string) => {
|
| 74 |
+
if (!editPresetName.trim()) return;
|
| 75 |
+
try {
|
| 76 |
+
await api.updatePreset(presetId, { name: editPresetName.trim() });
|
| 77 |
+
setPresets(
|
| 78 |
+
presets.map((p) => (p.id === presetId ? { ...p, name: editPresetName.trim() } : p))
|
| 79 |
+
);
|
| 80 |
+
onUpdateDatasetPresetName(dsId, editPresetName.trim());
|
| 81 |
+
} catch {
|
| 82 |
+
/* ignore */
|
| 83 |
+
}
|
| 84 |
+
setEditingDatasetId(null);
|
| 85 |
+
};
|
| 86 |
+
|
| 87 |
+
const handleDeletePreset = async (id: string, dsId?: string) => {
|
| 88 |
+
await api.deletePreset(id).catch(() => {});
|
| 89 |
+
setPresets(presets.filter((p) => p.id !== id));
|
| 90 |
+
if (dsId) {
|
| 91 |
+
onClearDatasetPreset(dsId);
|
| 92 |
+
}
|
| 93 |
+
setEditingDatasetId(null);
|
| 94 |
+
};
|
| 95 |
+
|
| 96 |
+
const filteredPresets = presetSearch
|
| 97 |
+
? presets.filter(
|
| 98 |
+
(p) =>
|
| 99 |
+
p.name.toLowerCase().includes(presetSearch.toLowerCase()) ||
|
| 100 |
+
p.repo.toLowerCase().includes(presetSearch.toLowerCase())
|
| 101 |
+
)
|
| 102 |
+
: presets;
|
| 103 |
+
|
| 104 |
+
return (
|
| 105 |
+
<div className="w-64 min-w-64 bg-gray-900 border-r border-gray-700 flex flex-col h-full overflow-hidden">
|
| 106 |
+
{/* Header */}
|
| 107 |
+
<div className="p-3 border-b border-gray-700">
|
| 108 |
+
<h1 className="text-sm font-bold tracking-wide text-gray-200">RLM Eval Visualizer</h1>
|
| 109 |
+
</div>
|
| 110 |
+
|
| 111 |
+
{/* Presets section */}
|
| 112 |
+
<div className="p-3 border-b border-gray-700">
|
| 113 |
+
<div className="text-xs font-semibold text-gray-400 uppercase tracking-wider mb-2">
|
| 114 |
+
Presets
|
| 115 |
+
</div>
|
| 116 |
+
{presets.length === 0 ? (
|
| 117 |
+
<p className="text-xs text-gray-500 italic">No presets saved</p>
|
| 118 |
+
) : (
|
| 119 |
+
<>
|
| 120 |
+
{presets.length > 6 && (
|
| 121 |
+
<input
|
| 122 |
+
type="text"
|
| 123 |
+
value={presetSearch}
|
| 124 |
+
onChange={(e) => setPresetSearch(e.target.value)}
|
| 125 |
+
placeholder="Search presets..."
|
| 126 |
+
className="w-full px-2 py-1 mb-2 text-xs bg-gray-800 border border-gray-600 rounded text-gray-200 placeholder-gray-500 focus:border-emerald-500 focus:outline-none"
|
| 127 |
+
/>
|
| 128 |
+
)}
|
| 129 |
+
<div className="flex flex-wrap gap-1 max-h-32 overflow-y-auto">
|
| 130 |
+
{filteredPresets.map((p) => (
|
| 131 |
+
<div key={p.id} className="group relative">
|
| 132 |
+
<button
|
| 133 |
+
onClick={() => handleLoadPreset(p)}
|
| 134 |
+
className="px-2 py-1 text-xs bg-gray-800 hover:bg-gray-700 rounded border border-gray-600 text-gray-300 transition-colors"
|
| 135 |
+
title={`${p.repo} (${p.config}, ${p.split ?? "train"})`}
|
| 136 |
+
>
|
| 137 |
+
{p.name}
|
| 138 |
+
</button>
|
| 139 |
+
<div className="hidden group-hover:flex absolute top-full left-0 mt-1 z-10 gap-1">
|
| 140 |
+
<button
|
| 141 |
+
onClick={() => handleDeletePreset(p.id)}
|
| 142 |
+
className="px-1.5 py-0.5 text-[10px] bg-red-900 hover:bg-red-800 rounded text-red-300"
|
| 143 |
+
>
|
| 144 |
+
Delete
|
| 145 |
+
</button>
|
| 146 |
+
</div>
|
| 147 |
+
</div>
|
| 148 |
+
))}
|
| 149 |
+
</div>
|
| 150 |
+
</>
|
| 151 |
+
)}
|
| 152 |
+
</div>
|
| 153 |
+
|
| 154 |
+
{/* Loaded Experiments */}
|
| 155 |
+
<div className="flex-1 overflow-y-auto p-3">
|
| 156 |
+
<div className="text-xs font-semibold text-gray-400 uppercase tracking-wider mb-2">
|
| 157 |
+
Loaded Datasets
|
| 158 |
+
</div>
|
| 159 |
+
{datasets.length === 0 ? (
|
| 160 |
+
<p className="text-xs text-gray-500 italic">No datasets loaded</p>
|
| 161 |
+
) : (
|
| 162 |
+
<div className="space-y-1">
|
| 163 |
+
{datasets.map((ds) => (
|
| 164 |
+
<div key={ds.id}>
|
| 165 |
+
<div
|
| 166 |
+
onClick={() => {
|
| 167 |
+
if (ds.presetId) {
|
| 168 |
+
setEditingDatasetId(editingDatasetId === ds.id ? null : ds.id);
|
| 169 |
+
setEditPresetName(ds.presetName || "");
|
| 170 |
+
setShowAddForm(false);
|
| 171 |
+
}
|
| 172 |
+
onSelectDataset(ds.id);
|
| 173 |
+
}}
|
| 174 |
+
className={`flex items-center gap-2 px-2 py-1.5 rounded text-sm transition-colors cursor-pointer ${
|
| 175 |
+
ds.active ? "bg-gray-800" : "bg-gray-900 opacity-60"
|
| 176 |
+
} ${editingDatasetId === ds.id ? "ring-1 ring-emerald-500" : "hover:bg-gray-800"}`}
|
| 177 |
+
>
|
| 178 |
+
<input
|
| 179 |
+
type="checkbox"
|
| 180 |
+
checked={ds.active}
|
| 181 |
+
onChange={() => onToggleDataset(ds.id)}
|
| 182 |
+
onClick={(e) => e.stopPropagation()}
|
| 183 |
+
className="accent-emerald-500 shrink-0"
|
| 184 |
+
/>
|
| 185 |
+
<div className="flex-1 min-w-0">
|
| 186 |
+
<div
|
| 187 |
+
className="text-xs font-medium text-gray-200 truncate"
|
| 188 |
+
title={ds.presetName ? `${ds.presetName}\n${ds.repo}` : ds.repo}
|
| 189 |
+
>
|
| 190 |
+
{ds.presetName || ds.name}
|
| 191 |
+
</div>
|
| 192 |
+
<div className="text-[10px] text-gray-500">
|
| 193 |
+
{ds.metadata.model} | {ds.n_examples} examples
|
| 194 |
+
</div>
|
| 195 |
+
</div>
|
| 196 |
+
{/* Save as preset bookmark */}
|
| 197 |
+
<button
|
| 198 |
+
onClick={(e) => {
|
| 199 |
+
e.stopPropagation();
|
| 200 |
+
setSavingPresetForId(savingPresetForId === ds.id ? null : ds.id);
|
| 201 |
+
setPresetName(ds.presetName || ds.name);
|
| 202 |
+
}}
|
| 203 |
+
className={`transition-colors shrink-0 ${
|
| 204 |
+
savingPresetForId === ds.id
|
| 205 |
+
? "text-emerald-400"
|
| 206 |
+
: ds.presetId
|
| 207 |
+
? "text-emerald-500"
|
| 208 |
+
: "text-gray-600 hover:text-emerald-400"
|
| 209 |
+
}`}
|
| 210 |
+
title={ds.presetId ? "Saved as preset" : "Save as preset"}
|
| 211 |
+
>
|
| 212 |
+
<svg
|
| 213 |
+
className="w-3.5 h-3.5"
|
| 214 |
+
fill={ds.presetId ? "currentColor" : "none"}
|
| 215 |
+
viewBox="0 0 24 24"
|
| 216 |
+
stroke="currentColor"
|
| 217 |
+
>
|
| 218 |
+
<path
|
| 219 |
+
strokeLinecap="round"
|
| 220 |
+
strokeLinejoin="round"
|
| 221 |
+
strokeWidth={2}
|
| 222 |
+
d="M5 5a2 2 0 012-2h10a2 2 0 012 2v16l-7-3.5L5 21V5z"
|
| 223 |
+
/>
|
| 224 |
+
</svg>
|
| 225 |
+
</button>
|
| 226 |
+
{/* Remove */}
|
| 227 |
+
<button
|
| 228 |
+
onClick={(e) => {
|
| 229 |
+
e.stopPropagation();
|
| 230 |
+
onRemoveDataset(ds.id);
|
| 231 |
+
}}
|
| 232 |
+
className="text-gray-600 hover:text-red-400 transition-colors shrink-0"
|
| 233 |
+
title="Remove"
|
| 234 |
+
>
|
| 235 |
+
<svg
|
| 236 |
+
className="w-3.5 h-3.5"
|
| 237 |
+
fill="none"
|
| 238 |
+
viewBox="0 0 24 24"
|
| 239 |
+
stroke="currentColor"
|
| 240 |
+
>
|
| 241 |
+
<path
|
| 242 |
+
strokeLinecap="round"
|
| 243 |
+
strokeLinejoin="round"
|
| 244 |
+
strokeWidth={2}
|
| 245 |
+
d="M6 18L18 6M6 6l12 12"
|
| 246 |
+
/>
|
| 247 |
+
</svg>
|
| 248 |
+
</button>
|
| 249 |
+
</div>
|
| 250 |
+
|
| 251 |
+
{/* Inline preset name input */}
|
| 252 |
+
{savingPresetForId === ds.id && (
|
| 253 |
+
<div className="flex gap-1 mt-1 ml-6">
|
| 254 |
+
<input
|
| 255 |
+
type="text"
|
| 256 |
+
value={presetName}
|
| 257 |
+
onChange={(e) => setPresetName(e.target.value)}
|
| 258 |
+
onKeyDown={(e) => {
|
| 259 |
+
if (e.key === "Enter") handleSavePresetForRepo(ds);
|
| 260 |
+
if (e.key === "Escape") setSavingPresetForId(null);
|
| 261 |
+
}}
|
| 262 |
+
placeholder="Preset name..."
|
| 263 |
+
className="flex-1 px-2 py-1 text-xs bg-gray-800 border border-gray-600 rounded text-gray-200 placeholder-gray-500 focus:border-emerald-500 focus:outline-none"
|
| 264 |
+
autoFocus
|
| 265 |
+
/>
|
| 266 |
+
<button
|
| 267 |
+
onClick={() => handleSavePresetForRepo(ds)}
|
| 268 |
+
className="px-2 py-1 text-xs bg-emerald-600 hover:bg-emerald-500 rounded text-white"
|
| 269 |
+
>
|
| 270 |
+
Save
|
| 271 |
+
</button>
|
| 272 |
+
</div>
|
| 273 |
+
)}
|
| 274 |
+
</div>
|
| 275 |
+
))}
|
| 276 |
+
</div>
|
| 277 |
+
)}
|
| 278 |
+
</div>
|
| 279 |
+
|
| 280 |
+
{/* Preset edit panel */}
|
| 281 |
+
{editingDatasetId &&
|
| 282 |
+
(() => {
|
| 283 |
+
const editDs = datasets.find((d) => d.id === editingDatasetId);
|
| 284 |
+
if (!editDs?.presetId) return null;
|
| 285 |
+
return (
|
| 286 |
+
<div className="p-3 border-t border-gray-700 space-y-2">
|
| 287 |
+
<div className="text-[10px] text-gray-500 uppercase font-semibold tracking-wider">
|
| 288 |
+
Edit Preset
|
| 289 |
+
</div>
|
| 290 |
+
<input
|
| 291 |
+
type="text"
|
| 292 |
+
value={editPresetName}
|
| 293 |
+
onChange={(e) => setEditPresetName(e.target.value)}
|
| 294 |
+
onKeyDown={(e) => {
|
| 295 |
+
if (e.key === "Enter" && editPresetName.trim()) {
|
| 296 |
+
handleUpdatePreset(editDs.presetId!, editDs.id);
|
| 297 |
+
}
|
| 298 |
+
if (e.key === "Escape") setEditingDatasetId(null);
|
| 299 |
+
}}
|
| 300 |
+
placeholder="Preset name..."
|
| 301 |
+
className="w-full px-2 py-1 text-xs bg-gray-800 border border-gray-600 rounded text-gray-200 placeholder-gray-500 focus:border-emerald-500 focus:outline-none"
|
| 302 |
+
autoFocus
|
| 303 |
+
/>
|
| 304 |
+
<div className="flex gap-2">
|
| 305 |
+
<button
|
| 306 |
+
onClick={() => handleUpdatePreset(editDs.presetId!, editDs.id)}
|
| 307 |
+
disabled={!editPresetName.trim()}
|
| 308 |
+
className="flex-1 px-2 py-1 text-xs bg-emerald-600 hover:bg-emerald-500 disabled:bg-gray-700 disabled:text-gray-500 rounded text-white transition-colors"
|
| 309 |
+
>
|
| 310 |
+
Save
|
| 311 |
+
</button>
|
| 312 |
+
<button
|
| 313 |
+
onClick={() => handleDeletePreset(editDs.presetId!, editDs.id)}
|
| 314 |
+
className="px-2 py-1 text-xs bg-red-900 hover:bg-red-800 rounded text-red-300 transition-colors"
|
| 315 |
+
>
|
| 316 |
+
Delete
|
| 317 |
+
</button>
|
| 318 |
+
<button
|
| 319 |
+
onClick={() => setEditingDatasetId(null)}
|
| 320 |
+
className="px-2 py-1 text-xs bg-gray-700 hover:bg-gray-600 rounded text-gray-300 transition-colors"
|
| 321 |
+
>
|
| 322 |
+
Cancel
|
| 323 |
+
</button>
|
| 324 |
+
</div>
|
| 325 |
+
</div>
|
| 326 |
+
);
|
| 327 |
+
})()}
|
| 328 |
+
|
| 329 |
+
{/* Add Dataset Form */}
|
| 330 |
+
<div className="p-3 border-t border-gray-700">
|
| 331 |
+
{showAddForm ? (
|
| 332 |
+
<div className="space-y-2">
|
| 333 |
+
<input
|
| 334 |
+
className="w-full bg-gray-800 text-sm text-gray-200 rounded px-2 py-1.5 border border-gray-600 focus:border-emerald-500 outline-none"
|
| 335 |
+
placeholder="org/repo-name"
|
| 336 |
+
value={repo}
|
| 337 |
+
onChange={(e) => setRepo(e.target.value)}
|
| 338 |
+
onKeyDown={(e) => e.key === "Enter" && handleAdd()}
|
| 339 |
+
autoFocus
|
| 340 |
+
/>
|
| 341 |
+
<div className="flex gap-2">
|
| 342 |
+
<input
|
| 343 |
+
className="flex-1 bg-gray-800 text-xs text-gray-200 rounded px-2 py-1 border border-gray-600 focus:border-emerald-500 outline-none"
|
| 344 |
+
placeholder="Config"
|
| 345 |
+
value={config}
|
| 346 |
+
onChange={(e) => setConfig(e.target.value)}
|
| 347 |
+
/>
|
| 348 |
+
<input
|
| 349 |
+
className="w-16 bg-gray-800 text-xs text-gray-200 rounded px-2 py-1 border border-gray-600 focus:border-emerald-500 outline-none"
|
| 350 |
+
placeholder="Split"
|
| 351 |
+
value={split}
|
| 352 |
+
onChange={(e) => setSplit(e.target.value)}
|
| 353 |
+
/>
|
| 354 |
+
</div>
|
| 355 |
+
<div className="flex gap-2">
|
| 356 |
+
<button
|
| 357 |
+
className="flex-1 px-2 py-1.5 text-sm bg-emerald-600 hover:bg-emerald-500 disabled:bg-gray-700 disabled:text-gray-500 rounded text-white transition-colors"
|
| 358 |
+
onClick={handleAdd}
|
| 359 |
+
disabled={!repo.trim() || !!loading[repo.trim()]}
|
| 360 |
+
>
|
| 361 |
+
{loading[repo.trim()] ? "Loading..." : "Load"}
|
| 362 |
+
</button>
|
| 363 |
+
<button
|
| 364 |
+
className="px-3 py-1.5 text-sm bg-gray-700 hover:bg-gray-600 rounded text-gray-300 transition-colors"
|
| 365 |
+
onClick={() => setShowAddForm(false)}
|
| 366 |
+
>
|
| 367 |
+
Cancel
|
| 368 |
+
</button>
|
| 369 |
+
</div>
|
| 370 |
+
</div>
|
| 371 |
+
) : (
|
| 372 |
+
<button
|
| 373 |
+
className="w-full px-3 py-2 text-sm bg-emerald-600 hover:bg-emerald-500 rounded text-white font-medium transition-colors"
|
| 374 |
+
onClick={() => {
|
| 375 |
+
setEditingDatasetId(null);
|
| 376 |
+
setShowAddForm(true);
|
| 377 |
+
setRepo("");
|
| 378 |
+
setConfig("rlm_call_traces");
|
| 379 |
+
setSplit("train");
|
| 380 |
+
}}
|
| 381 |
+
>
|
| 382 |
+
+ Add Dataset
|
| 383 |
+
</button>
|
| 384 |
+
)}
|
| 385 |
+
</div>
|
| 386 |
+
</div>
|
| 387 |
+
);
|
| 388 |
+
}
|
frontend/src/rlm-eval/store.ts
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useState, useCallback, useEffect, useMemo } from "react";
|
| 2 |
+
import type {
|
| 3 |
+
DatasetInfo,
|
| 4 |
+
Preset,
|
| 5 |
+
PanelNav,
|
| 6 |
+
OverviewData,
|
| 7 |
+
ExampleDetailData,
|
| 8 |
+
RlmIterDetail,
|
| 9 |
+
} from "./types";
|
| 10 |
+
import { api } from "./api";
|
| 11 |
+
|
| 12 |
+
/**
 * Central state hook for the RLM eval viewer.
 *
 * Owns the list of loaded datasets, the preset list, the two navigation
 * panels (A and B), the comparison-mode flag, and three in-memory caches
 * (overview, example detail, iteration detail) filled lazily by the
 * `fetch*` helpers.
 *
 * @returns An object exposing the current state values together with the
 *   callbacks that mutate them (see the `return` statement for the full list).
 */
export function useAppState() {
  const [datasets, setDatasets] = useState<DatasetInfo[]>([]);
  const [presets, setPresets] = useState<Preset[]>([]);
  const [error, setError] = useState<string | null>(null);
  // Keyed by the `repo` string passed to `addDataset`; true while that load is in flight.
  const [loading, setLoading] = useState<Record<string, boolean>>({});

  // Dual panel navigation
  const [panelA, setPanelA] = useState<PanelNav | null>(null);
  const [panelB, setPanelB] = useState<PanelNav | null>(null);
  const [comparisonMode, setComparisonMode] = useState(false);

  // Data caches
  // overviewCache is keyed by dataset id, exampleDetailCache by
  // `${dsId}:${exampleIdx}`, iterDetailCache by `${dsId}:${exampleIdx}:${rlmIter}`.
  const [overviewCache, setOverviewCache] = useState<Record<string, OverviewData>>({});
  const [exampleDetailCache, setExampleDetailCache] = useState<Record<string, ExampleDetailData>>({});
  const [iterDetailCache, setIterDetailCache] = useState<Record<string, RlmIterDetail>>({});

  // Load presets on mount
  // Best-effort: a failed preset fetch is deliberately ignored and leaves the list empty.
  useEffect(() => {
    api.listPresets().then((data) => setPresets(data as unknown as Preset[])).catch(() => {});
  }, []);

  // Datasets whose `active` flag is set.
  const activeDatasets = useMemo(() => datasets.filter((d) => d.active), [datasets]);

  // Data fetching helpers
  // Each helper returns the cached value when present; otherwise it calls the
  // API, stores the result in the matching cache, and returns it.
  const fetchOverview = useCallback(async (dsId: string) => {
    if (overviewCache[dsId]) return overviewCache[dsId];
    const data = (await api.getOverview(dsId)) as unknown as OverviewData;
    setOverviewCache((prev) => ({ ...prev, [dsId]: data }));
    return data;
  }, [overviewCache]);

  const fetchExampleDetail = useCallback(
    async (dsId: string, exampleIdx: number) => {
      const key = `${dsId}:${exampleIdx}`;
      if (exampleDetailCache[key]) return exampleDetailCache[key];
      const data = (await api.getExampleDetail(dsId, exampleIdx)) as unknown as ExampleDetailData;
      setExampleDetailCache((prev) => ({ ...prev, [key]: data }));
      return data;
    },
    [exampleDetailCache]
  );

  const fetchIterDetail = useCallback(
    async (dsId: string, exampleIdx: number, rlmIter: number) => {
      const key = `${dsId}:${exampleIdx}:${rlmIter}`;
      if (iterDetailCache[key]) return iterDetailCache[key];
      const data = (await api.getIterDetail(dsId, exampleIdx, rlmIter)) as unknown as RlmIterDetail;
      setIterDetailCache((prev) => ({ ...prev, [key]: data }));
      return data;
    },
    [iterDetailCache]
  );

  // Dataset operations
  // Loads a dataset through the API and appends it (marked active) unless a
  // dataset with the same id is already present. Failures are surfaced via
  // `error`; the `loading[repo]` flag is cleared in every case.
  const addDataset = useCallback(
    async (repo: string, config?: string, split?: string, presetId?: string, presetName?: string) => {
      setLoading((prev) => ({ ...prev, [repo]: true }));
      setError(null);
      try {
        const result = await api.loadDataset(repo, config, split);
        const dsInfo: DatasetInfo = {
          id: result.id,
          repo: result.repo,
          name: result.name,
          config: result.config,
          split: result.split,
          metadata: result.metadata as unknown as DatasetInfo["metadata"],
          n_examples: result.n_examples,
          n_rows: result.n_rows,
          active: true,
          presetId,
          presetName,
        };

        setDatasets((prev) => {
          if (prev.some((d) => d.id === dsInfo.id)) return prev;
          return [...prev, dsInfo];
        });

        // Auto-set panel A if not set
        setPanelA((prev) => prev || { datasetId: dsInfo.id, level: 1 });
      } catch (e: unknown) {
        setError(e instanceof Error ? e.message : "Failed to load dataset");
      } finally {
        setLoading((prev) => ({ ...prev, [repo]: false }));
      }
    },
    []
  );

  // Asks the API to unload the dataset (errors ignored), then drops it from
  // the list and clears any panel that was pointing at it.
  const removeDataset = useCallback(async (id: string) => {
    await api.unloadDataset(id).catch(() => {});
    setDatasets((prev) => prev.filter((d) => d.id !== id));
    setPanelA((prev) => (prev?.datasetId === id ? null : prev));
    setPanelB((prev) => (prev?.datasetId === id ? null : prev));
  }, []);

  // Flips the `active` flag of the dataset with the given id.
  const toggleDataset = useCallback((id: string) => {
    setDatasets((prev) => prev.map((d) => (d.id === id ? { ...d, active: !d.active } : d)));
  }, []);

  // Navigation
  // Replaces the navigation state of the chosen panel.
  const navigatePanel = useCallback(
    (panel: "A" | "B", nav: PanelNav) => {
      if (panel === "A") setPanelA(nav);
      else setPanelB(nav);
    },
    []
  );

  // Moves the chosen panel from level 2 back to level 1 and clears its
  // example index; a panel that is unset or already at level 1 is unchanged.
  const goUp = useCallback((panel: "A" | "B") => {
    const setter = panel === "A" ? setPanelA : setPanelB;
    setter((prev) => {
      if (!prev) return prev;
      if (prev.level === 2) return { ...prev, level: 1, exampleIdx: undefined };
      return prev;
    });
  }, []);

  // Renames the preset label stored on a loaded dataset (local state only).
  const updateDatasetPresetName = useCallback((dsId: string, name: string) => {
    setDatasets((prev) => prev.map((d) => (d.id === dsId ? { ...d, presetName: name } : d)));
  }, []);

  // Detaches a loaded dataset from its preset (local state only).
  const clearDatasetPreset = useCallback((dsId: string) => {
    setDatasets((prev) =>
      prev.map((d) => (d.id === dsId ? { ...d, presetId: undefined, presetName: undefined } : d))
    );
  }, []);

  // Toggles comparison mode. Entering it seeds panel B with a copy of panel A
  // (when panel A is set); leaving it clears panel B.
  // The panel-B update is issued next to, not inside, the comparison-mode
  // update: state updater functions must stay pure because React may invoke
  // them more than once (e.g. under StrictMode).
  const toggleComparison = useCallback(() => {
    if (comparisonMode) {
      setPanelB(null);
    } else if (panelA) {
      // Entering comparison: initialize panel B same as A
      setPanelB({ ...panelA });
    }
    setComparisonMode(!comparisonMode);
  }, [comparisonMode, panelA]);

  return {
    datasets,
    presets,
    setPresets,
    error,
    setError,
    loading,
    activeDatasets,
    panelA,
    panelB,
    comparisonMode,
    addDataset,
    removeDataset,
    toggleDataset,
    updateDatasetPresetName,
    clearDatasetPreset,
    navigatePanel,
    goUp,
    toggleComparison,
    fetchOverview,
    fetchExampleDetail,
    fetchIterDetail,
    overviewCache,
    exampleDetailCache,
    iterDetailCache,
  };
}
|
frontend/src/rlm-eval/types.ts
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
 * Descriptive metadata attached to a loaded dataset / experiment run.
 * NOTE(review): field meanings are inferred from their names — confirm against the backend payload.
 */
export interface ExperimentMetadata {
  run_id: string;
  method: string;
  model: string;
}
|
| 6 |
+
|
| 7 |
+
/**
 * A dataset entry as held in the client-side dataset list.
 * `active`, `presetId` and `presetName` are client-side fields; the remaining
 * fields are copied from the load-dataset API result.
 */
export interface DatasetInfo {
  id: string;
  repo: string;
  name: string;
  config: string;
  split: string;
  metadata: ExperimentMetadata;
  n_examples: number;
  n_rows: number;
  // Toggled from the UI; selects the datasets exposed as "active".
  active: boolean;
  // Set when the dataset was loaded from / saved as a preset.
  presetId?: string;
  presetName?: string;
}
|
| 20 |
+
|
| 21 |
+
/**
 * One row of the dataset overview: a per-example summary.
 * NOTE(review): units of the token/time totals are not visible here — confirm with the backend.
 */
export interface ExampleSummary {
  example_idx: number;
  question_text: string;
  // null presumably means "no evaluation result available" — TODO confirm.
  eval_correct: boolean | null;
  n_iterations: number;
  total_input_tokens: number;
  total_output_tokens: number;
  total_execution_time: number;
  final_answer_preview: string;
}
|
| 31 |
+
|
| 32 |
+
/** Overview payload for a dataset: its metadata plus one summary per example. */
export interface OverviewData {
  metadata: ExperimentMetadata;
  examples: ExampleSummary[];
}
|
| 36 |
+
|
| 37 |
+
/**
 * Summary of a single RLM iteration as listed inside an example's detail view.
 * NOTE(review): timestamp format and time units are not visible here — confirm with the backend.
 */
export interface RlmIterSummary {
  rlm_iter: number;
  model: string;
  input_tokens: number;
  output_tokens: number;
  execution_time: number;
  has_code_blocks: boolean;
  n_code_blocks: number;
  response_preview: string;
  has_final_answer: boolean;
  timestamp: string;
}
|
| 49 |
+
|
| 50 |
+
/** Detail payload for one example: aggregate figures plus its iteration summaries. */
export interface ExampleDetailData {
  example_idx: number;
  question_text: string;
  // null presumably means "no evaluation result available" — TODO confirm.
  eval_correct: boolean | null;
  total_input_tokens: number;
  total_output_tokens: number;
  total_execution_time: number;
  // null presumably means no final answer was produced — TODO confirm.
  final_answer: string | null;
  iterations: RlmIterSummary[];
}
|
| 60 |
+
|
| 61 |
+
/** A code snippet from an iteration, optionally paired with captured stdout. */
export interface CodeBlock {
  code: string;
  stdout?: string;
}
|
| 65 |
+
|
| 66 |
+
/**
 * Full detail of a single RLM iteration, including prompt, response and code blocks.
 * NOTE(review): timestamp format and time units are not visible here — confirm with the backend.
 */
export interface RlmIterDetail {
  rlm_iter: number;
  prompt: string;
  response: string;
  model: string;
  input_tokens: number;
  output_tokens: number;
  execution_time: number;
  has_code_blocks: boolean;
  code_blocks: CodeBlock[];
  // null presumably means this iteration produced no final answer — TODO confirm.
  final_answer: string | null;
  timestamp: string;
}
|
| 79 |
+
|
| 80 |
+
/** A saved dataset shortcut: a named repo/config (and optional split) combination. */
export interface Preset {
  id: string;
  name: string;
  repo: string;
  config: string;
  split?: string;
}
|
| 87 |
+
|
| 88 |
+
/**
 * Navigation state of one viewer panel.
 * NOTE(review): level 1 presumably is the dataset overview and level 2 the
 * example detail view (where `exampleIdx` is set) — confirm against the panel component.
 */
export interface PanelNav {
  datasetId: string;
  level: 1 | 2;
  exampleIdx?: number;
}
|