Spaces:
Running
Running
File size: 13,866 Bytes
485d935 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 | """Build viz data for a single sweep + register it under a benchmark.
A **benchmark** (e.g. Data-Agent Bench, DABstep Bench) is a curated collection of
tasks plus metadata about where the tasks came from. A **sweep** is one run of N
models × M harnesses × k attempts over the benchmark's tasks.
This script does three things per invocation:
1. Generate `<out>/summary.json` — heatmap + per-attempt metadata for one sweep.
   Tasks in the summary are enriched with `answer` (the gold answer), `question`,
   `reward_mode`, normalized `difficulty_level`, via the benchmark's PROFILE.
2. Generate `<out>/traces.json` — one big dict of per-attempt trajectories,
   synced separately to a bucket.
3. Update `<out>/../benchmarks.json` — the site-wide index that the viewer
   reads to render the benchmark toggle + info icon.
Output layout (`viz_server/site/` by default):
  site/
  ├── benchmarks.json                ← registry: benchmark → [sweeps]
  ├── viewer.html
  ├── v1/{summary,traces}.json       ← sweep under data-agent-bench
  └── dabstep/{summary,traces}.json  ← sweep under dabstep-bench
Usage:
python viz_server/build_data.py \\
--name dabstep \\
--suite rl/harbor/tasks/dabstep-workdir \\
--benchmark dabstep-bench \\
--out viz_server/site/dabstep
"""
from __future__ import annotations
import argparse
import json
import re
import sys
import tomllib
from pathlib import Path
# This lives at <repo>/viz_server/; the data-builder it reuses lives in <repo>/rl.
_RL = Path(__file__).resolve().parents[1] / "rl"
sys.path.insert(0, str(_RL))
from scripts.eval_sweep_viz import ( # noqa: E402
build_data, HTML_TEMPLATE, SWEEP_BASE, VERIFIED_SUITE, JOBS_DIR,
)
# Default output root: the `site/` directory next to this script. `main()`
# writes each sweep to `SITE_DIR / <name>` unless `--out` is given.
SITE_DIR = Path(__file__).resolve().parent / "site"
# Heavy per-attempt fields moved out of summary into traces.json.
# `split()` pops exactly these keys from every attempt dict.
_HEAVY = ("trajectory", "grader_text", "opencode_log", "trial_log_tail")
# ─────────────────────────────────────────────────────────────────────────────
# Benchmark profiles — declarative metadata extractors per task layout
# ─────────────────────────────────────────────────────────────────────────────
# Registry of known benchmarks, keyed by the value accepted for `--benchmark`.
#
# Each entry carries the display metadata that is copied into benchmarks.json
# ("label", "description", "source") plus a "profile" describing where
# task-level metadata lives for that benchmark's task layout.
#
# NOTE(review): within this file only the profile keys `difficulty_map`,
# `gold_answer_file`, `question_file` and `reward_mode_const` are read (by
# `enrich_tasks`); the `(file, key)` tuples look declarative — confirm whether
# another consumer relies on them.
BENCHMARKS = {
    "data-agent-bench": {
        "label": "Data-Agent Bench",
        "description": "Verified data-analysis tasks over Kaggle datasets. "
                       "Each task is a (question, gold-answer) pair tested with a "
                       "mode-aware grader (exact / numeric / flexible / LLM-judge).",
        "source": {
            "harbor": "AdithyaSK/data_agent_rl_environment_eval-harbor",
            "hf_dataset": "AdithyaSK/data_agent_rl_environment_eval",
        },
        # Profile: how to extract task metadata from this benchmark's task.toml layout.
        "profile": {
            "difficulty_level": ("metadata.toml", "difficulty_level"),  # already int 0-4
            "gold_answer": ("metadata.toml", "gold_answer"),
            "kaggle": ("metadata.toml", "kaggle_dataset_name"),
            "reward_mode": ("metadata.toml", "reward_mode_initial"),
            "question": ("task.toml", "description"),
        },
    },
    "dabstep-bench": {
        "label": "DABstep Bench",
        "description": "Data Agent Benchmark for Multi-step Reasoning \u2014 450 financial-"
                       "analytics factoid tasks over a shared Adyen-payments corpus. "
                       "Released by Adyen + Hugging Face (arXiv:2506.23719).",
        "source": {
            "harbor": "AdithyaSK/dabstep-harbor",
            "hf_dataset": "adyen/DABstep",
            "paper": "https://arxiv.org/abs/2506.23719",
            "leaderboard": "https://huggingface.co/spaces/adyen/DABstep",
        },
        "profile": {
            # DABstep stores `difficulty = "easy"|"hard"`; we also injected
            # `difficulty_level` (0|4) during the /workdir rewrite, prefer that.
            "difficulty_level": ("metadata.toml", "difficulty_level"),
            "difficulty_map": {"easy": 0, "hard": 4},
            "difficulty_str": ("metadata.toml", "difficulty"),
            # Gold lives in solution/solve.sh; it is extracted with a regex.
            "gold_answer_file": "solution/solve.sh",
            "question_file": "instruction.md",
            # Static for DABstep: it uses its own fuzzy scorer.
            "reward_mode_const": "dabstep-fuzzy",
        },
    },
}
# ─────────────────────────────────────────────────────────────────────────────
# Metadata enrichment
# ─────────────────────────────────────────────────────────────────────────────
# Pulls the value out of `solution/solve.sh` style:
#     cat <<'ANSWER_EOF' > /workdir/answer.txt
#     <gold>
#     ANSWER_EOF
# Group 1 captures everything between the heredoc opener line and the
# `ANSWER_EOF` terminator; DOTALL lets the gold answer span several lines.
_DABSTEP_GOLD_RE = re.compile(
    r"cat\s*<<['\"]?ANSWER_EOF['\"]?\s*>\s*/workdir/answer\.txt\s*\n(.*?)\nANSWER_EOF",
    re.DOTALL,
)


def _read_solve_gold(path: Path) -> str:
    """Extract the gold answer string from a solve.sh heredoc.

    Args:
        path: Location of the solve script.

    Returns:
        The heredoc payload with surrounding whitespace stripped, or ``""``
        when *path* is not a regular file or contains no matching heredoc.
    """
    # `is_file()` (rather than `exists()`) also rejects directories, which
    # would otherwise make `read_text` raise.
    if not path.is_file():
        return ""
    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    script = path.read_text(encoding="utf-8")
    match = _DABSTEP_GOLD_RE.search(script)
    return match.group(1).strip() if match else ""
def _read_instruction(path: Path, max_chars: int = 4000) -> str:
    """Read an instruction file and trim it to a UI-friendly length.

    The UI shows this text in a side panel, so very long instructions are cut.

    Args:
        path: Location of the instruction file (e.g. ``instruction.md``).
        max_chars: Maximum number of characters kept before truncation.

    Returns:
        The stripped file text; when it is longer than *max_chars*, the first
        *max_chars* characters followed by a ``"\\n…[truncated]"`` marker.
        ``""`` when *path* is not a regular file.
    """
    # `is_file()` also rejects directories, which `read_text` cannot open.
    if not path.is_file():
        return ""
    # Decode explicitly as UTF-8 so the result does not depend on the locale.
    text = path.read_text(encoding="utf-8").strip()
    if len(text) <= max_chars:
        return text
    return text[:max_chars] + "\n\u2026[truncated]"
def enrich_tasks(data: dict, suite: Path, benchmark_key: str) -> None:
    """Mutate `data['tasks']` to add gold/question/difficulty per the benchmark profile.

    Only fills fields that are currently empty/falsy, so running it again on
    the same data is harmless. Per the original author's note, the base
    `build_data` output already has `difficulty_level`, `question`, `answer`,
    `kaggle` and `reward_mode` populated from task.toml's [metadata]; this
    function is for benchmarks whose task layout does not carry that metadata
    in [metadata] directly.

    Args:
        data: Sweep data; each item of ``data["tasks"]`` is a dict with an
            ``"id"`` key naming its directory under *suite*.
        suite: Root directory of the task suite.
        benchmark_key: Key into ``BENCHMARKS`` selecting the profile to apply.

    Raises:
        KeyError: If a task dict has no ``"id"``.
        tomllib.TOMLDecodeError: If a task's ``task.toml`` is malformed.
        ValueError: If ``difficulty_level`` in task.toml is not int-convertible.
    """
    profile = BENCHMARKS.get(benchmark_key, {}).get("profile", {})
    if not profile:
        return  # no enrichment configured for this benchmark; keep base output as-is

    # Profile lookups do not change per task, so resolve them once.
    gold_file = profile.get("gold_answer_file")
    question_file = profile.get("question_file")
    reward_const = profile.get("reward_mode_const")

    for task in data.get("tasks", []):
        task_dir = suite / task["id"]

        # Difficulty: prefer an existing non-zero int; otherwise read task.toml
        # and fall back to mapping the string field through the profile.
        if not task.get("difficulty_level"):
            toml_path = task_dir / "task.toml"
            if toml_path.is_file():
                # TOML is UTF-8 by specification; loading from a binary handle
                # avoids decoding with the platform's locale encoding.
                with toml_path.open("rb") as fh:
                    meta = tomllib.load(fh).get("metadata", {})
                if "difficulty_level" in meta:
                    task["difficulty_level"] = int(meta["difficulty_level"])
                elif "difficulty" in meta and "difficulty_map" in profile:
                    # Unknown difficulty strings map to 0.
                    task["difficulty_level"] = profile["difficulty_map"].get(
                        meta["difficulty"], 0
                    )

        # Gold answer: parse solution/solve.sh if the profile points there.
        if not task.get("answer") and gold_file:
            task["answer"] = _read_solve_gold(task_dir / gold_file)

        # Question: read instruction.md if the profile points there.
        if not task.get("question") and question_file:
            task["question"] = _read_instruction(task_dir / question_file)

        # Reward mode: static label if the profile says so.
        if not task.get("reward_mode") and reward_const:
            task["reward_mode"] = reward_const

        # Kaggle: left untouched here when the benchmark has no Kaggle dependency.
# ─────────────────────────────────────────────────────────────────────────────
# Split heavy → traces.json
# ─────────────────────────────────────────────────────────────────────────────
def split(data: dict, out: Path) -> dict:
    """Move heavy per-attempt fields into ``<out>/traces.json``.

    Every attempt found under ``data["tasks"][*]["cells"][*]["attempts"]`` is
    numbered in traversal order. Its ``_HEAVY`` fields are popped into the
    traces mapping under that number (as a string key; absent fields are
    stored as ``None``), and the number is written back onto the attempt as
    ``"tid"``.

    Args:
        data: Sweep data; mutated in place.
        out: Existing directory that receives ``traces.json``.

    Returns:
        The same *data* object, now without the heavy fields.
    """
    # Flatten the task -> cell -> attempt nesting into one ordered stream.
    every_attempt = (
        attempt
        for task in data.get("tasks", [])
        for cell in task.get("cells", {}).values()
        for attempt in cell.get("attempts", [])
    )

    traces: dict[str, dict] = {}
    for tid, attempt in enumerate(every_attempt):
        heavy_fields = {}
        for field in _HEAVY:
            heavy_fields[field] = attempt.pop(field, None)
        traces[str(tid)] = heavy_fields
        attempt["tid"] = tid

    # Compact separators keep the (large) traces file as small as possible.
    payload = json.dumps(traces, separators=(",", ":"), default=str)
    (out / "traces.json").write_text(payload)
    return data
# ─────────────────────────────────────────────────────────────────────────────
# benchmarks.json registry — merged into existing on every run
# ─────────────────────────────────────────────────────────────────────────────
def update_benchmarks_registry(site_dir: Path, benchmark_key: str, sweep_key: str) -> None:
    """Ensure benchmarks.json exists at site_dir/, has the benchmark, and registers the sweep.

    The existing registry is loaded and merged into. Label, description and
    source are always refreshed from ``BENCHMARKS``; the sweep key is appended
    to the benchmark's ``"sweeps"`` list (kept sorted) if not already present.

    Args:
        site_dir: Existing directory that holds (or will hold) benchmarks.json.
        benchmark_key: Registry key of the benchmark the sweep belongs to.
        sweep_key: Identifier of the sweep to register.
    """
    reg_path = site_dir / "benchmarks.json"
    reg: dict = {}
    if reg_path.exists():
        try:
            loaded = json.loads(reg_path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, UnicodeDecodeError):
            loaded = None
        if isinstance(loaded, dict):
            reg = loaded
        else:
            # Unreadable or non-object JSON: start over, but say so, because
            # this drops every previously registered sweep.
            print(f"[build] warning: {reg_path} is not a valid registry; rebuilding it",
                  file=sys.stderr)

    bench = BENCHMARKS.get(benchmark_key)
    if not bench:
        # Unknown benchmark: write a minimal entry so the toggle still works.
        bench = {"label": benchmark_key, "description": "", "source": {}}

    entry = reg.get(benchmark_key)
    if not isinstance(entry, dict):
        entry = reg[benchmark_key] = {}

    # Always refresh labels/descriptions from current BENCHMARKS dict (single source of truth).
    entry["label"] = bench["label"]
    entry["description"] = bench["description"]
    entry["source"] = bench["source"]

    # Tolerate hand-edited entries whose "sweeps" is missing or not a list.
    sweeps = entry.get("sweeps")
    if not isinstance(sweeps, list):
        sweeps = entry["sweeps"] = []
    if sweep_key not in sweeps:
        sweeps.append(sweep_key)
        sweeps.sort()

    reg_path.write_text(json.dumps(reg, indent=2), encoding="utf-8")
# ─────────────────────────────────────────────────────────────────────────────
# CLI
# ─────────────────────────────────────────────────────────────────────────────
def main() -> None:
    """Parse CLI arguments, build one sweep's site data, and register the sweep.

    Writes ``summary.json`` and ``traces.json`` into the output directory and
    updates ``benchmarks.json`` in that directory's parent.
    """
    ap = argparse.ArgumentParser(description=__doc__,
                                 formatter_class=argparse.RawDescriptionHelpFormatter)
    ap.add_argument("--name", required=True,
                    help="Sweep folder under data/eval_sweep/, also used as the sweep key.")
    ap.add_argument("--benchmark", required=True, choices=sorted(BENCHMARKS.keys()),
                    help="Which benchmark this sweep belongs to.")
    ap.add_argument("--suite", required=True,
                    help="Path to the Harbor task suite (where task.toml/instruction.md/etc live).")
    ap.add_argument("--sweep-dir", default=None,
                    help="Override the sweep dir (default data/eval_sweep/<name>).")
    ap.add_argument("--jobs-dir", default=str(JOBS_DIR))
    ap.add_argument("--k-max", type=int, default=4, dest="k_max")
    ap.add_argument("--out", default=None,
                    help="Output dir (default viz_server/site/<name>).")
    args = ap.parse_args()

    sweep_dir = Path(args.sweep_dir) if args.sweep_dir else (SWEEP_BASE / args.name)
    # Make the output path absolute (without following symlinks) so that
    # `--out .` still yields a real directory name and a real parent; for a
    # bare "." both `.name` and `.parent` would otherwise be degenerate.
    out = (Path(args.out) if args.out else (SITE_DIR / args.name)).absolute()
    out.mkdir(parents=True, exist_ok=True)
    suite = Path(args.suite).resolve()

    # The app reads sweeps by their site/<dir>/ name, not by --name. Use that as
    # the canonical sweep key so the registry stays consistent with the layout.
    sweep_key = out.name
    print(f"[build] benchmark={args.benchmark} sweep_key={sweep_key} (raw name={args.name})")
    print(f" sweep_dir={sweep_dir}")
    print(f" suite={suite}")

    data = build_data(sweep_dir.resolve(), Path(args.jobs_dir).resolve(),
                      suite, args.k_max)
    enrich_tasks(data, suite, args.benchmark)

    # `split` writes traces.json itself and returns the slimmed-down data.
    summary = split(data, out)
    summary_path = out / "summary.json"
    summary_path.write_text(json.dumps(summary, separators=(",", ":"), default=str))

    # Update site-wide benchmarks.json registry (key by site dir, not --name).
    update_benchmarks_registry(out.parent, args.benchmark, sweep_key)

    # Report output sizes in MiB.
    sz = summary_path.stat().st_size / 1024 / 1024
    tz = (out / "traces.json").stat().st_size / 1024 / 1024
    print(f" summary.json: {sz:.1f} MB")
    print(f" traces.json: {tz:.1f} MB")
    print(f" benchmarks.json updated \u2192 {out.parent / 'benchmarks.json'}")


if __name__ == "__main__":
    main()
|