Praneshrajan15's picture
feat: initial playground deployment
5143557 verified
"""Benchmark report rendering and README marker updates."""
from __future__ import annotations
import json
from collections import defaultdict
from pathlib import Path
from typing import cast
from dataforge.bench.core import AggregateBenchmarkResult, BenchmarkRunOutput
def _format_metric(mean_value: float | None, std_value: float | None) -> str:
"""Format a mean/std metric cell for markdown tables."""
if mean_value is None:
return "Skipped"
if std_value is None:
return f"{mean_value:.4f}"
return f"{mean_value:.4f} +/- {std_value:.4f}"
def _render_table(headers: list[str], rows: list[list[str]]) -> str:
"""Render a simple markdown table."""
lines = [
"| " + " | ".join(headers) + " |",
"| " + " | ".join("---" for _ in headers) + " |",
]
for row in rows:
lines.append("| " + " | ".join(row) + " |")
return "\n".join(lines)
def load_agent_output(path: Path) -> BenchmarkRunOutput:
    """Read the agent comparison JSON file and validate it as a ``BenchmarkRunOutput``."""
    payload = json.loads(path.read_text(encoding="utf-8"))
    return BenchmarkRunOutput.model_validate(payload)
def load_sota_output(path: Path) -> dict[str, object]:
    """Read the citation-only SOTA comparison JSON file.

    Raises:
        ValueError: If the decoded JSON document is not an object (dict).
    """
    with path.open(encoding="utf-8") as handle:
        payload = json.load(handle)
    if isinstance(payload, dict):
        return cast(dict[str, object], payload)
    raise ValueError("SOTA comparison JSON must be a top-level object.")
def replace_benchmark_block(readme_text: str, block_text: str) -> str:
    """Replace the README benchmark marker block idempotently.

    Args:
        readme_text: Full README contents containing the
            ``<!-- BENCH:START -->`` and ``<!-- BENCH:END -->`` markers.
        block_text: Generated content to place between the markers;
            surrounding whitespace is stripped.

    Returns:
        The README text in which everything between the first start marker
        and the next end marker after it is replaced by ``block_text`` on its
        own lines. Both markers are kept.

    Raises:
        ValueError: If either marker is missing, or if no end marker follows
            the start marker.
    """
    start_marker = "<!-- BENCH:START -->"
    end_marker = "<!-- BENCH:END -->"
    if start_marker not in readme_text or end_marker not in readme_text:
        raise ValueError("README benchmark markers are missing.")
    start = readme_text.index(start_marker) + len(start_marker)
    # Search for the end marker only after the start marker: an earlier
    # occurrence would give end < start and splice duplicated text into the
    # README instead of replacing the block.
    end = readme_text.find(end_marker, start)
    if end == -1:
        raise ValueError("README benchmark end marker must follow the start marker.")
    return readme_text[:start] + "\n" + block_text.strip() + "\n" + readme_text[end:]
def _aggregate_across_datasets(aggregates: list[AggregateBenchmarkResult]) -> list[list[str]]:
"""Build a simple cross-dataset local summary table."""
grouped: dict[str, list[AggregateBenchmarkResult]] = defaultdict(list)
skipped: dict[str, str | None] = {}
for aggregate in aggregates:
if aggregate.status == "ok":
grouped[aggregate.method].append(aggregate)
else:
skipped.setdefault(aggregate.method, aggregate.skip_reason)
rows: list[list[str]] = []
methods = sorted(set(grouped) | set(skipped))
for method in methods:
ok_rows = grouped.get(method, [])
if not ok_rows:
rows.append([method, "Skipped", "Skipped", "Skipped", "Skipped", "Skipped"])
continue
p_mean = sum(row.precision_mean or 0.0 for row in ok_rows) / len(ok_rows)
r_mean = sum(row.recall_mean or 0.0 for row in ok_rows) / len(ok_rows)
f_mean = sum(row.f1_mean or 0.0 for row in ok_rows) / len(ok_rows)
step_mean = sum(row.avg_steps_mean or 0.0 for row in ok_rows) / len(ok_rows)
quota_mean = sum(row.quota_units_mean or 0.0 for row in ok_rows) / len(ok_rows)
rows.append(
[
method,
f"{p_mean:.4f}",
f"{r_mean:.4f}",
f"{f_mean:.4f}",
f"{step_mean:.2f}",
f"{quota_mean:.4f}",
]
)
return rows
def _collect_skip_reasons(aggregates: list[AggregateBenchmarkResult]) -> list[str]:
"""Collect distinct aggregate skip reasons in stable order."""
reasons: list[str] = []
for aggregate in aggregates:
reason = aggregate.skip_reason
if aggregate.status == "ok" or reason is None or reason in reasons:
continue
reasons.append(reason)
return reasons
def build_readme_benchmark_block(agent_output: BenchmarkRunOutput, report_path: Path) -> str:
    """Compose the generated markdown that goes between the README benchmark markers.

    The block holds a provenance line, the cross-dataset summary table, a
    pointer to the full report (by file name only), and, when any aggregate
    carries a skip reason, a trailing note listing those reasons.
    """
    summary_table = _render_table(
        ["Method", "Precision", "Recall", "F1", "Avg Steps", "Quota Units"],
        _aggregate_across_datasets(agent_output.aggregates),
    )
    pieces = [
        "Generated from `eval/results/agent_comparison.json`.\n\n",
        summary_table,
        "\n\n",
        f"See `{report_path.name}` for per-dataset tables, error bars, and citation-only SOTA rows.",
    ]
    reasons = _collect_skip_reasons(agent_output.aggregates)
    if reasons:
        pieces.append("\n\nSkipped methods in this run: " + "; ".join(reasons))
    return "".join(pieces)
def render_benchmark_report(
    agent_output: BenchmarkRunOutput,
    sota_output: dict[str, object],
) -> str:
    """Render the full markdown benchmark report.

    The report contains, in order: the reproduction command, the run
    configuration (methods, datasets, seeds, quota formula, optional skip
    note), the cross-dataset summary table, one table per dataset, the
    citation-only SOTA reference table, and a methodology paragraph.

    Entries of ``sota_output["rows"]`` that are not dicts are ignored; a
    non-list ``rows`` value is treated as empty. A non-dict ``source`` falls
    back to the title ``"Unknown source"`` and an empty URL.
    """
    metric_headers = ("Method", "Precision", "Recall", "F1", "Avg Steps", "Quota Units")

    # Group aggregates per dataset; dict order keeps first-seen dataset order.
    grouped: dict[str, list[AggregateBenchmarkResult]] = defaultdict(list)
    for entry in agent_output.aggregates:
        grouped[entry.dataset].append(entry)

    sections: list[str] = []
    for dataset_name, entries in grouped.items():
        body: list[list[str]] = []
        for entry in entries:
            body.append(
                [
                    entry.method,
                    _format_metric(entry.precision_mean, entry.precision_std),
                    _format_metric(entry.recall_mean, entry.recall_std),
                    _format_metric(entry.f1_mean, entry.f1_std),
                    _format_metric(entry.avg_steps_mean, entry.avg_steps_std),
                    _format_metric(entry.quota_units_mean, entry.quota_units_std),
                ]
            )
        heading = f"### {dataset_name.title()}\n\n"
        sections.append(heading + _render_table(list(metric_headers), body))

    local_summary = _render_table(
        list(metric_headers),
        _aggregate_across_datasets(agent_output.aggregates),
    )

    # Citation-only rows come from untyped JSON, so narrow before indexing.
    candidate_rows = sota_output.get("rows", [])
    if not isinstance(candidate_rows, list):
        candidate_rows = []
    citation_rows: list[list[str]] = []
    for item in candidate_rows:
        if not isinstance(item, dict):
            continue
        citation_rows.append(
            [
                str(item["method"]),
                str(item["dataset"]),
                f"{float(item['precision']):.3f}",
                f"{float(item['recall']):.3f}",
                f"{float(item['f1']):.3f}",
                str(item["note"]),
            ]
        )

    source = sota_output.get("source", {})
    if isinstance(source, dict):
        source_title = source.get("title", "Unknown source")
        source_url = source.get("url", "")
    else:
        source_title = "Unknown source"
        source_url = ""

    reasons = _collect_skip_reasons(agent_output.aggregates)
    if reasons:
        skip_note = "\nSkipped methods in this reproduced run: " + "; ".join(reasons) + "\n"
    else:
        skip_note = ""

    method_values = agent_output.metadata.get("methods", [])
    dataset_values = agent_output.metadata.get("datasets", [])
    methods = [str(value) for value in method_values] if isinstance(method_values, list) else []
    datasets = [str(value) for value in dataset_values] if isinstance(dataset_values, list) else []
    seeds = str(agent_output.metadata.get("seeds", ""))
    reproduction_command = str(agent_output.metadata.get("reproduction_command", ""))

    method_list = ", ".join(methods)
    dataset_list = ", ".join(datasets)
    citation_table = _render_table(
        ["Method", "Dataset", "Precision", "Recall", "F1", "Note"],
        citation_rows,
    )

    parts = [
        "# Benchmark Report\n\n",
        "## Reproduction\n\n",
        f"`{reproduction_command}`\n\n",
        "## Configuration\n\n",
        f"- Methods: {method_list}\n",
        f"- Datasets: {dataset_list}\n",
        f"- Seeds: {seeds}\n",
        "- Free-tier quota units: `max(llm_calls / 1000, (prompt_tokens + completion_tokens) / 100000)`\n",
        f"{skip_note}\n",
        "## Cross-Dataset Local Results\n\n",
        f"{local_summary}\n\n",
        "## Per-Dataset Local Results\n\n",
        "\n\n".join(sections),
        "\n\n## Citation-Only SOTA Reference\n\n",
        f"Source: [{source_title}]({source_url})\n\n",
        "HoloClean rows are transcribed from BClean Table 4; see [HoloClean 2017](https://www.vldb.org/pvldb/vol10/p1190-rekatsinas.pdf) for the original system description.\n\n",
        citation_table,
        "\n\n## Methodology\n\n",
        "Local rows are reproduced from generated JSON. Citation-only SOTA rows are copied from literature and are not rerun in this repository. Quota units are reported in free-tier fractions rather than dollars.\n",
    ]
    return "".join(parts)
def write_benchmark_outputs(
    *,
    agent_json_path: Path,
    sota_json_path: Path,
    report_path: Path,
    readme_path: Path,
) -> None:
    """Write the benchmark report file and refresh the README benchmark block.

    Both JSON inputs are loaded first, then the report is written to
    ``report_path``, and finally the README at ``readme_path`` is read,
    patched between its benchmark markers, and written back. All files are
    read and written as UTF-8.
    """
    agent = load_agent_output(agent_json_path)
    sota = load_sota_output(sota_json_path)

    # The report is written before the README is touched.
    report_path.write_text(render_benchmark_report(agent, sota), encoding="utf-8")

    current_readme = readme_path.read_text(encoding="utf-8")
    block = build_readme_benchmark_block(agent, report_path)
    readme_path.write_text(
        replace_benchmark_block(current_readme, block),
        encoding="utf-8",
    )