File size: 9,745 Bytes
5143557 eed1cab 5143557 eed1cab 5143557 eed1cab 5143557 eed1cab 5143557 eed1cab 5143557 eed1cab 5143557 eed1cab 5143557 eed1cab 5143557 eed1cab 5143557 eed1cab 5143557 eed1cab 5143557 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 | """Benchmark report rendering and README marker updates."""
from __future__ import annotations
import json
from collections import defaultdict
from pathlib import Path
from typing import cast
from dataforge.bench.core import AggregateBenchmarkResult, BenchmarkRunOutput
def _format_metric(mean_value: float | None, std_value: float | None) -> str:
    """Render one metric as a markdown table cell.

    Returns ``"Skipped"`` when there is no mean, the mean alone (four decimal
    places) when there is no standard deviation, and ``"<mean> +/- <std>"``
    otherwise.
    """
    if mean_value is None:
        return "Skipped"
    cell = format(mean_value, ".4f")
    if std_value is not None:
        cell += f" +/- {std_value:.4f}"
    return cell
def _render_table(headers: list[str], rows: list[list[str]]) -> str:
    """Render a simple GitHub-flavored markdown table.

    Args:
        headers: Column titles, one per column.
        rows: Table body; each inner list holds the cells of one row.

    Returns:
        The header line, a ``---`` separator line, and one line per row,
        joined with newlines (no trailing newline).

    Cell and header text is sanitized so it cannot break the table layout:
    literal pipes are backslash-escaped and line breaks collapse to spaces.
    """

    def _safe_cell(text: str) -> str:
        # An unescaped "|" would open an extra column and a line break would
        # terminate the row early, so neutralize both before joining.
        return " ".join(text.splitlines()).replace("|", "\\|")

    def _table_line(cells: list[str]) -> str:
        return "| " + " | ".join(cells) + " |"

    lines = [
        _table_line([_safe_cell(header) for header in headers]),
        _table_line(["---" for _ in headers]),
    ]
    lines.extend(_table_line([_safe_cell(cell) for cell in row]) for row in rows)
    return "\n".join(lines)
def load_agent_output(path: Path) -> BenchmarkRunOutput:
    """Read the agent comparison JSON at *path* as a ``BenchmarkRunOutput``.

    The file is decoded as UTF-8, parsed as JSON, and the parsed payload is
    passed to ``BenchmarkRunOutput.model_validate``.
    """
    with path.open(encoding="utf-8") as handle:
        payload = json.load(handle)
    return BenchmarkRunOutput.model_validate(payload)
def load_sota_output(path: Path) -> dict[str, object]:
    """Read the citation-only SOTA comparison JSON at *path*.

    Returns:
        The decoded top-level JSON object.

    Raises:
        ValueError: If the decoded document is not a JSON object.
    """
    with path.open(encoding="utf-8") as handle:
        document = json.load(handle)
    if isinstance(document, dict):
        return cast(dict[str, object], document)
    raise ValueError("SOTA comparison JSON must be a top-level object.")
def replace_benchmark_block(readme_text: str, block_text: str) -> str:
    """Replace the README benchmark marker block idempotently.

    Everything between the first ``<!-- BENCH:START -->`` and the first
    ``<!-- BENCH:END -->`` that follows it is replaced by *block_text*
    (stripped and surrounded by single newlines). Both markers are kept.

    Args:
        readme_text: Full README contents.
        block_text: Generated block to place between the markers.

    Returns:
        The README text with the marker block replaced.

    Raises:
        ValueError: If either marker is missing, or no end marker appears
            after the start marker.
    """
    start_marker = "<!-- BENCH:START -->"
    end_marker = "<!-- BENCH:END -->"
    if start_marker not in readme_text or end_marker not in readme_text:
        raise ValueError("README benchmark markers are missing.")
    start = readme_text.index(start_marker) + len(start_marker)
    # Look for the end marker only after the start marker. A global search
    # could match an earlier occurrence, and slicing with end < start would
    # duplicate README content instead of replacing the block.
    end = readme_text.find(end_marker, start)
    if end == -1:
        raise ValueError("README benchmark end marker must come after the start marker.")
    return readme_text[:start] + "\n" + block_text.strip() + "\n" + readme_text[end:]
def _aggregate_across_datasets(aggregates: list[AggregateBenchmarkResult]) -> list[list[str]]:
    """Summarise each method across datasets as markdown table rows.

    Every row is ``[method, precision, recall, f1, avg steps, quota units,
    gpu hours]`` and rows are sorted by method name. Each metric is the
    unweighted mean over the method's aggregates whose status is ``"ok"``; a
    metric that is ``None`` on an aggregate counts as ``0.0``. A method with no
    ``"ok"`` aggregate gets ``"Skipped"`` in every metric column.
    """
    # (attribute to average, format spec for the resulting cell)
    metric_specs = (
        ("precision_mean", ".4f"),
        ("recall_mean", ".4f"),
        ("f1_mean", ".4f"),
        ("avg_steps_mean", ".2f"),
        ("quota_units_mean", ".4f"),
        ("gpu_hours_mean", ".4f"),
    )

    ok_by_method: dict[str, list[AggregateBenchmarkResult]] = {}
    seen_methods: set[str] = set()
    for entry in aggregates:
        seen_methods.add(entry.method)
        if entry.status == "ok":
            ok_by_method.setdefault(entry.method, []).append(entry)

    table_rows: list[list[str]] = []
    for name in sorted(seen_methods):
        usable = ok_by_method.get(name)
        if not usable:
            table_rows.append([name] + ["Skipped"] * len(metric_specs))
            continue
        count = len(usable)
        cells = [name]
        for attribute, spec in metric_specs:
            total = sum(getattr(item, attribute) or 0.0 for item in usable)
            cells.append(format(total / count, spec))
        table_rows.append(cells)
    return table_rows
def _collect_skip_reasons(aggregates: list[AggregateBenchmarkResult]) -> list[str]:
    """Return each distinct skip reason once, in first-seen order.

    Aggregates whose status is ``"ok"`` and aggregates without a skip reason
    contribute nothing.
    """
    # Dict keys keep insertion order, giving an ordered de-duplication.
    ordered: dict[str, None] = {}
    for entry in aggregates:
        reason = entry.skip_reason
        if entry.status != "ok" and reason is not None:
            ordered.setdefault(reason, None)
    return list(ordered)
def build_readme_benchmark_block(agent_output: BenchmarkRunOutput, report_path: Path) -> str:
    """Build the generated summary that goes between the README benchmark markers.

    The block consists of a provenance line, the cross-dataset summary table,
    a pointer to the report file (by file name only), and, when any aggregate
    carries a skip reason, a closing line listing those reasons. Paragraphs are
    separated by blank lines.
    """
    columns = ["Method", "Precision", "Recall", "F1", "Avg Steps", "Quota Units", "GPU Hours"]
    summary_table = _render_table(columns, _aggregate_across_datasets(agent_output.aggregates))
    paragraphs = [
        "Generated from `eval/results/agent_comparison.json`.",
        summary_table,
        f"See `{report_path.name}` for per-dataset tables, error bars, and citation-only SOTA rows.",
    ]
    reasons = _collect_skip_reasons(agent_output.aggregates)
    if reasons:
        paragraphs.append("Skipped methods in this run: " + "; ".join(reasons))
    return "\n\n".join(paragraphs)
def render_benchmark_report(
    agent_output: BenchmarkRunOutput,
    sota_output: dict[str, object],
) -> str:
    """Render the complete markdown benchmark report.

    Sections, in order: reproduction command, run configuration, cross-dataset
    local results, one table per dataset (in first-seen dataset order), the
    citation-only SOTA reference table, and a methodology note.

    Args:
        agent_output: Local benchmark run; its ``aggregates`` and ``metadata``
            are read.
        sota_output: Decoded SOTA JSON. ``"rows"`` is used only when it is a
            list, and only its dict entries become table rows; ``"source"`` is
            used only when it is a dict, otherwise placeholder title/URL apply.

    Raises:
        KeyError: If a SOTA row dict lacks one of ``method``, ``dataset``,
            ``precision``, ``recall``, ``f1`` or ``note``.
    """
    metric_columns = ["Method", "Precision", "Recall", "F1", "Avg Steps", "Quota Units", "GPU Hours"]
    metric_stems = ("precision", "recall", "f1", "avg_steps", "quota_units", "gpu_hours")

    # --- Per-dataset tables -------------------------------------------------
    grouped: dict[str, list[AggregateBenchmarkResult]] = {}
    for entry in agent_output.aggregates:
        grouped.setdefault(entry.dataset, []).append(entry)

    dataset_sections: list[str] = []
    for dataset_name, entries in grouped.items():
        body: list[list[str]] = []
        for entry in entries:
            cells = [entry.method]
            for stem in metric_stems:
                cells.append(
                    _format_metric(getattr(entry, f"{stem}_mean"), getattr(entry, f"{stem}_std"))
                )
            body.append(cells)
        dataset_sections.append(
            f"### {dataset_name.title()}\n\n" + _render_table(metric_columns, body)
        )

    # --- Cross-dataset summary ----------------------------------------------
    local_summary = _render_table(
        metric_columns,
        _aggregate_across_datasets(agent_output.aggregates),
    )

    # --- Citation-only SOTA rows --------------------------------------------
    raw_rows = sota_output.get("rows", [])
    sota_rows: list[list[str]] = []
    if isinstance(raw_rows, list):
        for item in raw_rows:
            if not isinstance(item, dict):
                continue
            sota_rows.append(
                [
                    str(item["method"]),
                    str(item["dataset"]),
                    f"{float(item['precision']):.3f}",
                    f"{float(item['recall']):.3f}",
                    f"{float(item['f1']):.3f}",
                    str(item["note"]),
                ]
            )

    source = sota_output.get("source", {})
    if isinstance(source, dict):
        source_title = source.get("title", "Unknown source")
        source_url = source.get("url", "")
    else:
        source_title = "Unknown source"
        source_url = ""

    # --- Skip note and run configuration ------------------------------------
    skip_reasons = _collect_skip_reasons(agent_output.aggregates)
    if skip_reasons:
        skip_note = "\nSkipped methods in this reproduced run: " + "; ".join(skip_reasons) + "\n"
    else:
        skip_note = ""

    metadata = agent_output.metadata
    method_values = metadata.get("methods", [])
    dataset_values = metadata.get("datasets", [])
    # Non-list metadata values are rendered as an empty list.
    methods_text = (
        ", ".join(str(value) for value in method_values) if isinstance(method_values, list) else ""
    )
    datasets_text = (
        ", ".join(str(value) for value in dataset_values)
        if isinstance(dataset_values, list)
        else ""
    )
    seeds = str(metadata.get("seeds", ""))
    reproduction_command = str(metadata.get("reproduction_command", ""))

    sota_table = _render_table(
        ["Method", "Dataset", "Precision", "Recall", "F1", "Note"],
        sota_rows,
    )

    # --- Final assembly -----------------------------------------------------
    parts = [
        "# Benchmark Report\n\n",
        "## Reproduction\n\n",
        f"`{reproduction_command}`\n\n",
        "## Configuration\n\n",
        f"- Methods: {methods_text}\n",
        f"- Datasets: {datasets_text}\n",
        f"- Seeds: {seeds}\n",
        "- Free-tier quota units: `max(llm_calls / 1000, (prompt_tokens + completion_tokens) / 100000)`\n",
        "- GRPO compute cost is reported as free-tier GPU-hours, not dollars.\n",
        f"{skip_note}\n",
        "## Cross-Dataset Local Results\n\n",
        f"{local_summary}\n\n",
        "## Per-Dataset Local Results\n\n",
        "\n\n".join(dataset_sections),
        "\n\n## Citation-Only SOTA Reference\n\n",
        f"Source: [{source_title}]({source_url})\n\n",
        "HoloClean rows are transcribed from BClean Table 4; see [HoloClean 2017](https://www.vldb.org/pvldb/vol10/p1190-rekatsinas.pdf) for the original system description.\n\n",
        sota_table,
        "\n\n## Methodology\n\n",
        "Local rows are reproduced from generated JSON. Citation-only SOTA rows are copied from literature and are not rerun in this repository. LLM quota units are free-tier fractions; GRPO compute cost is GPU-hours, not dollars.\n",
    ]
    return "".join(parts)
def write_benchmark_outputs(
    *,
    agent_json_path: Path,
    sota_json_path: Path,
    report_path: Path,
    readme_path: Path,
) -> None:
    """Generate the benchmark report and patch the README block.

    All inputs are read and both outputs are rendered in memory before any
    file is written, so a failure while loading, rendering, or locating the
    README markers leaves the existing report and README untouched.

    Args:
        agent_json_path: Agent comparison JSON to load.
        sota_json_path: Citation-only SOTA comparison JSON to load.
        report_path: Destination of the rendered markdown report.
        readme_path: README whose benchmark marker block is rewritten in place.

    Raises:
        ValueError: If the SOTA JSON is not a top-level object or the README
            benchmark markers cannot be located.
    """
    agent_output = load_agent_output(agent_json_path)
    sota_output = load_sota_output(sota_json_path)
    report_text = render_benchmark_report(agent_output, sota_output)

    readme_text = readme_path.read_text(encoding="utf-8")
    benchmark_block = build_readme_benchmark_block(agent_output, report_path)
    updated_readme = replace_benchmark_block(readme_text, benchmark_block)

    # Write only after every step that can raise has succeeded; writing the
    # report first could leave a fresh report next to a stale README.
    report_path.write_text(report_text, encoding="utf-8")
    readme_path.write_text(updated_readme, encoding="utf-8")
|