| """Benchmark report rendering and README marker updates.""" |
|
|
| from __future__ import annotations |
|
|
| import json |
| from collections import defaultdict |
| from pathlib import Path |
| from typing import cast |
|
|
| from dataforge.bench.core import AggregateBenchmarkResult, BenchmarkRunOutput |
|
|
|
|
| def _format_metric(mean_value: float | None, std_value: float | None) -> str: |
| """Format a mean/std metric cell for markdown tables.""" |
| if mean_value is None: |
| return "Skipped" |
| if std_value is None: |
| return f"{mean_value:.4f}" |
| return f"{mean_value:.4f} +/- {std_value:.4f}" |
|
|
|
|
| def _render_table(headers: list[str], rows: list[list[str]]) -> str: |
| """Render a simple markdown table.""" |
| lines = [ |
| "| " + " | ".join(headers) + " |", |
| "| " + " | ".join("---" for _ in headers) + " |", |
| ] |
| for row in rows: |
| lines.append("| " + " | ".join(row) + " |") |
| return "\n".join(lines) |
|
|
|
|
| def load_agent_output(path: Path) -> BenchmarkRunOutput: |
| """Load agent comparison JSON output.""" |
| return BenchmarkRunOutput.model_validate(json.loads(path.read_text(encoding="utf-8"))) |
|
|
|
|
| def load_sota_output(path: Path) -> dict[str, object]: |
| """Load citation-only SOTA comparison JSON output.""" |
| raw = json.loads(path.read_text(encoding="utf-8")) |
| if not isinstance(raw, dict): |
| raise ValueError("SOTA comparison JSON must be a top-level object.") |
| return cast(dict[str, object], raw) |
|
|
|
|
| def replace_benchmark_block(readme_text: str, block_text: str) -> str: |
| """Replace the README benchmark marker block idempotently.""" |
| start_marker = "<!-- BENCH:START -->" |
| end_marker = "<!-- BENCH:END -->" |
| if start_marker not in readme_text or end_marker not in readme_text: |
| raise ValueError("README benchmark markers are missing.") |
| start = readme_text.index(start_marker) + len(start_marker) |
| end = readme_text.index(end_marker) |
| return readme_text[:start] + "\n" + block_text.strip() + "\n" + readme_text[end:] |
|
|
|
|
| def _aggregate_across_datasets(aggregates: list[AggregateBenchmarkResult]) -> list[list[str]]: |
| """Build a simple cross-dataset local summary table.""" |
| grouped: dict[str, list[AggregateBenchmarkResult]] = defaultdict(list) |
| skipped: dict[str, str | None] = {} |
| for aggregate in aggregates: |
| if aggregate.status == "ok": |
| grouped[aggregate.method].append(aggregate) |
| else: |
| skipped.setdefault(aggregate.method, aggregate.skip_reason) |
|
|
| rows: list[list[str]] = [] |
| methods = sorted(set(grouped) | set(skipped)) |
| for method in methods: |
| ok_rows = grouped.get(method, []) |
| if not ok_rows: |
| rows.append([method, "Skipped", "Skipped", "Skipped", "Skipped", "Skipped"]) |
| continue |
| p_mean = sum(row.precision_mean or 0.0 for row in ok_rows) / len(ok_rows) |
| r_mean = sum(row.recall_mean or 0.0 for row in ok_rows) / len(ok_rows) |
| f_mean = sum(row.f1_mean or 0.0 for row in ok_rows) / len(ok_rows) |
| step_mean = sum(row.avg_steps_mean or 0.0 for row in ok_rows) / len(ok_rows) |
| quota_mean = sum(row.quota_units_mean or 0.0 for row in ok_rows) / len(ok_rows) |
| rows.append( |
| [ |
| method, |
| f"{p_mean:.4f}", |
| f"{r_mean:.4f}", |
| f"{f_mean:.4f}", |
| f"{step_mean:.2f}", |
| f"{quota_mean:.4f}", |
| ] |
| ) |
| return rows |
|
|
|
|
| def _collect_skip_reasons(aggregates: list[AggregateBenchmarkResult]) -> list[str]: |
| """Collect distinct aggregate skip reasons in stable order.""" |
| reasons: list[str] = [] |
| for aggregate in aggregates: |
| reason = aggregate.skip_reason |
| if aggregate.status == "ok" or reason is None or reason in reasons: |
| continue |
| reasons.append(reason) |
| return reasons |
|
|
|
|
| def build_readme_benchmark_block(agent_output: BenchmarkRunOutput, report_path: Path) -> str: |
| """Build the generated README benchmark summary block.""" |
| rows = _aggregate_across_datasets(agent_output.aggregates) |
| table = _render_table( |
| ["Method", "Precision", "Recall", "F1", "Avg Steps", "Quota Units"], |
| rows, |
| ) |
| skip_reasons = _collect_skip_reasons(agent_output.aggregates) |
| skip_note = "" |
| if skip_reasons: |
| skip_note = ( |
| "\n\nSkipped methods in this run: " + "; ".join(skip_reasons) |
| ) |
| return ( |
| "Generated from `eval/results/agent_comparison.json`.\n\n" |
| f"{table}\n\n" |
| f"See `{report_path.name}` for per-dataset tables, error bars, and citation-only SOTA rows." |
| f"{skip_note}" |
| ) |
|
|
|
|
| def render_benchmark_report( |
| agent_output: BenchmarkRunOutput, |
| sota_output: dict[str, object], |
| ) -> str: |
| """Render the full markdown benchmark report.""" |
| per_dataset_sections: list[str] = [] |
| by_dataset: dict[str, list[AggregateBenchmarkResult]] = defaultdict(list) |
| for aggregate in agent_output.aggregates: |
| by_dataset[aggregate.dataset].append(aggregate) |
|
|
| for dataset, rows in by_dataset.items(): |
| table_rows = [ |
| [ |
| row.method, |
| _format_metric(row.precision_mean, row.precision_std), |
| _format_metric(row.recall_mean, row.recall_std), |
| _format_metric(row.f1_mean, row.f1_std), |
| _format_metric(row.avg_steps_mean, row.avg_steps_std), |
| _format_metric(row.quota_units_mean, row.quota_units_std), |
| ] |
| for row in rows |
| ] |
| per_dataset_sections.append( |
| f"### {dataset.title()}\n\n" |
| + _render_table( |
| ["Method", "Precision", "Recall", "F1", "Avg Steps", "Quota Units"], |
| table_rows, |
| ) |
| ) |
|
|
| local_summary = _render_table( |
| ["Method", "Precision", "Recall", "F1", "Avg Steps", "Quota Units"], |
| _aggregate_across_datasets(agent_output.aggregates), |
| ) |
|
|
| raw_rows = sota_output.get("rows", []) |
| if not isinstance(raw_rows, list): |
| raw_rows = [] |
| sota_rows = [ |
| [ |
| str(row["method"]), |
| str(row["dataset"]), |
| f"{float(row['precision']):.3f}", |
| f"{float(row['recall']):.3f}", |
| f"{float(row['f1']):.3f}", |
| str(row["note"]), |
| ] |
| for row in raw_rows |
| if isinstance(row, dict) |
| ] |
| source = sota_output.get("source", {}) |
| source_title = ( |
| source.get("title", "Unknown source") if isinstance(source, dict) else "Unknown source" |
| ) |
| source_url = source.get("url", "") if isinstance(source, dict) else "" |
| skip_reasons = _collect_skip_reasons(agent_output.aggregates) |
| skip_note = "" |
| if skip_reasons: |
| skip_note = ( |
| "\nSkipped methods in this reproduced run: " |
| + "; ".join(skip_reasons) |
| + "\n" |
| ) |
|
|
| method_values = agent_output.metadata.get("methods", []) |
| dataset_values = agent_output.metadata.get("datasets", []) |
| methods = [str(method) for method in method_values] if isinstance(method_values, list) else [] |
| datasets = ( |
| [str(dataset) for dataset in dataset_values] if isinstance(dataset_values, list) else [] |
| ) |
| seeds = str(agent_output.metadata.get("seeds", "")) |
| reproduction_command = str(agent_output.metadata.get("reproduction_command", "")) |
|
|
| return ( |
| "# Benchmark Report\n\n" |
| "## Reproduction\n\n" |
| f"`{reproduction_command}`\n\n" |
| "## Configuration\n\n" |
| f"- Methods: {', '.join(methods)}\n" |
| f"- Datasets: {', '.join(datasets)}\n" |
| f"- Seeds: {seeds}\n" |
| "- Free-tier quota units: `max(llm_calls / 1000, (prompt_tokens + completion_tokens) / 100000)`\n" |
| f"{skip_note}\n" |
| "## Cross-Dataset Local Results\n\n" |
| f"{local_summary}\n\n" |
| "## Per-Dataset Local Results\n\n" |
| + "\n\n".join(per_dataset_sections) |
| + "\n\n## Citation-Only SOTA Reference\n\n" |
| + f"Source: [{source_title}]({source_url})\n\n" |
| + "HoloClean rows are transcribed from BClean Table 4; see [HoloClean 2017](https://www.vldb.org/pvldb/vol10/p1190-rekatsinas.pdf) for the original system description.\n\n" |
| + _render_table( |
| ["Method", "Dataset", "Precision", "Recall", "F1", "Note"], |
| sota_rows, |
| ) |
| + "\n\n## Methodology\n\n" |
| + "Local rows are reproduced from generated JSON. Citation-only SOTA rows are copied from literature and are not rerun in this repository. Quota units are reported in free-tier fractions rather than dollars.\n" |
| ) |
|
|
|
|
| def write_benchmark_outputs( |
| *, |
| agent_json_path: Path, |
| sota_json_path: Path, |
| report_path: Path, |
| readme_path: Path, |
| ) -> None: |
| """Generate the benchmark report and patch the README block.""" |
| agent_output = load_agent_output(agent_json_path) |
| sota_output = load_sota_output(sota_json_path) |
| report_text = render_benchmark_report(agent_output, sota_output) |
| report_path.write_text(report_text, encoding="utf-8") |
|
|
| readme_text = readme_path.read_text(encoding="utf-8") |
| benchmark_block = build_readme_benchmark_block(agent_output, report_path) |
| updated_readme = replace_benchmark_block(readme_text, benchmark_block) |
| readme_path.write_text(updated_readme, encoding="utf-8") |
|
|