"""Benchmark report rendering and README marker updates.""" from __future__ import annotations import json from collections import defaultdict from pathlib import Path from typing import cast from dataforge.bench.core import AggregateBenchmarkResult, BenchmarkRunOutput def _format_metric(mean_value: float | None, std_value: float | None) -> str: """Format a mean/std metric cell for markdown tables.""" if mean_value is None: return "Skipped" if std_value is None: return f"{mean_value:.4f}" return f"{mean_value:.4f} +/- {std_value:.4f}" def _render_table(headers: list[str], rows: list[list[str]]) -> str: """Render a simple markdown table.""" lines = [ "| " + " | ".join(headers) + " |", "| " + " | ".join("---" for _ in headers) + " |", ] for row in rows: lines.append("| " + " | ".join(row) + " |") return "\n".join(lines) def load_agent_output(path: Path) -> BenchmarkRunOutput: """Load agent comparison JSON output.""" return BenchmarkRunOutput.model_validate(json.loads(path.read_text(encoding="utf-8"))) def load_sota_output(path: Path) -> dict[str, object]: """Load citation-only SOTA comparison JSON output.""" raw = json.loads(path.read_text(encoding="utf-8")) if not isinstance(raw, dict): raise ValueError("SOTA comparison JSON must be a top-level object.") return cast(dict[str, object], raw) def replace_benchmark_block(readme_text: str, block_text: str) -> str: """Replace the README benchmark marker block idempotently.""" start_marker = "" end_marker = "" if start_marker not in readme_text or end_marker not in readme_text: raise ValueError("README benchmark markers are missing.") start = readme_text.index(start_marker) + len(start_marker) end = readme_text.index(end_marker) return readme_text[:start] + "\n" + block_text.strip() + "\n" + readme_text[end:] def _aggregate_across_datasets(aggregates: list[AggregateBenchmarkResult]) -> list[list[str]]: """Build a simple cross-dataset local summary table.""" grouped: dict[str, list[AggregateBenchmarkResult]] = defaultdict(list) skipped: dict[str, str | None] = {} for aggregate in aggregates: if aggregate.status == "ok": grouped[aggregate.method].append(aggregate) else: skipped.setdefault(aggregate.method, aggregate.skip_reason) rows: list[list[str]] = [] methods = sorted(set(grouped) | set(skipped)) for method in methods: ok_rows = grouped.get(method, []) if not ok_rows: rows.append([method, "Skipped", "Skipped", "Skipped", "Skipped", "Skipped"]) continue p_mean = sum(row.precision_mean or 0.0 for row in ok_rows) / len(ok_rows) r_mean = sum(row.recall_mean or 0.0 for row in ok_rows) / len(ok_rows) f_mean = sum(row.f1_mean or 0.0 for row in ok_rows) / len(ok_rows) step_mean = sum(row.avg_steps_mean or 0.0 for row in ok_rows) / len(ok_rows) quota_mean = sum(row.quota_units_mean or 0.0 for row in ok_rows) / len(ok_rows) rows.append( [ method, f"{p_mean:.4f}", f"{r_mean:.4f}", f"{f_mean:.4f}", f"{step_mean:.2f}", f"{quota_mean:.4f}", ] ) return rows def _collect_skip_reasons(aggregates: list[AggregateBenchmarkResult]) -> list[str]: """Collect distinct aggregate skip reasons in stable order.""" reasons: list[str] = [] for aggregate in aggregates: reason = aggregate.skip_reason if aggregate.status == "ok" or reason is None or reason in reasons: continue reasons.append(reason) return reasons def build_readme_benchmark_block(agent_output: BenchmarkRunOutput, report_path: Path) -> str: """Build the generated README benchmark summary block.""" rows = _aggregate_across_datasets(agent_output.aggregates) table = _render_table( ["Method", "Precision", "Recall", "F1", "Avg Steps", "Quota Units"], rows, ) skip_reasons = _collect_skip_reasons(agent_output.aggregates) skip_note = "" if skip_reasons: skip_note = ( "\n\nSkipped methods in this run: " + "; ".join(skip_reasons) ) return ( "Generated from `eval/results/agent_comparison.json`.\n\n" f"{table}\n\n" f"See `{report_path.name}` for per-dataset tables, error bars, and citation-only SOTA rows." f"{skip_note}" ) def render_benchmark_report( agent_output: BenchmarkRunOutput, sota_output: dict[str, object], ) -> str: """Render the full markdown benchmark report.""" per_dataset_sections: list[str] = [] by_dataset: dict[str, list[AggregateBenchmarkResult]] = defaultdict(list) for aggregate in agent_output.aggregates: by_dataset[aggregate.dataset].append(aggregate) for dataset, rows in by_dataset.items(): table_rows = [ [ row.method, _format_metric(row.precision_mean, row.precision_std), _format_metric(row.recall_mean, row.recall_std), _format_metric(row.f1_mean, row.f1_std), _format_metric(row.avg_steps_mean, row.avg_steps_std), _format_metric(row.quota_units_mean, row.quota_units_std), ] for row in rows ] per_dataset_sections.append( f"### {dataset.title()}\n\n" + _render_table( ["Method", "Precision", "Recall", "F1", "Avg Steps", "Quota Units"], table_rows, ) ) local_summary = _render_table( ["Method", "Precision", "Recall", "F1", "Avg Steps", "Quota Units"], _aggregate_across_datasets(agent_output.aggregates), ) raw_rows = sota_output.get("rows", []) if not isinstance(raw_rows, list): raw_rows = [] sota_rows = [ [ str(row["method"]), str(row["dataset"]), f"{float(row['precision']):.3f}", f"{float(row['recall']):.3f}", f"{float(row['f1']):.3f}", str(row["note"]), ] for row in raw_rows if isinstance(row, dict) ] source = sota_output.get("source", {}) source_title = ( source.get("title", "Unknown source") if isinstance(source, dict) else "Unknown source" ) source_url = source.get("url", "") if isinstance(source, dict) else "" skip_reasons = _collect_skip_reasons(agent_output.aggregates) skip_note = "" if skip_reasons: skip_note = ( "\nSkipped methods in this reproduced run: " + "; ".join(skip_reasons) + "\n" ) method_values = agent_output.metadata.get("methods", []) dataset_values = agent_output.metadata.get("datasets", []) methods = [str(method) for method in method_values] if isinstance(method_values, list) else [] datasets = ( [str(dataset) for dataset in dataset_values] if isinstance(dataset_values, list) else [] ) seeds = str(agent_output.metadata.get("seeds", "")) reproduction_command = str(agent_output.metadata.get("reproduction_command", "")) return ( "# Benchmark Report\n\n" "## Reproduction\n\n" f"`{reproduction_command}`\n\n" "## Configuration\n\n" f"- Methods: {', '.join(methods)}\n" f"- Datasets: {', '.join(datasets)}\n" f"- Seeds: {seeds}\n" "- Free-tier quota units: `max(llm_calls / 1000, (prompt_tokens + completion_tokens) / 100000)`\n" f"{skip_note}\n" "## Cross-Dataset Local Results\n\n" f"{local_summary}\n\n" "## Per-Dataset Local Results\n\n" + "\n\n".join(per_dataset_sections) + "\n\n## Citation-Only SOTA Reference\n\n" + f"Source: [{source_title}]({source_url})\n\n" + "HoloClean rows are transcribed from BClean Table 4; see [HoloClean 2017](https://www.vldb.org/pvldb/vol10/p1190-rekatsinas.pdf) for the original system description.\n\n" + _render_table( ["Method", "Dataset", "Precision", "Recall", "F1", "Note"], sota_rows, ) + "\n\n## Methodology\n\n" + "Local rows are reproduced from generated JSON. Citation-only SOTA rows are copied from literature and are not rerun in this repository. Quota units are reported in free-tier fractions rather than dollars.\n" ) def write_benchmark_outputs( *, agent_json_path: Path, sota_json_path: Path, report_path: Path, readme_path: Path, ) -> None: """Generate the benchmark report and patch the README block.""" agent_output = load_agent_output(agent_json_path) sota_output = load_sota_output(sota_json_path) report_text = render_benchmark_report(agent_output, sota_output) report_path.write_text(report_text, encoding="utf-8") readme_text = readme_path.read_text(encoding="utf-8") benchmark_block = build_readme_benchmark_block(agent_output, report_path) updated_readme = replace_benchmark_block(readme_text, benchmark_block) readme_path.write_text(updated_readme, encoding="utf-8")