Spaces:

Praneshrajan15
/

dataforge-playground

Running

File size: 9,745 Bytes

"""Benchmark report rendering and README marker updates."""

from __future__ import annotations

import json
from collections import defaultdict
from pathlib import Path
from typing import cast

from dataforge.bench.core import AggregateBenchmarkResult, BenchmarkRunOutput


def _format_metric(mean_value: float | None, std_value: float | None) -> str:
    """Return the markdown cell text for a mean/std metric pair.

    A missing mean yields ``"Skipped"``. Otherwise the mean is shown with
    four decimals, followed by ``" +/- <std>"`` when a std is available.
    """
    if mean_value is None:
        return "Skipped"
    cell = format(mean_value, ".4f")
    if std_value is not None:
        # Only append the spread when it was actually measured.
        cell = f"{cell} +/- {std_value:.4f}"
    return cell


def _render_table(headers: list[str], rows: list[list[str]]) -> str:
    """Build a markdown table from a header row and body rows.

    The output is the header line, a ``---`` separator with one cell per
    header, and one line per row, joined by newlines (no trailing newline).
    """

    def as_line(cells: list[str]) -> str:
        # Every table line shares the same "| a | b |" framing.
        return "| " + " | ".join(cells) + " |"

    separator = as_line(["---"] * len(headers))
    return "\n".join([as_line(headers), separator, *(as_line(cells) for cells in rows)])


def load_agent_output(path: Path) -> BenchmarkRunOutput:
    """Read the agent comparison JSON at ``path`` and validate it.

    The file is decoded as UTF-8, parsed as JSON, and handed to
    ``BenchmarkRunOutput.model_validate``.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    return BenchmarkRunOutput.model_validate(payload)


def load_sota_output(path: Path) -> dict[str, object]:
    """Read the citation-only SOTA comparison JSON at ``path``.

    Raises:
        ValueError: If the parsed JSON is not a top-level object.
    """
    parsed = json.loads(path.read_text(encoding="utf-8"))
    if isinstance(parsed, dict):
        return cast(dict[str, object], parsed)
    raise ValueError("SOTA comparison JSON must be a top-level object.")


def replace_benchmark_block(readme_text: str, block_text: str) -> str:
    """Replace the README benchmark marker block idempotently.

    Everything between ``<!-- BENCH:START -->`` and the first
    ``<!-- BENCH:END -->`` that follows it is replaced by ``block_text``
    (stripped and surrounded by single newlines). The markers themselves are
    kept, so applying the function again with the same block is a no-op.

    Args:
        readme_text: Full README contents.
        block_text: Generated markdown to place between the markers.

    Returns:
        The README text with the marker block replaced.

    Raises:
        ValueError: If either marker is missing, or if no end marker appears
            after the start marker.
    """
    start_marker = "<!-- BENCH:START -->"
    end_marker = "<!-- BENCH:END -->"

    marker_pos = readme_text.find(start_marker)
    if marker_pos == -1 or end_marker not in readme_text:
        raise ValueError("README benchmark markers are missing.")
    start = marker_pos + len(start_marker)

    # Search for the end marker only after the start marker: an unanchored
    # search can return an earlier end marker, which would make the slices
    # below overlap and duplicate README content.
    end = readme_text.find(end_marker, start)
    if end == -1:
        raise ValueError("README benchmark end marker must follow the start marker.")

    return readme_text[:start] + "\n" + block_text.strip() + "\n" + readme_text[end:]


def _aggregate_across_datasets(aggregates: list[AggregateBenchmarkResult]) -> list[list[str]]:
    """Build the rows of the cross-dataset local summary table.

    Aggregates are grouped by method. A method with at least one ``"ok"``
    aggregate gets the plain average of each metric over its ok aggregates
    (a missing metric counts as ``0.0``). A method with none gets a row of
    ``"Skipped"`` cells. Rows are ordered by method name.
    """
    # (attribute, format spec) for each metric column, in table order.
    metric_columns = (
        ("precision_mean", ".4f"),
        ("recall_mean", ".4f"),
        ("f1_mean", ".4f"),
        ("avg_steps_mean", ".2f"),
        ("quota_units_mean", ".4f"),
        ("gpu_hours_mean", ".4f"),
    )

    ok_by_method: dict[str, list[AggregateBenchmarkResult]] = defaultdict(list)
    non_ok_methods: set[str] = set()
    for entry in aggregates:
        if entry.status == "ok":
            ok_by_method[entry.method].append(entry)
        else:
            non_ok_methods.add(entry.method)

    table: list[list[str]] = []
    for method in sorted(set(ok_by_method) | non_ok_methods):
        entries = ok_by_method.get(method)
        if not entries:
            table.append([method] + ["Skipped"] * len(metric_columns))
            continue
        count = len(entries)
        cells = [method]
        for attribute, spec in metric_columns:
            average = sum(getattr(entry, attribute) or 0.0 for entry in entries) / count
            cells.append(format(average, spec))
        table.append(cells)
    return table


def _collect_skip_reasons(aggregates: list[AggregateBenchmarkResult]) -> list[str]:
    """Return the distinct skip reasons of non-ok aggregates, first-seen order.

    Aggregates whose status is ``"ok"`` or whose ``skip_reason`` is ``None``
    contribute nothing.
    """
    # dict keys de-duplicate while preserving insertion order.
    ordered_unique = dict.fromkeys(
        entry.skip_reason
        for entry in aggregates
        if entry.status != "ok" and entry.skip_reason is not None
    )
    return list(ordered_unique)


def build_readme_benchmark_block(agent_output: BenchmarkRunOutput, report_path: Path) -> str:
    """Build the generated README benchmark summary block.

    The block holds a provenance line, the cross-dataset summary table, a
    pointer to the full report (by file name only), and, when any aggregate
    carries a skip reason, a trailing note listing those reasons.
    """
    aggregates = agent_output.aggregates
    summary_table = _render_table(
        ["Method", "Precision", "Recall", "F1", "Avg Steps", "Quota Units", "GPU Hours"],
        _aggregate_across_datasets(aggregates),
    )
    parts = [
        "Generated from `eval/results/agent_comparison.json`.\n\n",
        f"{summary_table}\n\n",
        f"See `{report_path.name}` for per-dataset tables, error bars, and citation-only SOTA rows.",
    ]
    reasons = _collect_skip_reasons(aggregates)
    if reasons:
        parts.append("\n\nSkipped methods in this run: " + "; ".join(reasons))
    return "".join(parts)


def render_benchmark_report(
    agent_output: BenchmarkRunOutput,
    sota_output: dict[str, object],
) -> str:
    """Render the full markdown benchmark report.

    Sections, in order: reproduction command, configuration, cross-dataset
    local results, per-dataset local results, citation-only SOTA reference,
    and methodology.

    Args:
        agent_output: Local benchmark run. Its ``aggregates`` feed the local
            tables; its ``metadata`` mapping is read for ``methods``,
            ``datasets``, ``seeds`` and ``reproduction_command``.
        sota_output: Parsed SOTA JSON. The ``rows`` and ``source`` keys are
            read; values of an unexpected type are treated as absent.

    Returns:
        The report as one markdown string.
    """
    local_headers = ["Method", "Precision", "Recall", "F1", "Avg Steps", "Quota Units", "GPU Hours"]
    aggregates = agent_output.aggregates
    metadata = agent_output.metadata

    def stringify_list(value: object) -> list[str]:
        # Metadata values that are not lists render as an empty listing.
        if not isinstance(value, list):
            return []
        return [str(item) for item in value]

    # Bucket aggregates per dataset; dict insertion order keeps first-seen order.
    per_dataset: dict[str, list[AggregateBenchmarkResult]] = {}
    for entry in aggregates:
        per_dataset.setdefault(entry.dataset, []).append(entry)

    dataset_sections: list[str] = []
    for dataset_name, entries in per_dataset.items():
        body: list[list[str]] = []
        for entry in entries:
            body.append(
                [
                    entry.method,
                    _format_metric(entry.precision_mean, entry.precision_std),
                    _format_metric(entry.recall_mean, entry.recall_std),
                    _format_metric(entry.f1_mean, entry.f1_std),
                    _format_metric(entry.avg_steps_mean, entry.avg_steps_std),
                    _format_metric(entry.quota_units_mean, entry.quota_units_std),
                    _format_metric(entry.gpu_hours_mean, entry.gpu_hours_std),
                ]
            )
        dataset_sections.append(
            f"### {dataset_name.title()}\n\n" + _render_table(local_headers, body)
        )

    cross_dataset_table = _render_table(local_headers, _aggregate_across_datasets(aggregates))

    # Citation-only rows: anything that is not a dict inside a list is ignored.
    reference_rows: list[list[str]] = []
    raw_reference_rows = sota_output.get("rows", [])
    if isinstance(raw_reference_rows, list):
        for item in raw_reference_rows:
            if not isinstance(item, dict):
                continue
            reference_rows.append(
                [
                    str(item["method"]),
                    str(item["dataset"]),
                    format(float(item["precision"]), ".3f"),
                    format(float(item["recall"]), ".3f"),
                    format(float(item["f1"]), ".3f"),
                    str(item["note"]),
                ]
            )

    source = sota_output.get("source", {})
    if isinstance(source, dict):
        source_title = source.get("title", "Unknown source")
        source_url = source.get("url", "")
    else:
        source_title = "Unknown source"
        source_url = ""

    reasons = _collect_skip_reasons(aggregates)
    skip_note = ""
    if reasons:
        skip_note = "\nSkipped methods in this reproduced run: " + "; ".join(reasons) + "\n"

    methods = stringify_list(metadata.get("methods", []))
    datasets = stringify_list(metadata.get("datasets", []))
    seeds = str(metadata.get("seeds", ""))
    reproduction_command = str(metadata.get("reproduction_command", ""))

    reference_table = _render_table(
        ["Method", "Dataset", "Precision", "Recall", "F1", "Note"],
        reference_rows,
    )

    sections = [
        "# Benchmark Report\n\n",
        "## Reproduction\n\n",
        f"`{reproduction_command}`\n\n",
        "## Configuration\n\n",
        f"- Methods: {', '.join(methods)}\n",
        f"- Datasets: {', '.join(datasets)}\n",
        f"- Seeds: {seeds}\n",
        "- Free-tier quota units: `max(llm_calls / 1000, (prompt_tokens + completion_tokens) / 100000)`\n",
        "- GRPO compute cost is reported as free-tier GPU-hours, not dollars.\n",
        f"{skip_note}\n",
        "## Cross-Dataset Local Results\n\n",
        f"{cross_dataset_table}\n\n",
        "## Per-Dataset Local Results\n\n",
        "\n\n".join(dataset_sections),
        "\n\n## Citation-Only SOTA Reference\n\n",
        f"Source: [{source_title}]({source_url})\n\n",
        "HoloClean rows are transcribed from BClean Table 4; see [HoloClean 2017](https://www.vldb.org/pvldb/vol10/p1190-rekatsinas.pdf) for the original system description.\n\n",
        reference_table,
        "\n\n## Methodology\n\n",
        "Local rows are reproduced from generated JSON. Citation-only SOTA rows are copied from literature and are not rerun in this repository. LLM quota units are free-tier fractions; GRPO compute cost is GPU-hours, not dollars.\n",
    ]
    return "".join(sections)


def write_benchmark_outputs(
    *,
    agent_json_path: Path,
    sota_json_path: Path,
    report_path: Path,
    readme_path: Path,
) -> None:
    """Generate the benchmark report and patch the README block.

    Both outputs are fully rendered in memory before anything is written, so
    a failure while loading inputs, reading the README, or locating its
    benchmark markers leaves the existing report and README untouched.

    Args:
        agent_json_path: Agent comparison JSON to load.
        sota_json_path: Citation-only SOTA comparison JSON to load.
        report_path: Destination of the rendered markdown report (UTF-8).
        readme_path: README whose benchmark marker block is rewritten in place.

    Raises:
        ValueError: Propagated from ``load_sota_output`` or
            ``replace_benchmark_block`` when their input is malformed.
    """
    agent_output = load_agent_output(agent_json_path)
    sota_output = load_sota_output(sota_json_path)

    # Render and validate first; write only once every step has succeeded.
    report_text = render_benchmark_report(agent_output, sota_output)
    benchmark_block = build_readme_benchmark_block(agent_output, report_path)
    readme_text = readme_path.read_text(encoding="utf-8")
    updated_readme = replace_benchmark_block(readme_text, benchmark_block)

    report_path.write_text(report_text, encoding="utf-8")
    readme_path.write_text(updated_readme, encoding="utf-8")