File size: 9,745 Bytes
5143557
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eed1cab
5143557
 
 
 
 
 
eed1cab
5143557
 
 
 
 
 
 
 
eed1cab
5143557
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eed1cab
5143557
 
 
 
 
eed1cab
5143557
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eed1cab
5143557
 
 
 
 
 
eed1cab
 
 
 
 
 
 
 
 
5143557
 
 
 
 
eed1cab
5143557
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eed1cab
5143557
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
eed1cab
5143557
 
 
 
 
 
 
 
 
 
 
 
 
eed1cab
5143557
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
"""Benchmark report rendering and README marker updates."""

from __future__ import annotations

import json
from collections import defaultdict
from pathlib import Path
from typing import cast

from dataforge.bench.core import AggregateBenchmarkResult, BenchmarkRunOutput


def _format_metric(mean_value: float | None, std_value: float | None) -> str:
    """Return the markdown cell text for one mean/std metric pair.

    A missing mean renders as ``Skipped``. Otherwise the mean is shown with
    four decimals, followed by ``+/- std`` when a standard deviation exists.
    """
    if mean_value is None:
        return "Skipped"
    cell = f"{mean_value:.4f}"
    if std_value is not None:
        cell += f" +/- {std_value:.4f}"
    return cell


def _render_table(headers: list[str], rows: list[list[str]]) -> str:
    """Build a markdown table from a header list and pre-formatted rows.

    The output is the header line, a ``---`` separator line with one cell per
    header, then one line per row, joined by newlines without a trailing one.
    """

    def as_line(cells: list[str]) -> str:
        # Every table line shares the same pipe-delimited layout.
        return "| " + " | ".join(cells) + " |"

    separator_cells = ["---"] * len(headers)
    table_lines = [as_line(headers), as_line(separator_cells)]
    table_lines.extend(as_line(cells) for cells in rows)
    return "\n".join(table_lines)


def load_agent_output(path: Path) -> BenchmarkRunOutput:
    """Read the agent comparison JSON at *path* and validate it.

    The file is decoded as UTF-8, parsed as JSON, and passed to
    ``BenchmarkRunOutput.model_validate``.
    """
    raw_text = path.read_text(encoding="utf-8")
    payload = json.loads(raw_text)
    return BenchmarkRunOutput.model_validate(payload)


def load_sota_output(path: Path) -> dict[str, object]:
    """Read the citation-only SOTA comparison JSON at *path*.

    Raises:
        ValueError: If the parsed JSON document is not a top-level object.
    """
    text = path.read_text(encoding="utf-8")
    payload = json.loads(text)
    if isinstance(payload, dict):
        return cast(dict[str, object], payload)
    raise ValueError("SOTA comparison JSON must be a top-level object.")


def replace_benchmark_block(readme_text: str, block_text: str) -> str:
    """Replace the README benchmark marker block idempotently.

    Everything between ``<!-- BENCH:START -->`` and the first
    ``<!-- BENCH:END -->`` that follows it is replaced by the stripped
    *block_text* surrounded by single newlines. The markers themselves and
    all text outside them are kept unchanged.

    Args:
        readme_text: Full README contents containing both markers.
        block_text: Generated benchmark block; surrounding whitespace is
            stripped before insertion.

    Returns:
        The README text with the marker block contents replaced.

    Raises:
        ValueError: If either marker is missing, or if no end marker appears
            after the start marker.
    """
    start_marker = "<!-- BENCH:START -->"
    end_marker = "<!-- BENCH:END -->"

    marker_pos = readme_text.find(start_marker)
    if marker_pos == -1 or end_marker not in readme_text:
        raise ValueError("README benchmark markers are missing.")
    start = marker_pos + len(start_marker)

    # Search only after the start marker: an end marker earlier in the file
    # would otherwise give end < start and the slices below would duplicate
    # README content instead of replacing the block.
    end = readme_text.find(end_marker, start)
    if end == -1:
        raise ValueError("README benchmark end marker must follow the start marker.")

    return readme_text[:start] + "\n" + block_text.strip() + "\n" + readme_text[end:]


def _aggregate_across_datasets(aggregates: list[AggregateBenchmarkResult]) -> list[list[str]]:
    """Produce one summary row per method, averaged over its ``ok`` aggregates.

    Methods are emitted in sorted order. A method with no ``ok`` aggregate
    gets a row of ``Skipped`` cells. For the others, each metric is the
    unweighted mean over the method's ``ok`` aggregates, with a missing
    (``None``) metric counted as ``0.0``.
    """
    # (attribute on the aggregate, format spec for the rendered cell)
    metric_columns = (
        ("precision_mean", ".4f"),
        ("recall_mean", ".4f"),
        ("f1_mean", ".4f"),
        ("avg_steps_mean", ".2f"),
        ("quota_units_mean", ".4f"),
        ("gpu_hours_mean", ".4f"),
    )

    successes_by_method: dict[str, list[AggregateBenchmarkResult]] = {}
    seen_methods: set[str] = set()
    for entry in aggregates:
        seen_methods.add(entry.method)
        if entry.status == "ok":
            successes_by_method.setdefault(entry.method, []).append(entry)

    summary: list[list[str]] = []
    for name in sorted(seen_methods):
        successes = successes_by_method.get(name)
        if not successes:
            summary.append([name] + ["Skipped"] * len(metric_columns))
            continue
        count = len(successes)
        cells = [name]
        for attribute, spec in metric_columns:
            total = sum(getattr(item, attribute) or 0.0 for item in successes)
            cells.append(format(total / count, spec))
        summary.append(cells)
    return summary


def _collect_skip_reasons(aggregates: list[AggregateBenchmarkResult]) -> list[str]:
    """Return the distinct skip reasons of non-``ok`` aggregates.

    Reasons appear in first-seen order. Aggregates with status ``ok`` or
    without a skip reason contribute nothing.
    """
    # Dict keys give insertion-ordered de-duplication.
    ordered: dict[str, None] = {}
    for entry in aggregates:
        text = entry.skip_reason
        if entry.status != "ok" and text is not None:
            ordered.setdefault(text, None)
    return list(ordered)


def build_readme_benchmark_block(agent_output: BenchmarkRunOutput, report_path: Path) -> str:
    """Compose the generated benchmark summary text for the README.

    The block holds a provenance line, the cross-dataset summary table, a
    pointer to the report file (by file name only), and a trailing note
    listing skip reasons when any aggregate was skipped.
    """
    aggregates = agent_output.aggregates
    summary_table = _render_table(
        ["Method", "Precision", "Recall", "F1", "Avg Steps", "Quota Units", "GPU Hours"],
        _aggregate_across_datasets(aggregates),
    )
    reasons = _collect_skip_reasons(aggregates)

    parts = [
        "Generated from `eval/results/agent_comparison.json`.\n\n",
        f"{summary_table}\n\n",
        f"See `{report_path.name}` for per-dataset tables, error bars, and citation-only SOTA rows.",
    ]
    if reasons:
        parts.append("\n\nSkipped methods in this run: " + "; ".join(reasons))
    return "".join(parts)


def render_benchmark_report(
    agent_output: BenchmarkRunOutput,
    sota_output: dict[str, object],
) -> str:
    """Render the complete markdown benchmark report.

    Sections, in order: reproduction command, configuration (read from
    ``agent_output.metadata``), cross-dataset local results, one table per
    dataset, the citation-only SOTA reference table built from
    ``sota_output["rows"]``, and a methodology note.
    """
    metric_headers = [
        "Method",
        "Precision",
        "Recall",
        "F1",
        "Avg Steps",
        "Quota Units",
        "GPU Hours",
    ]

    # Bucket aggregates by dataset, keeping datasets in first-seen order.
    buckets: dict[str, list[AggregateBenchmarkResult]] = {}
    for entry in agent_output.aggregates:
        buckets.setdefault(entry.dataset, []).append(entry)

    dataset_sections: list[str] = []
    for dataset_name, entries in buckets.items():
        body_rows: list[list[str]] = []
        for entry in entries:
            body_rows.append(
                [
                    entry.method,
                    _format_metric(entry.precision_mean, entry.precision_std),
                    _format_metric(entry.recall_mean, entry.recall_std),
                    _format_metric(entry.f1_mean, entry.f1_std),
                    _format_metric(entry.avg_steps_mean, entry.avg_steps_std),
                    _format_metric(entry.quota_units_mean, entry.quota_units_std),
                    _format_metric(entry.gpu_hours_mean, entry.gpu_hours_std),
                ]
            )
        heading = f"### {dataset_name.title()}\n\n"
        dataset_sections.append(heading + _render_table(metric_headers, body_rows))

    local_summary = _render_table(
        metric_headers,
        _aggregate_across_datasets(agent_output.aggregates),
    )

    # Citation-only rows: a non-list "rows" value and non-dict entries are ignored.
    rows_field = sota_output.get("rows", [])
    candidate_rows = rows_field if isinstance(rows_field, list) else []
    sota_rows: list[list[str]] = []
    for item in candidate_rows:
        if not isinstance(item, dict):
            continue
        sota_rows.append(
            [
                str(item["method"]),
                str(item["dataset"]),
                f"{float(item['precision']):.3f}",
                f"{float(item['recall']):.3f}",
                f"{float(item['f1']):.3f}",
                str(item["note"]),
            ]
        )

    source = sota_output.get("source", {})
    if isinstance(source, dict):
        source_title = source.get("title", "Unknown source")
        source_url = source.get("url", "")
    else:
        source_title = "Unknown source"
        source_url = ""

    reasons = _collect_skip_reasons(agent_output.aggregates)
    if reasons:
        skip_note = "\nSkipped methods in this reproduced run: " + "; ".join(reasons) + "\n"
    else:
        skip_note = ""

    # Metadata values are only listed when they are actual lists.
    method_values = agent_output.metadata.get("methods", [])
    dataset_values = agent_output.metadata.get("datasets", [])
    methods = [str(value) for value in method_values] if isinstance(method_values, list) else []
    datasets = [str(value) for value in dataset_values] if isinstance(dataset_values, list) else []
    seeds = str(agent_output.metadata.get("seeds", ""))
    reproduction_command = str(agent_output.metadata.get("reproduction_command", ""))

    sota_table = _render_table(
        ["Method", "Dataset", "Precision", "Recall", "F1", "Note"],
        sota_rows,
    )

    parts = [
        "# Benchmark Report\n\n",
        "## Reproduction\n\n",
        f"`{reproduction_command}`\n\n",
        "## Configuration\n\n",
        f"- Methods: {', '.join(methods)}\n",
        f"- Datasets: {', '.join(datasets)}\n",
        f"- Seeds: {seeds}\n",
        "- Free-tier quota units: `max(llm_calls / 1000, (prompt_tokens + completion_tokens) / 100000)`\n",
        "- GRPO compute cost is reported as free-tier GPU-hours, not dollars.\n",
        f"{skip_note}\n",
        "## Cross-Dataset Local Results\n\n",
        f"{local_summary}\n\n",
        "## Per-Dataset Local Results\n\n",
        "\n\n".join(dataset_sections),
        "\n\n## Citation-Only SOTA Reference\n\n",
        f"Source: [{source_title}]({source_url})\n\n",
        "HoloClean rows are transcribed from BClean Table 4; see [HoloClean 2017](https://www.vldb.org/pvldb/vol10/p1190-rekatsinas.pdf) for the original system description.\n\n",
        sota_table,
        "\n\n## Methodology\n\n",
        "Local rows are reproduced from generated JSON. Citation-only SOTA rows are copied from literature and are not rerun in this repository. LLM quota units are free-tier fractions; GRPO compute cost is GPU-hours, not dollars.\n",
    ]
    return "".join(parts)


def write_benchmark_outputs(
    *,
    agent_json_path: Path,
    sota_json_path: Path,
    report_path: Path,
    readme_path: Path,
) -> None:
    """Write the markdown report and refresh the README benchmark block.

    Both JSON inputs are loaded first. The rendered report is written to
    *report_path*, and only afterwards is the README read, patched between
    its benchmark markers, and written back. All files use UTF-8.
    """
    agent_run = load_agent_output(agent_json_path)
    sota_reference = load_sota_output(sota_json_path)

    # The report is written before the README is read or modified.
    rendered_report = render_benchmark_report(agent_run, sota_reference)
    report_path.write_text(rendered_report, encoding="utf-8")

    current_readme = readme_path.read_text(encoding="utf-8")
    patched_readme = replace_benchmark_block(
        current_readme,
        build_readme_benchmark_block(agent_run, report_path),
    )
    readme_path.write_text(patched_readme, encoding="utf-8")