File size: 4,696 Bytes
91a1214
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""Benchmark-ready run artefacts.

Every evaluation pass writes a consistent set of files under
``<run_root>/<run_id>/`` so Phase 3 cross-model comparisons can join them
without bespoke parsing per model:

    metrics.json            — :class:`MetricsReport` dumped via dataclass-asdict
    predictions.jsonl       — one row per (image, prediction, references)
    diagnostics.jsonl       — one :class:`SampleDiagnostics` per row
    run_meta.json           — model id, decode strategy, n_samples, timestamp
    report.md               — Markdown summary humans actually read

A "run" is one (model, decode_strategy, dataset_slice) tuple. ``run_id`` is
a free-form string — the CLI defaults to a timestamp; comparison code groups
by ``model_id`` to plot bars across models.
"""

from __future__ import annotations

import json
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path

from captioning.evaluation.inspection import SampleDiagnostics, write_diagnostics_jsonl
from captioning.evaluation.runner import MetricsReport


@dataclass(frozen=True)
class RunMeta:
    """Per-evaluation-run metadata persisted next to metrics."""

    model_id: str
    decode_strategy: str
    weights_path: str
    tokenizer_dir: str
    n_samples: int
    max_length: int
    beam_width: int | None = None
    length_penalty: float | None = None
    repetition_penalty: float | None = None
    timestamp_utc: str = field(default_factory=lambda: datetime.now(timezone.utc).isoformat())

    def to_dict(self) -> dict[str, object]:
        return {
            "model_id": self.model_id,
            "decode_strategy": self.decode_strategy,
            "weights_path": self.weights_path,
            "tokenizer_dir": self.tokenizer_dir,
            "n_samples": self.n_samples,
            "max_length": self.max_length,
            "beam_width": self.beam_width,
            "length_penalty": self.length_penalty,
            "repetition_penalty": self.repetition_penalty,
            "timestamp_utc": self.timestamp_utc,
        }


def write_run_artifacts(
    run_dir: str | Path,
    *,
    metrics: MetricsReport,
    meta: RunMeta,
    images: list[str],
    predictions: list[str],
    references: list[list[str]],
    diagnostics: list[SampleDiagnostics],
) -> Path:
    """Write every benchmark artefact to ``run_dir`` and return the directory.

    Idempotent over a clean ``run_dir``; overwrites existing files inside.
    """
    out = Path(run_dir)
    out.mkdir(parents=True, exist_ok=True)

    (out / "metrics.json").write_text(json.dumps(metrics.to_dict(), indent=2), encoding="utf-8")
    (out / "run_meta.json").write_text(json.dumps(meta.to_dict(), indent=2), encoding="utf-8")

    with (out / "predictions.jsonl").open("w", encoding="utf-8") as f:
        for img, pred, refs in zip(images, predictions, references, strict=True):
            row = {"image": img, "prediction": pred, "references": list(refs)}
            f.write(json.dumps(row, ensure_ascii=False) + "\n")

    write_diagnostics_jsonl(diagnostics, out / "diagnostics.jsonl")
    (out / "report.md").write_text(_render_report_markdown(meta, metrics), encoding="utf-8")
    return out


def _render_report_markdown(meta: RunMeta, m: MetricsReport) -> str:
    """Render the human-facing Markdown summary of a single run.

    The document has a title line with the model id, a bullet list of run
    settings (optional decode settings appear only when not ``None``), a
    metrics table, and, when ``m.errors`` is non-empty, a section listing
    each skipped or failed metric with its message.

    Args:
        meta: Run metadata supplying the header bullet values.
        m: Metrics report supplying the table values and ``errors`` mapping.

    Returns:
        The Markdown text, terminated by a single trailing newline.
    """

    def fmt(v: float | None) -> str:
        # Missing metrics render as "n/a"; present ones with two decimals.
        return "n/a" if v is None else f"{v:.2f}"

    optional_settings = (
        ("Beam width", meta.beam_width),
        ("Length penalty", meta.length_penalty),
        ("Repetition penalty", meta.repetition_penalty),
    )
    metric_rows = (
        ("BLEU-1", m.bleu1),
        ("BLEU-2", m.bleu2),
        ("BLEU-3", m.bleu3),
        ("BLEU-4", m.bleu4),
        ("ROUGE-L", m.rouge_l),
        ("METEOR", m.meteor),
        ("CIDEr", m.cider),
    )

    lines = [
        # Title uses a real em dash (the previous literal was mojibake).
        f"# Evaluation run — {meta.model_id}",
        "",
        f"- Decode strategy: `{meta.decode_strategy}`",
        f"- Weights: `{meta.weights_path}`",
        f"- Tokenizer dir: `{meta.tokenizer_dir}`",
        f"- Samples: **{meta.n_samples}**",
        f"- Timestamp (UTC): {meta.timestamp_utc}",
    ]
    lines.extend(
        f"- {label}: {value}"
        for label, value in optional_settings
        if value is not None
    )
    lines += ["", "## Metrics", "", "| Metric | Value |", "|---|---|"]
    lines.extend(f"| {label} | {fmt(value)} |" for label, value in metric_rows)

    if m.errors:
        lines += ["", "## Skipped or failed metrics", ""]
        lines.extend(f"- `{name}`: {err}" for name, err in m.errors.items())
    return "\n".join(lines) + "\n"