Spaces:
Configuration error
Configuration error
| """Per-sample inspection utilities for diagnosing weak captions. | |
| The aggregate corpus metric tells you *how bad* the model is; this module | |
| tells you *why*. For each (image, prediction, reference-set) triple it | |
| records per-sample BLEU-4, sentence-level ROUGE-L, the prediction length, | |
| the longest repeated token run, and whether the prediction is empty after | |
| stripping sentinels. | |
| Three failure modes the evaluation pass is trying to surface: | |
| * **Generic captions** — high BLEU-1, low BLEU-4 (n-gram trickle out). | |
| * **Repetition** — large ``longest_repeat_run`` value (flagged ``repetitive``). | |
| * **Early stopping** — ``length_tokens`` below half the shortest reference length (flagged ``under_length``). | |
| Output JSONL is intentionally flat (one line per sample) so it can be loaded | |
| with ``pandas.read_json(..., lines=True)`` or grep'd from the shell. The | |
| runner that uses this module writes one such file per evaluation pass | |
| alongside ``metrics.json`` for the same run. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| from collections.abc import Iterable, Sequence | |
| from dataclasses import asdict, dataclass | |
| from itertools import pairwise | |
| from pathlib import Path | |
| from captioning.evaluation.tokenization import strip_sentinels | |
@dataclass
class SampleDiagnostics:
    """Inspectable record for one (image, prediction, reference-set) triple.

    This must be a dataclass: instances are built with keyword arguments and
    serialised with :func:`dataclasses.asdict`, both of which need the
    generated ``__init__`` and field metadata.
    """

    # Image identifier exactly as supplied by the caller.
    image: str
    # Predicted caption after sentinel stripping.
    prediction: str
    # Reference captions after sentinel stripping.
    references: list[str]
    # Number of whitespace-separated tokens in ``prediction``.
    length_tokens: int
    # Longest run of one token repeated back-to-back in ``prediction``.
    longest_repeat_run: int
    # Sentence-level BLEU-4, or ``None`` when it could not be computed.
    sentence_bleu4: float | None
    # Sentence-level ROUGE-L F-measure (0-100), or ``None`` when unavailable.
    sentence_rouge_l: float | None
    # Failure-mode labels attached to this sample.
    flags: list[str]
| def _longest_repeat_run(tokens: Sequence[str]) -> int: | |
| """Return the longest run of immediately-repeated tokens. | |
| Example: ``["a", "a", "a", "dog"]`` -> ``3``. Used to flag the classic | |
| transformer-decoder collapse where the same token is emitted on every step. | |
| """ | |
| if not tokens: | |
| return 0 | |
| best = current = 1 | |
| for prev, cur in pairwise(tokens): | |
| current = current + 1 if cur == prev else 1 | |
| best = max(best, current) | |
| return best | |
def _sentence_bleu4(prediction: str, references: Sequence[str]) -> float | None:
    """Sentence-level BLEU-4 via sacrebleu's effective-order smoothing.

    Args:
        prediction: Hypothesis caption.
        references: Reference captions; empty strings are ignored.

    Returns:
        The sacrebleu sentence score, or ``None`` when the prediction is
        empty, no non-empty reference remains, or sacrebleu is not installed.
    """
    # Drop empty references, as the ROUGE-L helper does: a blank reference
    # carries no n-grams and would otherwise still be handed to the scorer.
    valid_refs = [r for r in references if r]
    if not valid_refs or not prediction:
        return None
    # sacrebleu is an optional dependency; without it the metric is skipped.
    try:
        import sacrebleu
    except ImportError:
        return None
    scorer = sacrebleu.metrics.BLEU(effective_order=True, max_ngram_order=4)
    return float(scorer.sentence_score(prediction, valid_refs).score)
def _sentence_rouge_l(prediction: str, references: Sequence[str]) -> float | None:
    """Return the highest ROUGE-L F-measure over the references, scaled to 0-100.

    ``None`` is returned when ``rouge_score`` cannot be imported, when the
    prediction is empty, or when no non-empty reference is available.
    """
    try:
        from rouge_score import rouge_scorer
    except ImportError:
        return None

    usable = [ref for ref in references if ref]
    if not prediction or not usable:
        return None

    rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    # Score against the first reference, then keep the best of the rest.
    top = rouge.score(usable[0], prediction)["rougeL"].fmeasure
    for ref in usable[1:]:
        candidate = rouge.score(ref, prediction)["rougeL"].fmeasure
        if candidate > top:
            top = candidate
    return float(100.0 * top)
def diagnose_sample(
    image: str,
    prediction: str,
    references: Sequence[str],
) -> SampleDiagnostics:
    """Return :class:`SampleDiagnostics` for one prediction-vs-references row.

    Sentinels are stripped from the prediction and from every reference
    before anything is measured. References that are empty, either as given
    or once their sentinels are removed, are discarded.

    Flags that may be attached:

    * ``empty`` - nothing is left of the prediction after stripping.
    * ``very_short`` - the prediction has two tokens or fewer.
    * ``repetitive`` - some token occurs three or more times in a row.
    * ``under_length`` - the prediction has fewer tokens than half the
      length of the shortest reference.

    Args:
        image: Identifier of the image the caption belongs to.
        prediction: Raw predicted caption.
        references: Raw reference captions for the same image.

    Returns:
        The populated diagnostics record.
    """
    pred_clean = strip_sentinels(prediction)
    # Filter AFTER stripping: a sentinel-only reference becomes blank, and a
    # blank reference would force the minimum length to 0 and so silently
    # disable the ``under_length`` check below.
    stripped_refs = (strip_sentinels(r) for r in references if r)
    ref_clean = [r for r in stripped_refs if r.strip()]
    tokens = pred_clean.split()

    flags: list[str] = []
    if not pred_clean:
        flags.append("empty")
    if len(tokens) <= 2:
        flags.append("very_short")
    repeat = _longest_repeat_run(tokens)
    if repeat >= 3:
        flags.append("repetitive")
    # Compared against the SHORTEST reference so that only predictions that
    # are short by every reference's standard get flagged.
    if ref_clean and tokens and len(tokens) < min(len(r.split()) for r in ref_clean) // 2:
        flags.append("under_length")

    return SampleDiagnostics(
        image=image,
        prediction=pred_clean,
        references=ref_clean,
        length_tokens=len(tokens),
        longest_repeat_run=repeat,
        sentence_bleu4=_sentence_bleu4(pred_clean, ref_clean),
        sentence_rouge_l=_sentence_rouge_l(pred_clean, ref_clean),
        flags=flags,
    )
| def diagnose_many( | |
| images: Sequence[str], | |
| predictions: Sequence[str], | |
| references: Sequence[Sequence[str]], | |
| ) -> list[SampleDiagnostics]: | |
| """Vectorised :func:`diagnose_sample` over parallel sequences.""" | |
| if not (len(images) == len(predictions) == len(references)): | |
| raise ValueError( | |
| "images, predictions, references must be the same length: " | |
| f"got {len(images)} / {len(predictions)} / {len(references)}" | |
| ) | |
| return [ | |
| diagnose_sample(img, pred, refs) | |
| for img, pred, refs in zip(images, predictions, references, strict=True) | |
| ] | |
def write_diagnostics_jsonl(
    diagnostics: Iterable[SampleDiagnostics],
    path: str | Path,
) -> None:
    """Serialise diagnostics as JSON Lines: one JSON object per line.

    Args:
        diagnostics: Records to write, typically what :func:`diagnose_many`
            returned.
        path: File to write. Missing parent directories are created, and an
            existing file is overwritten.
    """
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open("w", encoding="utf-8") as handle:
        # ensure_ascii=False keeps non-ASCII caption text readable on disk.
        handle.writelines(
            json.dumps(asdict(record), ensure_ascii=False) + "\n"
            for record in diagnostics
        )
def format_diagnostic_row(d: SampleDiagnostics) -> str:
    """Render one record as a short three-line summary for the CLI tail print.

    The first line carries the image file name, both scores (or ``n/a`` when
    a score is missing), the token count, the repeat run and the flags. The
    second and third lines show the prediction and the first reference.
    """
    if d.sentence_bleu4 is None:
        bleu_field = "BLEU4= n/a"
    else:
        bleu_field = f"BLEU4={d.sentence_bleu4:5.1f}"

    if d.sentence_rouge_l is None:
        rouge_field = "R-L= n/a"
    else:
        rouge_field = f"R-L={d.sentence_rouge_l:5.1f}"

    flag_field = ",".join(d.flags) if d.flags else "-"
    first_reference = d.references[0] if d.references else ""
    file_name = Path(d.image).name

    summary_lines = [
        f"{file_name:35s} {bleu_field} {rouge_field} "
        f"len={d.length_tokens:>2} repeat={d.longest_repeat_run:>2} "
        f"flags={flag_field}",
        f" pred: {d.prediction}",
        f" ref : {first_reference}",
    ]
    return "\n".join(summary_lines)