Spaces:
Configuration error
Configuration error
| """Corpus BLEU score. | |
| The IEEE paper reports BLEU-4 ~24 on COCO val. ``sacrebleu`` is the de-facto | |
| BLEU implementation; NLTK's BLEU has idiosyncratic smoothing and would not | |
| reproduce the published number across machines. | |
| ``corpus_bleu_score`` returns BLEU-4 (the default n=4 score) so existing | |
| callers keep working. ``corpus_bleu_breakdown`` additionally exposes BLEU-1, | |
| BLEU-2, BLEU-3, BLEU-4 in one pass — useful for the inspection utility and | |
| for the JSON report consumed by Phase 3 cross-model comparison. | |
| """ | |
| from __future__ import annotations | |
| from collections.abc import Sequence | |
| from dataclasses import dataclass | |
| from captioning.evaluation.tokenization import ( | |
| strip_sentinels_many, | |
| strip_sentinels_references, | |
| ) | |
@dataclass
class BleuBreakdown:
    """Cumulative BLEU-1 through BLEU-4 scores on sacrebleu's 0-100 scale.

    Declared as a dataclass so it can be built with keyword arguments
    (``BleuBreakdown(bleu1=..., bleu2=..., bleu3=..., bleu4=...)``), which is
    how ``corpus_bleu_breakdown`` constructs its return value.

    Attributes:
        bleu1: Cumulative BLEU-1 score.
        bleu2: Cumulative BLEU-2 score.
        bleu3: Cumulative BLEU-3 score.
        bleu4: Cumulative BLEU-4 score (the value ``corpus_bleu_score`` returns).
    """

    bleu1: float
    bleu2: float
    bleu3: float
    bleu4: float
| def _refs_by_slot(references: Sequence[Sequence[str]]) -> list[list[str]]: | |
| """Convert ragged per-example references to sacrebleu's per-slot layout.""" | |
| max_refs = max(len(r) for r in references) if references else 0 | |
| return [[refs[i] if i < len(refs) else "" for refs in references] for i in range(max_refs)] | |
def corpus_bleu_score(
    predictions: Sequence[str],
    references: Sequence[Sequence[str]],
) -> float:
    """Return the corpus-level BLEU-4 score computed with ``sacrebleu``.

    Thin wrapper around :func:`corpus_bleu_breakdown` that keeps only the
    ``bleu4`` field of the result.

    Args:
        predictions: One generated caption per evaluation example.
        references: One *list* of reference captions per evaluation example.
            COCO has up to 5 references per image. Lists may differ in
            length; ``corpus_bleu_breakdown`` pads the shorter ones with the
            empty string ``""`` before scoring.

    Returns:
        BLEU-4 in the 0-100 range (sacrebleu's convention). NLTK reports
        BLEU in the 0-1 range, so divide this value by 100 before comparing
        scales — the two implementations are still not interchangeable.

    Raises:
        ImportError: If sacrebleu is not installed. Install via the eval
            extras: ``pip install -e ".[eval]"`` or the requirements file.
        ValueError: If ``predictions`` and ``references`` differ in length
            (raised by ``corpus_bleu_breakdown``).
    """
    breakdown = corpus_bleu_breakdown(predictions, references)
    return breakdown.bleu4
def corpus_bleu_breakdown(
    predictions: Sequence[str],
    references: Sequence[Sequence[str]],
) -> BleuBreakdown:
    """Compute cumulative BLEU-1, BLEU-2, BLEU-3 and BLEU-4 in one call.

    Sentinels are stripped from predictions and references, the references
    are rearranged into sacrebleu's per-slot layout, and the corpus is then
    scored once per n-gram order (n = 1..4).

    Args:
        predictions: One generated caption per example.
        references: One *list* of reference captions per example.

    Returns:
        :class:`BleuBreakdown` with all four cumulative BLEU-n scores on the
        0-100 scale (sacrebleu's convention).

    Raises:
        ImportError: If sacrebleu is not installed.
        ValueError: If ``predictions`` and ``references`` differ in length.
    """
    # Imported lazily so the module can be imported without the eval extras.
    try:
        import sacrebleu
    except ImportError as exc:
        raise ImportError(
            "sacrebleu is required for BLEU evaluation. "
            "Install it via `pip install -r requirements-eval.txt`."
        ) from exc

    num_predictions = len(predictions)
    num_references = len(references)
    if num_predictions != num_references:
        raise ValueError(
            f"predictions ({num_predictions}) and references "
            f"({num_references}) must have the same length"
        )

    hypotheses = strip_sentinels_many(predictions)
    cleaned_references = strip_sentinels_references(references)
    slot_major_references = _refs_by_slot(cleaned_references)

    def cumulative_bleu(order: int) -> float:
        # ``corpus_bleu`` only returns BLEU-4. For the lower orders a ``BLEU``
        # scorer is built directly with ``max_ngram_order=order``, which
        # weights the geometric mean over precisions[:order] (same convention
        # as NLTK's cumulative BLEU and the COCO eval scripts).
        scorer = sacrebleu.metrics.BLEU(max_ngram_order=order, effective_order=True)
        result = scorer.corpus_score(hypotheses, slot_major_references)
        return float(result.score)

    bleu1, bleu2, bleu3, bleu4 = (cumulative_bleu(order) for order in (1, 2, 3, 4))
    return BleuBreakdown(bleu1=bleu1, bleu2=bleu2, bleu3=bleu3, bleu4=bleu4)