Spaces:
Configuration error
Configuration error
File size: 1,562 Bytes
"""Reference/hypothesis tokenisation helpers shared across metric modules.
CIDEr, METEOR (via pycocoevalcap) and ROUGE-L all expect *string* inputs but
each implements its own tokenisation internally. To compare metrics on the
same footing we strip our sentinel tokens once, up front, before any metric
sees a caption.
These helpers exist because every metric module would otherwise re-implement
the same ``[start]`` / ``[end]`` stripping inline — and bugs would diverge.
"""
from __future__ import annotations
from collections.abc import Iterable, Sequence
from captioning.preprocessing.caption import END_TOKEN, START_TOKEN
def strip_sentinels(caption: str) -> str:
    """Drop the training sentinels from ``caption`` and normalise whitespace.

    Args:
        caption: Caption text that may contain ``START_TOKEN`` and/or
            ``END_TOKEN`` anywhere in the string.

    Returns:
        The caption with every occurrence of both sentinels removed, leading
        and trailing whitespace trimmed, and each run of whitespace reduced
        to a single space. A falsy caption (e.g. ``""``) yields ``""``.
    """
    if not caption:
        return ""

    # Swap each sentinel for a space rather than deleting it, so the words on
    # either side of a sentinel stay separated.
    without_start = caption.replace(START_TOKEN, " ")
    without_sentinels = without_start.replace(END_TOKEN, " ")

    # split() with no argument discards all whitespace runs; re-joining with a
    # single space gives the collapsed form.
    words = without_sentinels.split()
    return " ".join(words)
def strip_sentinels_many(captions: Iterable[str]) -> list[str]:
    """Return a new list with :func:`strip_sentinels` applied to each caption.

    Args:
        captions: Any iterable of caption strings; it is consumed once.

    Returns:
        A list of cleaned captions, in the same order as the input.
    """
    return list(map(strip_sentinels, captions))
def strip_sentinels_references(
    references: Sequence[Sequence[str]],
) -> list[list[str]]:
    """Clean every caption in a nested collection of reference captions.

    Args:
        references: A sequence whose items are themselves sequences of
            caption strings.

    Returns:
        A list of lists with the same nesting and order as ``references``,
        where each inner list is the result of
        :func:`strip_sentinels_many` on the corresponding input group.
    """
    cleaned_groups: list[list[str]] = []
    for group in references:
        cleaned_groups.append(strip_sentinels_many(group))
    return cleaned_groups
|