Spaces:
Configuration error
Configuration error
File size: 2,086 Bytes
3a2e5f0 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 | """Tests for ``captioning.preprocessing.caption.preprocess_caption``.
The function is the cheapest possible thing to test thoroughly, and it's also
the hottest train/serve-skew risk: any divergence here changes both the
training vocabulary and the inference path.
"""
from __future__ import annotations
import re
import pytest
from captioning.preprocessing.caption import (
END_TOKEN,
START_TOKEN,
preprocess_caption,
)
def _notebook_baseline(text: str) -> str:
"""Verbatim notebook cell 3 for parity comparison."""
text = text.lower()
text = re.sub(r"[^\w\s]", "", text)
text = re.sub(r"\s+", " ", text)
text = text.strip()
return "[start] " + text + " [end]"
@pytest.mark.parametrize(
    "raw",
    [
        "A man riding a bike",
        "ALL CAPS ARE LOWERED",
        "punctuation, removed!",
        " multiple spaces ",
        "Numbers 123 stay",
        "Tabs\tand\nnewlines",
        "",
    ],
)
def test_matches_notebook_baseline(raw: str) -> None:
    """``preprocess_caption`` must agree with the notebook reference.

    Each parametrized input (mixed case, punctuation, padding, digits,
    tab/newline whitespace, and the empty string) is run through both
    implementations and the results are compared for exact equality.
    """
    expected = _notebook_baseline(raw)
    actual = preprocess_caption(raw)
    assert actual == expected
def test_wraps_in_sentinels() -> None:
    """Output starts with the start token and ends with the end token.

    Each sentinel must be separated from the caption body by one space.
    """
    wrapped = preprocess_caption("hello world")

    leading = START_TOKEN + " "
    assert wrapped.startswith(leading)

    trailing = " " + END_TOKEN
    assert wrapped.endswith(trailing)
def test_idempotent_on_already_clean() -> None:
    """A caption that is already clean passes through unchanged.

    For lowercase, punctuation-free, single-spaced input, the text
    between the sentinels must be identical to what was passed in.
    """
    already_clean = "a man riding a bike"
    processed = preprocess_caption(already_clean)

    # Peel off each sentinel together with its separating space so that
    # only the caption body is compared against the input.
    head = f"{START_TOKEN} "
    tail = f" {END_TOKEN}"
    body = processed.removeprefix(head).removesuffix(tail)

    assert body == already_clean
def test_strips_emoji_and_unicode_punct() -> None:
    """Pin down how emoji and punctuation are handled today.

    Python's ``\\w`` matches unicode word characters by default, so
    anything that is neither a word character nor whitespace (the emoji
    and the ``!`` here) is removed. This documents current behaviour
    rather than prescribing it.
    """
    processed = preprocess_caption("hello 😀 world!")

    head = f"{START_TOKEN} "
    tail = f" {END_TOKEN}"
    body = processed.removeprefix(head).removesuffix(tail)

    # Dropping the emoji leaves two adjacent spaces, which the
    # whitespace collapse reduces to a single space.
    assert body == "hello world"
|