Spaces:
Configuration error
Configuration error
| """Tests for ``captioning.preprocessing.caption.preprocess_caption``. | |
| The function is the cheapest possible thing to test thoroughly, and it's also | |
| the hottest train/serve-skew risk: any divergence here changes both the | |
| training vocabulary and the inference path. | |
| """ | |
| from __future__ import annotations | |
| import re | |
| import pytest | |
| from captioning.preprocessing.caption import ( | |
| END_TOKEN, | |
| START_TOKEN, | |
| preprocess_caption, | |
| ) | |
| def _notebook_baseline(text: str) -> str: | |
| """Verbatim notebook cell 3 for parity comparison.""" | |
| text = text.lower() | |
| text = re.sub(r"[^\w\s]", "", text) | |
| text = re.sub(r"\s+", " ", text) | |
| text = text.strip() | |
| return "[start] " + text + " [end]" | |
@pytest.mark.parametrize(
    "raw",
    [
        "A man riding a bike.",
        "  Leading and trailing spaces  ",
        "Multiple   internal\tspaces\nand newlines",
        "Punctuation!!! Is... (removed), isn't it?",
        "MiXeD CaSe with 2 dogs & 3 cats",
        "snake_case_is_kept",
    ],
)
def test_matches_notebook_baseline(raw: str) -> None:
    """``preprocess_caption`` must agree with the notebook baseline.

    ``raw`` is supplied by the parametrization above; without it pytest would
    look for a fixture named ``raw`` and error out before running the test.
    The cases cover casing, punctuation, whitespace runs and underscores.
    """
    assert preprocess_caption(raw) == _notebook_baseline(raw)
def test_wraps_in_sentinels() -> None:
    """The output opens with ``START_TOKEN`` plus a space and closes with a
    space plus ``END_TOKEN``."""
    result = preprocess_caption("hello world")
    expected_prefix = f"{START_TOKEN} "
    expected_suffix = f" {END_TOKEN}"
    assert result.startswith(expected_prefix)
    assert result.endswith(expected_suffix)
def test_idempotent_on_already_clean() -> None:
    """A caption that is already lowercase and punctuation-free must come
    back unchanged between the sentinels."""
    caption = "a man riding a bike"
    prefix, suffix = f"{START_TOKEN} ", f" {END_TOKEN}"
    # Peel the sentinels off; what remains should be exactly the input.
    body = preprocess_caption(caption).removeprefix(prefix).removesuffix(suffix)
    assert body == caption
def test_strips_emoji_and_unicode_punct() -> None:
    """Emoji and punctuation are dropped from the caption.

    ``\\w`` in Python regex matches Unicode word characters by default, so
    only non-word, non-whitespace characters (punctuation, emoji) are
    expected to be removed. Documenting current behaviour.
    """
    # The emoji is written as an escape so it survives any re-encoding of this
    # file. A mis-decoded emoji can turn into a Unicode digit or letter, which
    # ``\w`` matches and which would therefore NOT be stripped.
    out = preprocess_caption("hello \U0001F30D world!")
    inner = out.removeprefix(f"{START_TOKEN} ").removesuffix(f" {END_TOKEN}")
    # The emoji is non-word and non-whitespace, so it is stripped; the spaces
    # left on either side of it collapse into a single space.
    assert inner == "hello world"