File size: 3,107 Bytes
9bbba62 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 | from __future__ import annotations
import logging
import pytest
from sentence_transformers.base.modules.input_module import InputModule
class _TokenizeOnlyModule(InputModule):
    """Test double following the legacy pattern: defines ``tokenize()`` only.

    ``preprocess()`` is deliberately not overridden, so any call to it is
    served by whatever ``InputModule`` itself provides.
    """

    def forward(self, features):
        # Identity pass-through; the tests never inspect forward().
        return features

    def tokenize(self, texts, **kwargs):
        # Echo the texts back under a single key so tests can see exactly
        # what reached this method.
        return dict(tokens=texts)

    def save(self, output_path, *args, safe_serialization=True, **kwargs):
        # Intentionally a no-op: nothing is persisted by this stub.
        return None
class _PreprocessModule(InputModule):
    """Test double that implements ``preprocess()`` directly."""

    def forward(self, features):
        # Identity pass-through; the tests never inspect forward().
        return features

    def preprocess(self, inputs, prompt=None, **kwargs):
        # A falsy prompt (None or "") leaves the inputs untouched; otherwise
        # they are routed through ``_prepend_prompt`` (not defined in this
        # class; presumably inherited from InputModule).
        tokens = self._prepend_prompt(inputs, prompt) if prompt else inputs
        return {"tokens": tokens}

    def save(self, output_path, *args, safe_serialization=True, **kwargs):
        # Intentionally a no-op: nothing is persisted by this stub.
        return None
class _BareModule(InputModule):
    """Test double that defines neither ``preprocess()`` nor ``tokenize()``."""

    def forward(self, features):
        # Identity pass-through; the tests never inspect forward().
        return features

    def save(self, output_path, *args, safe_serialization=True, **kwargs):
        # Intentionally a no-op: nothing is persisted by this stub.
        return None
class TestPreprocessBackwardCompat:
    """Checks on ``preprocess()`` for each of the three stub subclasses."""

    def test_tokenize_only_subclass_delegates_and_warns(self, caplog):
        # Expect the tokenize() output to come back from preprocess(), and a
        # warning that names the legacy override pattern.
        legacy = _TokenizeOnlyModule()
        with caplog.at_level(logging.WARNING):
            output = legacy.preprocess(["hello", "world"])

        assert output == {"tokens": ["hello", "world"]}
        fragment = "overrides `tokenize` instead of `preprocess`"
        assert any(fragment in text for text in caplog.messages)

    def test_tokenize_only_subclass_prepends_prompt(self):
        # Expect the prompt to be prefixed onto the text that tokenize() sees.
        legacy = _TokenizeOnlyModule()
        output = legacy.preprocess(["hello"], prompt="search: ")
        assert output == {"tokens": ["search: hello"]}

    def test_preprocess_subclass_works_without_warning(self, caplog):
        # No captured message may mention "tokenize" when preprocess() is
        # overridden directly.
        modern = _PreprocessModule()
        with caplog.at_level(logging.WARNING):
            output = modern.preprocess(["hello"])

        assert output == {"tokens": ["hello"]}
        assert all("tokenize" not in text for text in caplog.messages)

    def test_preprocess_subclass_with_prompt(self):
        # Expect the prompt to be prefixed onto the single input text.
        modern = _PreprocessModule()
        output = modern.preprocess(["hello"], prompt="query: ")
        assert output == {"tokens": ["query: hello"]}

    def test_bare_subclass_raises_not_implemented(self):
        # With neither method overridden, preprocess() is expected to raise
        # NotImplementedError with this message fragment.
        bare = _BareModule()
        expected_error = "must implement the `preprocess` method"
        with pytest.raises(NotImplementedError, match=expected_error):
            bare.preprocess(["hello"])
class TestTokenizeDeprecation:
    """Checks on calling ``tokenize()`` on the stub subclasses."""

    def test_tokenize_on_preprocess_subclass_warns_and_delegates(self, caplog):
        # Expect tokenize() to return the preprocess() output and to log a
        # message containing both "tokenize" and "deprecated".
        modern = _PreprocessModule()
        with caplog.at_level(logging.WARNING):
            output = modern.tokenize(["hello"])

        assert output == {"tokens": ["hello"]}
        keywords = ("tokenize", "deprecated")
        assert any(
            all(word in text for word in keywords) for text in caplog.messages
        )

    def test_tokenize_on_bare_subclass_raises(self):
        # With neither method overridden, tokenize() is expected to raise.
        bare = _BareModule()
        with pytest.raises(NotImplementedError):
            bare.tokenize(["hello"])
|