| """Tests del pipeline de preprocesamiento de texto.""" | |
| import re | |
| import pytest | |
| from src.features.text_preprocessor import TextPreprocessor | |
@pytest.fixture
def preprocessor(features_config: str) -> TextPreprocessor:
    """Provide the ``TextPreprocessor`` instance exercised by the tests.

    Registered as a pytest fixture so that test functions declaring a
    ``preprocessor`` argument receive the constructed instance; without the
    decorator pytest cannot resolve that argument.

    Args:
        features_config: Value forwarded as ``config_path`` to the
            preprocessor. NOTE(review): presumably supplied by a
            ``features_config`` fixture defined elsewhere (e.g. conftest.py)
            -- confirm.

    Returns:
        A ``TextPreprocessor`` built with ``config_path=features_config``.
    """
    return TextPreprocessor(config_path=features_config)
def test_empty_text_returns_empty_string(preprocessor: TextPreprocessor):
    """Empty and blank-only inputs must both transform to the empty string."""
    # Checked in order: the truly empty string first, then a lone space.
    for blank_input in ("", " "):
        result = preprocessor.transform(blank_input)
        assert result == ""
def test_url_text_removes_urls(preprocessor: TextPreprocessor):
    """No trace of the input's URLs may remain in the transformed text."""
    clean = preprocessor.transform(
        "Visit https://example.com/path and www.test.org now"
    )

    # Literal fragments of both the scheme-prefixed and the bare "www." URL.
    for fragment in ("http", "www.", "example.com"):
        assert fragment not in clean

    # Also make sure no URL scheme prefix is left in the output.
    scheme_match = re.search(r"https?://", clean)
    assert scheme_match is None
def test_normal_text_lowercase_and_lemmatized(preprocessor: TextPreprocessor):
    """Ordinary prose must come back as non-empty, lowercase, lemmatized text."""
    clean = preprocessor.transform("The runners are running quickly")

    # Output type and casing.
    assert isinstance(clean, str)
    assert clean.lower() == clean

    # The result must not be empty.
    assert clean != ""

    # The lemma "run" must appear as a whole whitespace-separated token.
    tokens = clean.split()
    assert "run" in tokens