File size: 1,072 Bytes
447c4a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
"""Tests del pipeline de preprocesamiento de texto."""

import re

import pytest

from src.features.text_preprocessor import TextPreprocessor


@pytest.fixture(scope="module")
def preprocessor(features_config: str) -> TextPreprocessor:
    """Provide a ``TextPreprocessor`` for the tests in this module.

    The fixture is module-scoped, so pytest builds the instance once and
    reuses it for every test here that requests it.

    Args:
        features_config: Value forwarded as ``config_path``. It is requested
            as a fixture that is not defined in this file (presumably in a
            ``conftest.py`` -- confirm).

    Returns:
        The constructed ``TextPreprocessor``.
    """
    shared_instance = TextPreprocessor(config_path=features_config)
    return shared_instance


def test_empty_text_returns_empty_string(preprocessor: TextPreprocessor):
    """Check that empty and whitespace-only inputs transform to ``""``."""
    # Same inputs, in the same order, as two separate assertions would use.
    for blank_input in ("", "   "):
        assert preprocessor.transform(blank_input) == ""


def test_url_text_removes_urls(preprocessor: TextPreprocessor):
    """Check that no URL remnants are left in the transformed text.

    The input mixes a scheme-prefixed URL and a bare ``www.`` host; the
    output must contain neither the scheme, the ``www.`` prefix, nor the
    domain of the first URL.
    """
    text_with_urls = "Visit https://example.com/path and www.test.org now"
    cleaned = preprocessor.transform(text_with_urls)

    # Plain substring checks for the pieces a partial strip could leave behind.
    for leftover in ("http", "www.", "example.com"):
        assert leftover not in cleaned

    # Final pattern-level check for any remaining scheme prefix.
    scheme_match = re.search(r"https?://", cleaned)
    assert scheme_match is None


def test_normal_text_lowercase_and_lemmatized(preprocessor: TextPreprocessor):
    """Check the output for an ordinary English sentence.

    The result must be a non-empty, all-lowercase string whose
    whitespace-separated tokens include ``"run"``.
    """
    sentence = "The runners are running quickly"
    processed = preprocessor.transform(sentence)

    assert isinstance(processed, str)
    assert processed.lower() == processed
    assert processed != ""

    # Match on whole tokens so that e.g. "runners" alone would not satisfy it.
    tokens = processed.split()
    assert "run" in tokens