File size: 1,051 Bytes
447c4a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
"""Tests del vectorizador TF-IDF."""

import pytest

from src.features.vectorizer import Vectorizer


# min_df=3 in configs/features.yaml: a term must appear in >=3 documents to be
# kept. In CORPUS_TRAIN only "the" (documents 1, 2 and 3) and "fox"
# (documents 1, 3 and 4) occur in three documents -- assuming plain word-level
# tokenization; confirm against the Vectorizer implementation.
CORPUS_TRAIN = [
    "the quick brown fox jumps",
    "the lazy dog runs fast",
    "the fox and dog play together",
    "another quick fox story here",
]
# Single held-out document passed to transform() after fitting on CORPUS_TRAIN.
CORPUS_TEST = ["the fox is quick today"]


@pytest.fixture(scope="module")
def vectorizer(features_config: str) -> Vectorizer:
    """Provide the TF-IDF ``Vectorizer`` used by the tests in this module.

    The fixture is module-scoped, so a single instance is shared by every
    test here; each test fits it again before asserting.

    Args:
        features_config: Value forwarded as ``config_path``. It is supplied
            by a fixture defined outside this file (presumably conftest.py
            -- confirm).

    Returns:
        A ``Vectorizer`` built with ``method="tfidf"``.
    """
    tfidf_vectorizer = Vectorizer(config_path=features_config, method="tfidf")
    return tfidf_vectorizer


def test_fit_transform_output_shape(vectorizer: Vectorizer):
    """Check the shape of the matrix produced by fitting on CORPUS_TRAIN.

    The result must have one row per training document and a column count
    that is non-zero and no larger than 5000.
    """
    fitted = vectorizer.fit_transform(CORPUS_TRAIN)
    row_count, column_count = fitted.shape[0], fitted.shape[1]

    assert row_count == len(CORPUS_TRAIN)
    # NOTE(review): 5000 is presumably the vocabulary cap (max_features) from
    # the features config -- confirm against configs/features.yaml.
    assert 0 < column_count <= 5000


def test_transform_preserves_sample_count(vectorizer: Vectorizer):
    """Check that transform() output lines up with the fitted matrix.

    After fitting on CORPUS_TRAIN, transforming CORPUS_TEST must yield one
    row per test document and the same number of columns as the training
    matrix.
    """
    fitted = vectorizer.fit_transform(CORPUS_TRAIN)
    transformed = vectorizer.transform(CORPUS_TEST)

    out_rows, out_cols = transformed.shape[0], transformed.shape[1]
    assert out_rows == len(CORPUS_TEST)
    # Column count must match the training matrix.
    assert out_cols == fitted.shape[1]