"""Tests for the TF-IDF vectorizer."""
| import pytest | |
| from src.features.vectorizer import Vectorizer | |
# configs/features.yaml sets min_df=3, i.e. a term must occur in at least
# 3 documents. In this training corpus only "the" and "fox" reach that
# document frequency ("quick" and "dog" appear in just two documents each).
CORPUS_TRAIN: list[str] = [
    "the quick brown fox jumps",
    "the lazy dog runs fast",
    "the fox and dog play together",
    "another quick fox story here",
]

# Single held-out document used to exercise transform() after fitting.
CORPUS_TEST: list[str] = [
    "the fox is quick today",
]
@pytest.fixture
def vectorizer(features_config: str) -> Vectorizer:
    """Provide a newly constructed TF-IDF ``Vectorizer`` to each test.

    Registered as a pytest fixture so that tests in this module can request
    it simply by declaring a ``vectorizer`` parameter. Without the
    ``@pytest.fixture`` decorator pytest cannot resolve that parameter and
    every test fails with "fixture 'vectorizer' not found".

    Args:
        features_config: Value passed as ``config_path`` to ``Vectorizer``.
            NOTE(review): presumably supplied by a ``features_config``
            fixture defined outside this file (e.g. in conftest.py) -
            confirm.

    Returns:
        A ``Vectorizer`` built with ``method="tfidf"``.
    """
    return Vectorizer(config_path=features_config, method="tfidf")
def test_fit_transform_output_shape(vectorizer: Vectorizer):
    """Check the shape of the matrix returned by ``fit_transform``.

    The result must have one row per training document and a non-empty
    feature axis that does not exceed 5000 columns.
    """
    fitted = vectorizer.fit_transform(CORPUS_TRAIN)

    # One row per input document.
    assert len(CORPUS_TRAIN) == fitted.shape[0]

    # At least one feature, and no more than the 5000-column ceiling.
    # NOTE(review): 5000 presumably mirrors a max_features setting in the
    # features config - confirm.
    n_features = fitted.shape[1]
    assert 0 < n_features <= 5000
def test_transform_preserves_sample_count(vectorizer: Vectorizer):
    """Check that ``transform`` output lines up with the fitted matrix.

    After fitting on the training corpus, transforming the test corpus must
    yield one row per test document and the same number of columns as the
    matrix produced by ``fit_transform``.
    """
    fitted = vectorizer.fit_transform(CORPUS_TRAIN)
    unseen = vectorizer.transform(CORPUS_TEST)

    # Row count follows the number of documents passed to transform().
    assert len(CORPUS_TEST) == unseen.shape[0]
    # Column count must match what fit_transform() produced.
    assert fitted.shape[1] == unseen.shape[1]