File size: 2,811 Bytes
6cda091
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
"""
src/features/vectorizer.py

Vectorizador configurable desde YAML.
Traducción directa del notebook 03 a código de producción.

Decisión del proyecto: TF-IDF con ngram=(1,2) y max_features=5000.
Justificación:
    - Bigramas capturan contexto: 'black thug' es distinto a 'black' solo
    - max_features=5000 equilibra vocabulario vs overfitting (800 muestras train)
    - sublinear_tf=True evita que repetir una palabra infle artificialmente su peso

Uso:
    vec = Vectorizer()
    X_train_vec = vec.fit_transform(X_train_text)
    X_test_vec  = vec.transform(X_test_text)
"""

import yaml
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from src.utils.logger import get_logger

# Module-level logger obtained from the project's logging helper, keyed by
# this module's import name.
logger = get_logger(__name__)


class Vectorizer:
    """
    Thin wrapper around scikit-learn's TfidfVectorizer / CountVectorizer.

    Hyper-parameters are read from the ``vectorization`` section of a YAML
    config file (``configs/features.yaml`` by default).

    Critical rule: call ``fit_transform()`` ONLY on the training set and
    ``transform()`` on everything else. Fitting on the full dataset before
    the train/test split leaks test vocabulary/statistics into training.
    """

    def __init__(self, config_path: str = "configs/features.yaml", method: str = None):
        """
        Build the underlying vectorizer from the YAML configuration.

        Args:
            config_path: Path to the YAML file holding a ``vectorization``
                section.
            method: Name of the sub-section of ``vectorization`` to use.
                When falsy, the config's ``method`` entry is used, falling
                back to ``"tfidf"``. ``"tfidf"`` builds a TfidfVectorizer;
                any other name builds a CountVectorizer from the section of
                that name.

        Raises:
            OSError: If the config file cannot be opened.
            KeyError: If the ``vectorization`` section, the selected method
                section, or its ``max_features`` / ``ngram_range`` entries
                are missing.
        """
        # Explicit UTF-8: the default encoding of open() is locale-dependent,
        # which can mis-decode (or fail on) a UTF-8 YAML file with non-ASCII
        # text on platforms whose locale is not UTF-8.
        with open(config_path, encoding="utf-8") as f:
            cfg = yaml.safe_load(f)["vectorization"]

        self.method = method or cfg.get("method", "tfidf")
        params = cfg[self.method]

        # Arguments shared by both vectorizer flavours.
        shared_kwargs = dict(
            max_features=params["max_features"],
            # YAML yields a list; scikit-learn expects a (min_n, max_n) tuple.
            ngram_range=tuple(params["ngram_range"]),
            min_df=params.get("min_df", 3),
            analyzer="word",
            strip_accents="unicode",
        )

        if self.method == "tfidf":
            self.vectorizer = TfidfVectorizer(
                sublinear_tf=params.get("sublinear_tf", True),
                **shared_kwargs,
            )
        else:
            self.vectorizer = CountVectorizer(**shared_kwargs)

        logger.info(f"Vectorizer: {self.method} | max_features={params['max_features']} | ngram={params['ngram_range']}")

    def fit_transform(self, X_train):
        """
        Fit the vocabulary on the training texts and return their matrix.

        Also logs the resulting matrix shape and its sparsity (fraction of
        cells that are not stored as non-zero entries).
        """
        logger.info("Vectorizando train set...")
        matrix = self.vectorizer.fit_transform(X_train)
        n_rows, n_cols = matrix.shape
        sparsity = 1 - matrix.nnz / (n_rows * n_cols)
        logger.info(f"  Shape: {matrix.shape} | Sparsidad: {sparsity:.1%}")
        return matrix

    def transform(self, X):
        """Transform texts with the already-fitted vectorizer (no refit)."""
        return self.vectorizer.transform(X)

    def get_feature_names(self):
        """Return the underlying vectorizer's ``get_feature_names_out()``."""
        return self.vectorizer.get_feature_names_out()

    @property
    def vocabulary_size(self) -> int:
        """Number of entries in the underlying vectorizer's ``vocabulary_``."""
        return len(self.vectorizer.vocabulary_)