"""
src/features/vectorizer.py

Vectorizer configurable from YAML.
Direct translation of notebook 03 into production code.

Project decision: TF-IDF with ngram=(1,2) and max_features=5000.
Rationale:
  - Bigrams capture context: 'black thug' is different from 'black' alone
  - max_features=5000 balances vocabulary size vs overfitting
    (800 training samples)
  - sublinear_tf=True prevents a repeated word from artificially
    inflating its weight

Usage:
    vec = Vectorizer()
    X_train_vec = vec.fit_transform(X_train_text)
    X_test_vec = vec.transform(X_test_text)
"""
from __future__ import annotations

import yaml
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from src.utils.logger import get_logger

logger = get_logger(__name__)


class Vectorizer:
    """
    Wrapper around TfidfVectorizer / CountVectorizer.

    Parameters are controlled by configs/features.yaml. The file must have
    a top-level ``vectorization`` mapping containing:

      - ``method`` (optional): name of the method to use, default "tfidf".
      - one sub-mapping per method name with the keys ``max_features`` and
        ``ngram_range`` (required), ``min_df`` (optional, default 3) and,
        for "tfidf" only, ``sublinear_tf`` (optional, default True).

    Any method name other than "tfidf" builds a CountVectorizer.

    Critical rule: fit() ONLY on train, transform() on train and test.
    Fitting on the whole dataset before the split causes data leakage.
    """

    def __init__(self, config_path: str = "configs/features.yaml", method: str | None = None):
        """
        Read the YAML configuration and build the underlying vectorizer.

        Args:
            config_path: Path to the YAML file holding the ``vectorization``
                section.
            method: Method name overriding the one in the config. When falsy,
                the config's ``method`` value is used, falling back to
                "tfidf".

        Raises:
            KeyError: If the ``vectorization`` section, the section for the
                selected method, or one of its required keys is missing.
        """
        # Explicit UTF-8: without it the file is decoded with the platform
        # locale, which can mis-read non-ASCII content in the config.
        with open(config_path, encoding="utf-8") as f:
            cfg = yaml.safe_load(f)["vectorization"]

        self.method = method or cfg.get("method", "tfidf")
        params = cfg[self.method]

        # Arguments common to both vectorizer flavours.
        shared_kwargs = {
            "max_features": params["max_features"],
            # YAML yields a list; scikit-learn expects a tuple.
            "ngram_range": tuple(params["ngram_range"]),
            "min_df": params.get("min_df", 3),
            "analyzer": "word",
            "strip_accents": "unicode",
        }

        if self.method == "tfidf":
            self.vectorizer = TfidfVectorizer(
                sublinear_tf=params.get("sublinear_tf", True),
                **shared_kwargs,
            )
        else:
            self.vectorizer = CountVectorizer(**shared_kwargs)

        logger.info(f"Vectorizer: {self.method} | max_features={params['max_features']} | ngram={params['ngram_range']}")

    def fit_transform(self, X_train):
        """
        Fit the vocabulary on the train set and transform it.

        Args:
            X_train: Iterable of raw training documents.

        Returns:
            The document-term matrix produced by the underlying vectorizer.
            Its ``shape`` and ``nnz`` are used to log the sparsity.
        """
        logger.info("Vectorizando train set...")
        matrix = self.vectorizer.fit_transform(X_train)

        # Sparsity = fraction of cells that are not stored (zero) entries.
        n_rows, n_cols = matrix.shape[0], matrix.shape[1]
        sparsity = 1 - matrix.nnz / (n_rows * n_cols)
        logger.info(f" Shape: {matrix.shape} | Sparsidad: {sparsity:.1%}")
        return matrix

    def transform(self, X):
        """Transform documents without refitting (for test/production)."""
        return self.vectorizer.transform(X)

    def get_feature_names(self):
        """Return the feature names reported by the underlying vectorizer."""
        return self.vectorizer.get_feature_names_out()

    @property
    def vocabulary_size(self) -> int:
        """Number of entries in the underlying vectorizer's ``vocabulary_``.

        NOTE: ``vocabulary_`` is only available after the vectorizer has
        been fitted (e.g. via ``fit_transform``).
        """
        return len(self.vectorizer.vocabulary_)