Spaces:

devrup404
/

SignalMod

Running

SignalMod / src /features /vectorizer.py

JonnyBP

backup stable api and model service before pipeline testing

6cda091 7 days ago

2.81 kB

	"""
	src/features/vectorizer.py

	Vectorizador configurable desde YAML.
	Traducción directa del notebook 03 a código de producción.

	Decisión del proyecto: TF-IDF con ngram=(1,2) y max_features=5000.
	Justificación:
	- Bigramas capturan contexto: 'black thug' es distinto a 'black' solo
	- max_features=5000 equilibra vocabulario vs overfitting (800 muestras train)
	- sublinear_tf=True evita que repetir una palabra infle artificialmente su peso

	Uso:
	vec = Vectorizer()
	X_train_vec = vec.fit_transform(X_train_text)
	X_test_vec = vec.transform(X_test_text)
	"""

	import yaml
	from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
	from src.utils.logger import get_logger

	logger = get_logger(__name__)


	class Vectorizer:
	    """Configurable wrapper around scikit-learn's TfidfVectorizer / CountVectorizer.

	    The vectorization method and its hyper-parameters are read from the
	    ``vectorization`` section of a YAML file (``configs/features.yaml`` by
	    default).

	    Critical rule: fit ONLY on the training split, then ``transform`` both
	    train and test. Fitting on the whole dataset before the split leaks
	    information from the test set into training (data leakage).

	    Attributes:
	        method: Name of the selected method; also the key of the YAML
	            sub-section the hyper-parameters were read from.
	        vectorizer: The underlying scikit-learn vectorizer instance.
	    """

	    def __init__(self, config_path: str = "configs/features.yaml", method: str = None):
	        """Build the underlying vectorizer from the YAML configuration.

	        Args:
	            config_path: Path to a YAML file whose top-level ``vectorization``
	                mapping holds one sub-section per method.
	            method: Method to use. When falsy (``None`` or ``""``), the
	                ``method`` entry of the config is used, defaulting to
	                ``"tfidf"``. ``"tfidf"`` builds a ``TfidfVectorizer``; any
	                other name builds a ``CountVectorizer``.

	        Raises:
	            KeyError: If ``vectorization``, the section for the selected
	                method, or a required entry (``max_features``,
	                ``ngram_range``) is missing from the config.
	            OSError: If the config file cannot be opened.
	        """
	        # Decode explicitly as UTF-8 instead of the platform's locale default,
	        # so the same config file parses identically on every OS.
	        with open(config_path, encoding="utf-8") as f:
	            cfg = yaml.safe_load(f)["vectorization"]

	        self.method = method or cfg.get("method", "tfidf")
	        params = cfg[self.method]

	        # Options common to both vectorizer types, built once so the two
	        # branches cannot drift apart.
	        shared_kwargs = {
	            "max_features": params["max_features"],
	            # YAML yields a list; scikit-learn expects a (min_n, max_n) tuple.
	            "ngram_range": tuple(params["ngram_range"]),
	            "min_df": params.get("min_df", 3),
	            "analyzer": "word",
	            "strip_accents": "unicode",
	        }

	        if self.method == "tfidf":
	            self.vectorizer = TfidfVectorizer(
	                sublinear_tf=params.get("sublinear_tf", True),
	                **shared_kwargs,
	            )
	        else:
	            self.vectorizer = CountVectorizer(**shared_kwargs)

	        # Plain "|" separators: the previous "\|" was an invalid escape
	        # sequence (SyntaxWarning on Python 3.12+) and logged a stray backslash.
	        logger.info(
	            f"Vectorizer: {self.method} | max_features={params['max_features']} "
	            f"| ngram={params['ngram_range']}"
	        )

	    def fit_transform(self, X_train):
	        """Learn the vocabulary from the training texts and vectorize them.

	        Args:
	            X_train: Training documents, passed unchanged to the underlying
	                vectorizer's ``fit_transform``.

	        Returns:
	            The document-term matrix produced by the underlying vectorizer.
	        """
	        logger.info("Vectorizando train set...")
	        matrix = self.vectorizer.fit_transform(X_train)

	        # Sparsity = share of cells that are zero (1 - stored / total cells).
	        n_rows, n_cols = matrix.shape
	        sparsity = 1 - matrix.nnz / (n_rows * n_cols)
	        logger.info(f" Shape: {matrix.shape} | Sparsidad: {sparsity:.1%}")
	        return matrix

	    def transform(self, X):
	        """Vectorize documents without refitting (for test / production data).

	        Args:
	            X: Documents, passed unchanged to the underlying vectorizer's
	                ``transform``.

	        Returns:
	            Whatever the underlying vectorizer's ``transform`` returns.
	        """
	        return self.vectorizer.transform(X)

	    def get_feature_names(self):
	        """Return the underlying vectorizer's ``get_feature_names_out()`` result."""
	        return self.vectorizer.get_feature_names_out()

	    @property
	    def vocabulary_size(self) -> int:
	        """Number of entries in the underlying vectorizer's ``vocabulary_``.

	        scikit-learn populates ``vocabulary_`` during fitting, so read this
	        only after ``fit_transform`` has been called.
	        """
	        return len(self.vectorizer.vocabulary_)