| """ |
| src/features/text_preprocessor.py |
| |
| Pipeline de preprocesamiento NLP. |
| Traducción directa del notebook 02 a código de producción. |
| |
| Pasos: |
| 1. Lowercase |
| 2. Regex: URLs, @menciones, \\xa0, apóstrofes, números |
| 3. spaCy: lematización (en_core_web_sm) |
| 4. NLTK: filtrado stopwords english + custom |
| |
| Uso: |
| preprocessor = TextPreprocessor() |
| clean_series = preprocessor.transform(df["Text"]) |
| clean_text = preprocessor.transform("texto crudo aqui") |
| """ |
|
|
| import re |
| import yaml |
| import nltk |
| import spacy |
| import pandas as pd |
| from pathlib import Path |
| from nltk.corpus import stopwords |
| from src.utils.logger import get_logger |
|
|
logger = get_logger(__name__)


# NLTK data packages this module asks for, mapped to the path under which
# nltk.data.find() locates them once they are installed.
_NLTK_RESOURCES = {
    "stopwords": "corpora/stopwords",
    "punkt": "tokenizers/punkt",
}

# Only download what is missing: an unconditional nltk.download() contacts the
# NLTK index on every import, which is slow and noisy on offline machines.
# NOTE(review): unlike the unconditional call, this no longer refreshes a
# stale local copy of a package -- confirm that is acceptable.
for resource, _resource_path in _NLTK_RESOURCES.items():
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(resource, quiet=True)
|
|
|
|
| class TextPreprocessor: |
| """ |
| Pipeline NLP para hate speech detection. |
| Lee su configuración de configs/features.yaml. |
| """ |
|
|
| |
| |
| CUSTOM_STOPWORDS = { |
| "youtube", "video", "watch", "like", "comment", |
| "channel", "click", "subscribe", "link", |
| } |
|
|
| def __init__(self, config_path: str = "configs/features.yaml"): |
| |
| with open(config_path) as f: |
| cfg = yaml.safe_load(f)["preprocessing"] |
| self.cfg = cfg |
|
|
| |
| self.stop_words = set(stopwords.words("english")) | self.CUSTOM_STOPWORDS |
| self.min_len = cfg.get("min_token_length", 2) |
|
|
| |
| |
| self.nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"]) |
| logger.info(f"TextPreprocessor iniciado — spaCy {self.nlp.meta['version']}") |
|
|
| |
|
|
| def _lowercase(self, text: str) -> str: |
| """Paso 1: minúsculas. 'BLACK' y 'black' son la misma feature.""" |
| return str(text).lower() |
|
|
| def _clean_regex(self, text: str) -> str: |
| """ |
| Paso 2: elimina ruido estructural con regex. |
| Orden importante: primero lo más específico, luego lo general. |
| """ |
| text = re.sub(r"http\S+|www\.\S+", "", text) |
| text = re.sub(r"@\w+", "", text) |
| text = re.sub(r"[\n\t\r]", " ", text) |
| text = re.sub(r"[^\x00-\x7F]+", " ", text) |
| text = re.sub(r"'", "", text) |
| text = re.sub(r"\b\d+\b", "", text) |
| text = re.sub(r"\s+", " ", text) |
| return text.strip() |
|
|
| def _lemmatize(self, text: str) -> str: |
| """ |
| Paso 3+4: lematización con spaCy + filtrado de stopwords con NLTK. |
| |
| Por qué spaCy para lematizar: |
| Entiende gramática: 'running'→'run', 'cops'→'cop' |
| Un stemmer de NLTK simplemente corta: 'running'→'runn' |
| |
| Por qué NLTK para stopwords: |
| Lista curada de 179 palabras funcionales. |
| Más fácil de personalizar que la lista interna de spaCy. |
| |
| DECISIÓN del EDA: NO eliminar 'black','white','police','cop' |
| → Aparecen en ambas clases con contexto distinto. |
| El modelo necesita verlas para aprender por bigrams. |
| """ |
| doc = self.nlp(text) |
| tokens = [ |
| token.lemma_ |
| for token in doc |
| if not token.is_punct |
| and not token.is_space |
| and len(token.text) >= self.min_len |
| and token.lemma_ not in self.stop_words |
| ] |
| return " ".join(tokens) |
|
|
| def _transform_one(self, text: str) -> str: |
| text = self._lowercase(text) |
| text = self._clean_regex(text) |
| text = self._lemmatize(text) |
| return text |
|
|
| |
|
|
| def transform(self, data) -> str | pd.Series: |
| """ |
| Preprocesa un texto o una Serie completa. |
| |
| Args: |
| data: str o pd.Series con textos crudos. |
| |
| Returns: |
| str o pd.Series con textos limpios y lematizados. |
| """ |
| if isinstance(data, pd.Series): |
| logger.info(f"Preprocesando {len(data)} textos...") |
| result = data.apply(self._transform_one) |
| empty = (result == "").sum() |
| if empty > 0: |
| logger.warning(f" {empty} textos quedaron vacíos tras limpieza") |
| return result |
| return self._transform_one(data) |