Spaces:

devrup404
/

SignalMod

Running

File size: 10,649 Bytes

6cda091

"""
src/models/baseline.py

Modelos clásicos de ML para clasificación de texto.
Traducción directa de notebooks 04 y 05.

Todos los modelos siguen la misma interfaz:
    model.fit(X_train, y_train)
    model.predict(X)
    model.predict_proba(X)
    model.save(path)
    Model.load(path)

Uso desde el pipeline:
    model = build_model("lr", config_path="configs/models.yaml")
    model.fit(X_train_vec, y_train)
    preds = model.predict(X_test_vec)
"""

import yaml
import joblib
import numpy as np
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, cross_validate
from src.utils.logger import get_logger

logger = get_logger(__name__)


# ── Clase base ────────────────────────────────────────────────────────────────
class BaseSklearnModel:
    """
    Interfaz común para todos los modelos sklearn del proyecto.
    Hereda LRModel y EnsembleModel.
    """

    def __init__(self):
        self.pipeline = None   # sklearn Pipeline (TF-IDF + clf)
        self.is_fitted = False

    def fit(self, X_train, y_train) -> "BaseSklearnModel":
        """Entrena el pipeline completo."""
        logger.info(f"Entrenando {self.__class__.__name__}...")
        self.pipeline.fit(X_train, y_train)
        self.is_fitted = True
        logger.info("  Entrenamiento completado")
        return self

    def predict(self, X) -> np.ndarray:
        self._check_fitted()
        return self.pipeline.predict(X)

    def predict_proba(self, X) -> np.ndarray:
        self._check_fitted()
        return self.pipeline.predict_proba(X)

    def cross_validate(self, X_train, y_train, cv_folds: int = 5, rand: int = 42) -> dict:
        """
        Evaluación con StratifiedKFold.
        Devuelve medias y desviaciones estándar de las métricas.
        """
        cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=rand)
        results = cross_validate(
            self.pipeline, X_train, y_train,
            cv=cv,
            scoring={"f1": "f1_weighted", "roc_auc": "roc_auc"},
            return_train_score=True,
            n_jobs=-1,
        )
        summary = {
            "cv_f1_mean"    : results["test_f1"].mean(),
            "cv_f1_std"     : results["test_f1"].std(),
            "cv_roc_mean"   : results["test_roc_auc"].mean(),
            "train_f1_mean" : results["train_f1"].mean(),
            "gap_pp"        : (results["train_f1"].mean() - results["test_f1"].mean()) * 100,
        }
        logger.info(
            f"  CV F1: {summary['cv_f1_mean']:.4f} ± {summary['cv_f1_std']:.4f} | "
            f"Gap: {summary['gap_pp']:.1f}pp"
        )
        return summary

    def save(self, path: str | Path) -> None:
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(self.pipeline, path)
        logger.info(f"Modelo guardado: {path}")

    @classmethod
    def load(cls, path: str | Path) -> "BaseSklearnModel":
        path = Path(path)
        if not path.exists():
            raise FileNotFoundError(f"Modelo no encontrado: {path}")
        instance = cls.__new__(cls)
        instance.pipeline = joblib.load(path)
        instance.is_fitted = True
        logger.info(f"Modelo cargado: {path}")
        return instance

    def _check_fitted(self):
        if not self.is_fitted:
            raise RuntimeError("El modelo no está entrenado. Llama a .fit() primero.")


# ── Logistic Regression ────────────────────────────────────────────────────────
class LRModel(BaseSklearnModel):
    """
    Logistic Regression + TF-IDF.

    Best model of the project (notebook 06):
        F1 test = 0.7579 | CV-test gap = 4.76pp
    Hyperparameters tuned with Optuna are read from configs/best_params.yaml
    and take precedence over the values in the YAML configs.
    """

    def __init__(
        self,
        config_path: str = "configs/models.yaml",
        feat_config_path: str = "configs/features.yaml",
        best_params_path: str = "configs/best_params.yaml",
    ):
        """
        Build the TF-IDF + LogisticRegression pipeline.

        Args:
            config_path: YAML file holding ``models.logistic_regression``.
            feat_config_path: YAML file holding ``vectorization.tfidf``.
            best_params_path: optional YAML file whose ``hyperparameters``
                mapping overrides the defaults; a missing file is not an error.

        Raises:
            FileNotFoundError: if either of the two base config files is missing.
            KeyError: if a required configuration key is absent.
        """
        super().__init__()

        # Optional Optuna output: only the file access is guarded, so errors
        # in the rest of the constructor are never mistaken for a missing file.
        try:
            with open(best_params_path, encoding="utf-8") as f:
                best = yaml.safe_load(f)
        except FileNotFoundError:
            bp = {}
            logger.warning("best_params.yaml no encontrado — usando config por defecto")
        else:
            # An empty file parses to None, and the key may be absent or null.
            hyper = best.get("hyperparameters") if isinstance(best, dict) else None
            bp = hyper if isinstance(hyper, dict) else {}
            logger.info("Parámetros cargados desde best_params.yaml")

        # Base configuration
        with open(config_path, encoding="utf-8") as f:
            mod_cfg = yaml.safe_load(f)["models"]["logistic_regression"]
        with open(feat_config_path, encoding="utf-8") as f:
            vec_cfg = yaml.safe_load(f)["vectorization"]["tfidf"]

        def _pick(key, fallback_cfg):
            # Priority: best_params > yaml config. The YAML value is only
            # looked up when best_params does not provide the key.
            return bp[key] if key in bp else fallback_cfg[key]

        ngram   = self._parse_ngram_range(bp.get("ngram_range"))
        c_value = _pick("C", mod_cfg)

        self.pipeline = Pipeline([
            ("tfidf", TfidfVectorizer(
                max_features  = _pick("max_features", vec_cfg),
                ngram_range   = ngram,
                sublinear_tf  = _pick("sublinear_tf", vec_cfg),
                min_df        = _pick("min_df", vec_cfg),
                analyzer      = "word",
                strip_accents = "unicode",
            )),
            ("clf", LogisticRegression(
                C            = c_value,
                max_iter     = mod_cfg["max_iter"],
                class_weight = mod_cfg["class_weight"],
                solver       = mod_cfg["solver"],
                random_state = 42,
            )),
        ])
        logger.info(f"LRModel creado — C={c_value:.4f} | ngram={ngram}")

    @staticmethod
    def _parse_ngram_range(value, default=(1, 2)):
        """
        Convert an n-gram spec into a ``(min_n, max_n)`` tuple.

        Accepts strings such as ``"1_2"`` and two-item lists or tuples such
        as ``[1, 2]``. Anything missing or malformed yields ``default``.
        """
        if isinstance(value, str):
            parts = value.split("_")
        elif isinstance(value, (list, tuple)):
            parts = list(value)
        else:
            return default
        try:
            low, high = (int(p) for p in parts)
        except (TypeError, ValueError):
            return default
        if low < 1 or high < low:
            return default
        return (low, high)


# ── Random Forest ──────────────────────────────────────────────────────────────
class RFModel(BaseSklearnModel):
    """
    Random Forest + TF-IDF.
    Parameters come from configs/models.yaml.
    """

    def __init__(
        self,
        config_path: str = "configs/models.yaml",
        feat_config_path: str = "configs/features.yaml",
    ):
        """
        Build the TF-IDF + RandomForestClassifier pipeline.

        Args:
            config_path: YAML file holding ``models.random_forest``.
            feat_config_path: YAML file holding ``vectorization.tfidf``.

        Raises:
            FileNotFoundError: if a config file is missing.
            KeyError: if a required configuration key is absent.
        """
        super().__init__()

        # YAML is read as UTF-8 explicitly instead of relying on the
        # platform's locale encoding.
        with open(config_path, encoding="utf-8") as f:
            rf_cfg  = yaml.safe_load(f)["models"]["random_forest"]
        with open(feat_config_path, encoding="utf-8") as f:
            vec_cfg = yaml.safe_load(f)["vectorization"]["tfidf"]

        self.pipeline = Pipeline([
            ("tfidf", TfidfVectorizer(
                max_features  = vec_cfg["max_features"],
                ngram_range   = (1, 1),   # RF + bigrams is very slow
                sublinear_tf  = vec_cfg["sublinear_tf"],
                min_df        = vec_cfg["min_df"],
                analyzer      = "word",
                strip_accents = "unicode",
            )),
            ("clf", RandomForestClassifier(
                n_estimators     = rf_cfg["n_estimators"],
                # max_depth and min_samples_leaf are optional in the YAML.
                max_depth        = rf_cfg.get("max_depth", 8),
                min_samples_leaf = rf_cfg.get("min_samples_leaf", 4),
                max_features     = "sqrt",
                class_weight     = rf_cfg["class_weight"],
                random_state     = 42,
                n_jobs           = -1,
            )),
        ])
        logger.info("RFModel creado")


# ── XGBoost ───────────────────────────────────────────────────────────────────
class XGBModel(BaseSklearnModel):
    """
    XGBoost + TF-IDF.
    Requires: pip install xgboost
    """

    def __init__(
        self,
        config_path: str = "configs/models.yaml",
        feat_config_path: str = "configs/features.yaml",
    ):
        """
        Build the TF-IDF + XGBClassifier pipeline.

        Args:
            config_path: YAML file holding ``models.xgboost``.
            feat_config_path: YAML file holding ``vectorization.tfidf``.

        Raises:
            ImportError: if the optional ``xgboost`` package is not installed.
            FileNotFoundError: if a config file is missing.
            KeyError: if a required configuration key is absent.
        """
        super().__init__()

        # xgboost is an optional dependency, so it is imported lazily here.
        try:
            from xgboost import XGBClassifier
        except ImportError as exc:
            # Chain the original error so the real cause stays in the traceback.
            raise ImportError("Instala XGBoost: pip install xgboost") from exc

        # YAML is read as UTF-8 explicitly instead of relying on the
        # platform's locale encoding.
        with open(config_path, encoding="utf-8") as f:
            xgb_cfg = yaml.safe_load(f)["models"]["xgboost"]
        with open(feat_config_path, encoding="utf-8") as f:
            vec_cfg = yaml.safe_load(f)["vectorization"]["tfidf"]

        self.pipeline = Pipeline([
            ("tfidf", TfidfVectorizer(
                max_features  = vec_cfg["max_features"],
                ngram_range   = (1, 1),
                # NOTE(review): hard-coded here, whereas the other models in
                # this file read sublinear_tf from the config — confirm intent.
                sublinear_tf  = True,
                min_df        = vec_cfg["min_df"],
                analyzer      = "word",
                strip_accents = "unicode",
            )),
            ("clf", XGBClassifier(
                # Every hyperparameter is optional in the YAML.
                n_estimators     = xgb_cfg.get("n_estimators", 200),
                max_depth        = xgb_cfg.get("max_depth", 3),
                learning_rate    = xgb_cfg.get("learning_rate", 0.05),
                subsample        = xgb_cfg.get("subsample", 0.8),
                colsample_bytree = xgb_cfg.get("colsample_bytree", 0.8),
                # NOTE(review): use_label_encoder is deprecated in recent
                # xgboost releases — confirm against the pinned version
                # before removing it.
                use_label_encoder= False,
                eval_metric      = "logloss",
                random_state     = 42,
                verbosity        = 0,
            )),
        ])
        logger.info("XGBModel creado")


# ── Factory ───────────────────────────────────────────────────────────────────
def build_model(
    model_type: str,
    config_path: str = "configs/models.yaml",
    feat_config_path: str = "configs/features.yaml",
    best_params_path: str = "configs/best_params.yaml",
) -> BaseSklearnModel:
    """
    Create the model wrapper registered under ``model_type``.

    Args:
        model_type: "lr" | "rf" | "xgboost"
        config_path: models YAML forwarded to the chosen model.
        feat_config_path: features YAML forwarded to the chosen model.
        best_params_path: tuned-hyperparameter YAML, forwarded to LRModel only.

    Returns:
        An unfitted model instance, ready for .fit()

    Raises:
        ValueError: if ``model_type`` is not one of the supported keys.
    """
    # Factories are plain closures so that nothing is constructed (and no
    # config file is read) until the requested type has been validated.
    def _make_lr():
        return LRModel(config_path, feat_config_path, best_params_path)

    def _make_rf():
        return RFModel(config_path, feat_config_path)

    def _make_xgb():
        return XGBModel(config_path, feat_config_path)

    factories = {"lr": _make_lr, "rf": _make_rf, "xgboost": _make_xgb}

    factory = factories.get(model_type)
    if factory is None:
        raise ValueError(f"model_type debe ser uno de: {list(factories)}")

    logger.info(f"Construyendo modelo: {model_type}")
    return factory()