File size: 10,649 Bytes
6cda091
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
"""
src/models/baseline.py

Modelos clásicos de ML para clasificación de texto.
Traducción directa de notebooks 04 y 05.

Todos los modelos siguen la misma interfaz:
    model.fit(X_train, y_train)
    model.predict(X)
    model.predict_proba(X)
    model.save(path)
    Model.load(path)

Uso desde el pipeline:
    model = build_model("lr", config_path="configs/models.yaml")
    model.fit(X_train_vec, y_train)
    preds = model.predict(X_test_vec)
"""

import yaml
import joblib
import numpy as np
from pathlib import Path
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold, cross_validate
from src.utils.logger import get_logger

logger = get_logger(__name__)


# ── Clase base ────────────────────────────────────────────────────────────────
class BaseSklearnModel:
    """
    Interfaz comΓΊn para todos los modelos sklearn del proyecto.
    Hereda LRModel y EnsembleModel.
    """

    def __init__(self):
        self.pipeline = None   # sklearn Pipeline (TF-IDF + clf)
        self.is_fitted = False

    def fit(self, X_train, y_train) -> "BaseSklearnModel":
        """Entrena el pipeline completo."""
        logger.info(f"Entrenando {self.__class__.__name__}...")
        self.pipeline.fit(X_train, y_train)
        self.is_fitted = True
        logger.info("  Entrenamiento completado")
        return self

    def predict(self, X) -> np.ndarray:
        self._check_fitted()
        return self.pipeline.predict(X)

    def predict_proba(self, X) -> np.ndarray:
        self._check_fitted()
        return self.pipeline.predict_proba(X)

    def cross_validate(self, X_train, y_train, cv_folds: int = 5, rand: int = 42) -> dict:
        """
        EvaluaciΓ³n con StratifiedKFold.
        Devuelve medias y desviaciones estΓ‘ndar de las mΓ©tricas.
        """
        cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=rand)
        results = cross_validate(
            self.pipeline, X_train, y_train,
            cv=cv,
            scoring={"f1": "f1_weighted", "roc_auc": "roc_auc"},
            return_train_score=True,
            n_jobs=-1,
        )
        summary = {
            "cv_f1_mean"    : results["test_f1"].mean(),
            "cv_f1_std"     : results["test_f1"].std(),
            "cv_roc_mean"   : results["test_roc_auc"].mean(),
            "train_f1_mean" : results["train_f1"].mean(),
            "gap_pp"        : (results["train_f1"].mean() - results["test_f1"].mean()) * 100,
        }
        logger.info(
            f"  CV F1: {summary['cv_f1_mean']:.4f} Β± {summary['cv_f1_std']:.4f} | "
            f"Gap: {summary['gap_pp']:.1f}pp"
        )
        return summary

    def save(self, path: str | Path) -> None:
        path = Path(path)
        path.parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(self.pipeline, path)
        logger.info(f"Modelo guardado: {path}")

    @classmethod
    def load(cls, path: str | Path) -> "BaseSklearnModel":
        path = Path(path)
        if not path.exists():
            raise FileNotFoundError(f"Modelo no encontrado: {path}")
        instance = cls.__new__(cls)
        instance.pipeline = joblib.load(path)
        instance.is_fitted = True
        logger.info(f"Modelo cargado: {path}")
        return instance

    def _check_fitted(self):
        if not self.is_fitted:
            raise RuntimeError("El modelo no estΓ‘ entrenado. Llama a .fit() primero.")


# ── Logistic Regression ────────────────────────────────────────────────────────
class LRModel(BaseSklearnModel):
    """
    Logistic Regression on top of TF-IDF features.

    Per the original project notes this was the best model (notebook 06):
        F1 test = 0.7579 | CV-test gap = 4.76pp
    with hyperparameters tuned by Optuna and stored in
    configs/best_params.yaml.

    Values found under the ``hyperparameters`` key of ``best_params_path``
    take priority over the defaults read from ``config_path`` and
    ``feat_config_path``.
    """

    def __init__(
        self,
        config_path: str = "configs/models.yaml",
        feat_config_path: str = "configs/features.yaml",
        best_params_path: str = "configs/best_params.yaml",
    ):
        """
        Build the (unfitted) TF-IDF + LogisticRegression pipeline.

        Args:
            config_path: YAML file providing ``models.logistic_regression``.
            feat_config_path: YAML file providing ``vectorization.tfidf``.
            best_params_path: optional YAML file with tuned values; when the
                file is missing the base configuration is used.

        Raises:
            FileNotFoundError: if ``config_path`` or ``feat_config_path`` is
                missing.
            KeyError: if a required configuration key is absent.
        """
        super().__init__()

        # Tuned hyperparameters are optional: a missing file is not an error.
        try:
            best = self._read_yaml(best_params_path)
        except FileNotFoundError:
            bp = {}
            logger.warning("best_params.yaml no encontrado — usando config por defecto")
        else:
            # An empty file, or ``hyperparameters: null``, yields None; treat
            # both as "no overrides" instead of failing on ``None.get``.
            bp = (best or {}).get("hyperparameters") or {}
            logger.info("Parámetros cargados desde best_params.yaml")

        # Base configuration
        mod_cfg = self._read_yaml(config_path)["models"]["logistic_regression"]
        vec_cfg = self._read_yaml(feat_config_path)["vectorization"]["tfidf"]

        # Priority: best_params > yaml config.
        # ngram_range defaults to "1_2" when best_params does not provide it.
        ngram = self._parse_ngram_range(bp.get("ngram_range", "1_2"))
        c_value = bp.get("C", mod_cfg["C"])

        self.pipeline = Pipeline([
            ("tfidf", TfidfVectorizer(
                max_features  = bp.get("max_features", vec_cfg["max_features"]),
                ngram_range   = ngram,
                sublinear_tf  = bp.get("sublinear_tf", vec_cfg["sublinear_tf"]),
                min_df        = bp.get("min_df", vec_cfg["min_df"]),
                analyzer      = "word",
                strip_accents = "unicode",
            )),
            ("clf", LogisticRegression(
                C            = c_value,
                max_iter     = mod_cfg["max_iter"],
                class_weight = mod_cfg["class_weight"],
                solver       = mod_cfg["solver"],
                random_state = 42,
            )),
        ])
        logger.info(f"LRModel creado — C={c_value:.4f} | ngram={ngram}")

    @staticmethod
    def _read_yaml(path):
        """Parse a YAML file, decoding it explicitly as UTF-8."""
        with open(path, encoding="utf-8") as fh:
            return yaml.safe_load(fh)

    @staticmethod
    def _parse_ngram_range(value, default=(1, 2)):
        """
        Turn an ``ngram_range`` setting into a ``(min_n, max_n)`` tuple.

        Accepts a "min_max" string such as "1_1" or "1_3", or a two-item
        list/tuple. Anything that cannot be read as ``1 <= min_n <= max_n``
        falls back to ``default`` with a warning.
        """
        parts = value if isinstance(value, (list, tuple)) else str(value).split("_")
        try:
            low, high = (int(p) for p in parts)
        except (TypeError, ValueError):
            low = high = 0   # forces the fallback branch below
        if not 1 <= low <= high:
            logger.warning(f"ngram_range no reconocido ({value!r}) — usando {default}")
            return default
        return (low, high)


# ── Random Forest ──────────────────────────────────────────────────────────────
class RFModel(BaseSklearnModel):
    """
    Random Forest on top of TF-IDF features.

    Classifier parameters come from ``models.random_forest`` in
    ``config_path``; vectorizer parameters come from ``vectorization.tfidf``
    in ``feat_config_path``.
    """

    def __init__(
        self,
        config_path: str = "configs/models.yaml",
        feat_config_path: str = "configs/features.yaml",
    ):
        """
        Build the (unfitted) TF-IDF + RandomForestClassifier pipeline.

        Raises:
            FileNotFoundError: if either configuration file is missing.
            KeyError: if a required configuration key is absent.
        """
        super().__init__()

        # Decode explicitly as UTF-8 so parsing does not depend on the
        # platform's default locale encoding.
        with open(config_path, encoding="utf-8") as f:
            rf_cfg  = yaml.safe_load(f)["models"]["random_forest"]
        with open(feat_config_path, encoding="utf-8") as f:
            vec_cfg = yaml.safe_load(f)["vectorization"]["tfidf"]

        self.pipeline = Pipeline([
            ("tfidf", TfidfVectorizer(
                max_features  = vec_cfg["max_features"],
                ngram_range   = (1, 1),   # unigrams only: RF + bigrams is very slow
                sublinear_tf  = vec_cfg["sublinear_tf"],
                min_df        = vec_cfg["min_df"],
                analyzer      = "word",
                strip_accents = "unicode",
            )),
            ("clf", RandomForestClassifier(
                n_estimators     = rf_cfg["n_estimators"],
                # max_depth and min_samples_leaf are optional in the config.
                max_depth        = rf_cfg.get("max_depth", 8),
                min_samples_leaf = rf_cfg.get("min_samples_leaf", 4),
                max_features     = "sqrt",
                class_weight     = rf_cfg["class_weight"],
                random_state     = 42,
                n_jobs           = -1,
            )),
        ])
        logger.info("RFModel creado")


# ── XGBoost ───────────────────────────────────────────────────────────────────
class XGBModel(BaseSklearnModel):
    """
    XGBoost on top of TF-IDF features.

    Requires the optional dependency: pip install xgboost
    """

    def __init__(
        self,
        config_path: str = "configs/models.yaml",
        feat_config_path: str = "configs/features.yaml",
    ):
        """
        Build the (unfitted) TF-IDF + XGBClassifier pipeline.

        Raises:
            ImportError: if the ``xgboost`` package is not installed.
            FileNotFoundError: if either configuration file is missing.
            KeyError: if a required configuration key is absent.
        """
        super().__init__()

        # Imported lazily so the module stays importable without xgboost.
        try:
            from xgboost import XGBClassifier
        except ImportError as err:
            # Chain the original error so the underlying cause stays visible.
            raise ImportError("Instala XGBoost: pip install xgboost") from err

        # Decode explicitly as UTF-8 so parsing does not depend on the
        # platform's default locale encoding.
        with open(config_path, encoding="utf-8") as f:
            xgb_cfg = yaml.safe_load(f)["models"]["xgboost"]
        with open(feat_config_path, encoding="utf-8") as f:
            vec_cfg = yaml.safe_load(f)["vectorization"]["tfidf"]

        self.pipeline = Pipeline([
            ("tfidf", TfidfVectorizer(
                max_features  = vec_cfg["max_features"],
                ngram_range   = (1, 1),
                sublinear_tf  = True,   # hard-coded here, not read from vec_cfg
                min_df        = vec_cfg["min_df"],
                analyzer      = "word",
                strip_accents = "unicode",
            )),
            ("clf", XGBClassifier(
                # Every classifier setting is optional in the config.
                n_estimators     = xgb_cfg.get("n_estimators", 200),
                max_depth        = xgb_cfg.get("max_depth", 3),
                learning_rate    = xgb_cfg.get("learning_rate", 0.05),
                subsample        = xgb_cfg.get("subsample", 0.8),
                colsample_bytree = xgb_cfg.get("colsample_bytree", 0.8),
                # NOTE(review): this flag appears to be deprecated in recent
                # XGBoost releases; confirm against the pinned version.
                use_label_encoder= False,
                eval_metric      = "logloss",
                random_state     = 42,
                verbosity        = 0,
            )),
        ])
        logger.info("XGBModel creado")


# ── Factory ───────────────────────────────────────────────────────────────────
def build_model(
    model_type: str,
    config_path: str = "configs/models.yaml",
    feat_config_path: str = "configs/features.yaml",
    best_params_path: str = "configs/best_params.yaml",
) -> BaseSklearnModel:
    """
    Factory returning an unfitted model for the requested type.

    Args:
        model_type: "lr" | "rf" | "xgboost"
        config_path: model configuration YAML, forwarded to the model class.
        feat_config_path: feature configuration YAML, forwarded to the model
            class.
        best_params_path: tuned-parameter YAML; only forwarded to LRModel.

    Returns:
        Model instance ready for .fit()

    Raises:
        ValueError: if ``model_type`` is not one of the supported names.
    """
    # Insertion-ordered mapping: used for the membership test and to list
    # the valid names in the error message.
    supported = dict.fromkeys(("lr", "rf", "xgboost"))
    if model_type not in supported:
        raise ValueError(f"model_type debe ser uno de: {list(supported)}")

    logger.info(f"Construyendo modelo: {model_type}")

    # Only the selected class is instantiated.
    if model_type == "lr":
        return LRModel(config_path, feat_config_path, best_params_path)
    if model_type == "rf":
        return RFModel(config_path, feat_config_path)
    return XGBModel(config_path, feat_config_path)