File size: 2,998 Bytes
ea6f215
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from abc import ABC, abstractmethod
from typing import Any, Dict, List, Optional, Tuple
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import RegressorMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_val_score
from src.core import setup_logger

# Module-level logger, created through the project's setup_logger helper.
logger = setup_logger(__name__)

# SHAP is an optional dependency: when the import fails the name is bound to
# None so that code further down can test `shap is None` instead of crashing
# at import time.
try:
    import shap
except ImportError:
    shap = None

# --- MODEL BUILDING ---

class ModelBuildingStrategy(ABC):
    """Abstract interface for objects that build and fit a regression model."""

    @abstractmethod
    def build_and_train_model(self, X_train: pd.DataFrame, y_train: pd.Series) -> RegressorMixin:
        """Fit a model on the training data and return the fitted estimator.

        Args:
            X_train: Feature matrix used for fitting.
            y_train: Target values corresponding to the rows of ``X_train``.

        Returns:
            The fitted estimator. Concrete subclasses must implement this.
        """

class XGBoostStrategy(ModelBuildingStrategy):
    """Strategy that fits a scaled XGBoost regressor on a log1p-transformed target."""

    def __init__(self, **params):
        """Store keyword arguments that are forwarded verbatim to ``XGBRegressor``."""
        self.params = params

    def build_and_train_model(self, X_train: pd.DataFrame, y_train: pd.Series) -> Pipeline:
        """Filter the training rows, fit the pipeline and return it.

        Rows are kept only when the target is strictly positive and, if the
        feature frame has an ``"Open"`` column, when that column equals 1
        (the filtering rule used for the Rossmann data). The model is fitted
        on ``log1p`` of the remaining targets, so its predictions are in log
        space.

        Args:
            X_train: Training features.
            y_train: Training targets.

        Returns:
            The fitted ``Pipeline`` (``StandardScaler`` followed by
            ``XGBRegressor``).
        """
        # Imported lazily so that xgboost is only required when this strategy runs.
        from xgboost import XGBRegressor

        logger.info("Building XGBoost model.")

        # Row selection: positive targets, restricted to open rows when known.
        keep_rows = y_train > 0
        if "Open" in X_train.columns:
            is_open = X_train["Open"] == 1
            keep_rows = keep_rows & is_open

        features = X_train[keep_rows]
        log_target = np.log1p(y_train[keep_rows])

        steps = [
            ("scaler", StandardScaler()),
            ("model", XGBRegressor(**self.params)),
        ]
        # Pipeline.fit returns the pipeline itself, so it can be returned directly.
        return Pipeline(steps).fit(features, log_target)

# --- EVALUATION ---

class ModelEvaluator:
    """Error metrics for models whose predictions are in ``log1p`` space."""

    @staticmethod
    def calculate_rmspe(y_true, y_pred):
        """Return the root mean squared percentage error, expressed in percent.

        Values are compared positionally. Only positions where
        ``y_true > 0`` contribute, which avoids division by zero.

        Args:
            y_true: Actual target values (array, Series, list or tuple).
            y_pred: Predicted values on the same scale and of the same length.

        Returns:
            ``sqrt(mean(((y_true - y_pred) / y_true) ** 2)) * 100`` over the
            positive targets, or ``nan`` when there are none.
        """
        # Coerce to arrays so plain Python sequences support the element-wise
        # comparison and boolean indexing below.
        actual = np.asarray(y_true)
        predicted = np.asarray(y_pred)

        positive = actual > 0
        if not positive.any():
            # No usable targets: return nan explicitly instead of relying on
            # np.mean of an empty slice (which emits a RuntimeWarning).
            return np.nan

        relative_error = (actual[positive] - predicted[positive]) / actual[positive]
        return np.sqrt(np.mean(relative_error ** 2)) * 100

    @staticmethod
    def evaluate(model, X_test, y_test):
        """Score ``model`` on a test set and return a dict of metrics.

        The model's predictions are passed through ``expm1`` before scoring,
        i.e. the model is expected to predict ``log1p`` of the target.

        Args:
            model: Fitted estimator exposing ``predict``.
            X_test: Test features passed to ``model.predict``.
            y_test: Actual targets on the original (non-log) scale.

        Returns:
            Dict with keys ``"MSE"``, ``"MAE"`` and ``"RMSPE"``.
        """
        # Flatten both sides so a single-column frame or (n, 1) prediction
        # array cannot broadcast into an (n, n) matrix in the RMSPE step.
        y_pred = np.ravel(np.expm1(model.predict(X_test)))
        y_true = np.ravel(np.asarray(y_test))

        return {
            "MSE": mean_squared_error(y_true, y_pred),
            "MAE": mean_absolute_error(y_true, y_pred),
            "RMSPE": ModelEvaluator.calculate_rmspe(y_true, y_pred),
        }

# --- EXPLAINABILITY ---

class ModelExplainer:
    """Holds a fitted model and its training data for explainability plots."""

    def __init__(self, model, X_train):
        """Store the model and training features; warn when SHAP is unavailable.

        Args:
            model: Fitted estimator, either bare or a pipeline with a
                ``'model'`` step.
            X_train: Training features kept on the instance.
        """
        self.model = model
        self.X_train = X_train
        if shap is None:
            logger.warning("SHAP not installed. Explainer will not function.")

    def plot_importance(self, X, save_path=None):
        """Draw a bar chart of the 20 largest feature importances.

        Args:
            X: Frame whose columns label the importances.
            save_path: Optional destination for the figure. When given, the
                figure is saved and closed; otherwise it is left open.

        Returns:
            All feature importances as a Series sorted in descending order.
        """
        # For a pipeline, the importances live on its final 'model' step.
        if hasattr(self.model, 'named_steps'):
            estimator = self.model.named_steps['model']
        else:
            estimator = self.model

        ranked = pd.Series(estimator.feature_importances_, index=X.columns)
        ranked = ranked.sort_values(ascending=False)

        plt.figure(figsize=(10, 6))
        top_features = ranked.head(20)
        top_features.plot(kind='bar')

        if not save_path:
            return ranked

        plt.savefig(save_path)
        plt.close()
        return ranked