import mlflow.pyfunc
import numpy as np
import pandas as pd
import shap
from catboost import CatBoostRegressor

from src.features.build_features import EPCFeatureEngineer, SAPTables


class EpcEnergyPipeline(mlflow.pyfunc.PythonModel):
    """
    MLflow-wrapped EPC energy model:
    - loads CatBoost model
    - computes SAP-aligned features
    - predicts energy / CO2 / EPC score

    The heavy resources (CatBoost model, SAP tables, feature engineer and
    SHAP explainer) are created in ``load_context``; until that has run the
    instance only holds the fallback locations given to the constructor.
    """

    def __init__(self, cb_model_path=None, sap_tables=None):
        """Store fallback resource locations; nothing is loaded here.

        Args:
            cb_model_path: Location handed to ``CatBoostRegressor.load_model``
                when the MLflow context has no ``"catboost_model"`` artifact.
            sap_tables: Location handed to ``SAPTables.from_local_dir`` when
                the MLflow context has no ``"sap_tables"`` artifact.
        """
        self.cb_model_path = cb_model_path
        self.sap_tables = sap_tables
        # Populated by load_context(); declared here so the "not loaded yet"
        # state is explicit instead of surfacing as a missing attribute.
        self.model = None
        self.sap = None
        self.feature_engineer = None
        self.explainer = None

    def load_context(self, context):
        """Load the model, SAP tables, feature engineer and SHAP explainer.

        Entries in ``context.artifacts`` (``"catboost_model"`` and
        ``"sap_tables"``) are used when present; otherwise the values passed
        to the constructor are used.

        Args:
            context: MLflow model context exposing an ``artifacts`` mapping.
        """
        model_path = context.artifacts.get("catboost_model", self.cb_model_path)
        sap_dir = context.artifacts.get("sap_tables", self.sap_tables)

        self.model = CatBoostRegressor()
        self.model.load_model(model_path)

        self.sap = SAPTables.from_local_dir(sap_dir)
        self.feature_engineer = EPCFeatureEngineer(self.sap)
        self.explainer = shap.TreeExplainer(self.model)

    @staticmethod
    def _as_frame(model_input) -> pd.DataFrame:
        """Return ``model_input`` as-is if it is a DataFrame, else wrap it in one."""
        if isinstance(model_input, pd.DataFrame):
            return model_input
        return pd.DataFrame(model_input)

    def build_features(self, model_input):
        """Run the feature engineer's ``transform`` on ``model_input``.

        Returns:
            Whatever ``EPCFeatureEngineer.transform`` returns; the rest of
            this class treats it as a DataFrame (``.columns``, ``.to_dict``).
        """
        return self.feature_engineer.transform(model_input)

    def predict(self, context, model_input, params=None):
        """Predict from raw input rows.

        Args:
            context: MLflow model context (unused here).
            model_input: A DataFrame, or anything ``pd.DataFrame`` accepts.
            params: Optional inference parameters; accepted for compatibility
                with the current MLflow ``PythonModel.predict`` signature and
                not used.

        Returns:
            ``np.expm1`` applied to the CatBoost model's raw predictions.
        """
        enriched = self.build_features(self._as_frame(model_input))
        return np.expm1(self.model.predict(enriched))

    def explain_predictions(self, model_input):
        """Compute SHAP explanations for ``model_input``.

        Args:
            model_input: A DataFrame, or anything ``pd.DataFrame`` accepts.

        Returns:
            dict with keys:
            - ``"prediction"``: raw model output for the FIRST row only.
            - ``"base_value"``: the explainer's expected value as a float.
            - ``"shap_values"``: SHAP values for all rows, as nested lists.
            - ``"feature_names"``: column names of the engineered features.
            - ``"data"``: engineered features as a list of row dicts.

        Raises:
            RuntimeError: If ``load_context`` has not been called yet.
        """
        if getattr(self, "explainer", None) is None:
            raise RuntimeError(
                "EpcEnergyPipeline.load_context() must be called before "
                "explain_predictions()."
            )

        features = self.build_features(self._as_frame(model_input))
        explanation = self.explainer(features)
        raw_preds = self.model.predict(features)

        # expected_value can be a Python/NumPy scalar or a 1-element array
        # depending on the SHAP version; .item() handles both without relying
        # on NumPy's deprecated ndim>0 array-to-float conversion.
        base_value = float(np.asarray(self.explainer.expected_value).item())

        # NOTE(review): unlike predict(), no np.expm1 is applied here, so
        # "prediction" is the raw model output. That looks intentional (it is
        # the same space as base_value / shap_values) -- confirm with callers.
        # NOTE(review): only row 0's prediction is returned even though the
        # SHAP values and data cover every input row -- confirm intended.
        return {
            "prediction": float(raw_preds[0]),
            "base_value": base_value,
            "shap_values": explanation.values.tolist(),
            "feature_names": features.columns.tolist(),
            "data": features.to_dict(orient="records"),
        }