# Source path: epc_only_data_model/src/models/EpcEnergyPipeline.py
# Repository page header: zcemg08 - "first code" - commit d11b44e
import mlflow.pyfunc
import pandas as pd
import shap
from catboost import CatBoostRegressor
import numpy as np
from src.features.build_features import SAPTables, EPCFeatureEngineer
class EpcEnergyPipeline(mlflow.pyfunc.PythonModel):
    """
    MLflow pyfunc wrapper around a CatBoost EPC energy model.

    Responsibilities:
    - load the CatBoost regressor and the SAP tables in ``load_context``
    - turn raw input into model features via ``EPCFeatureEngineer``
    - predict energy / CO2 / EPC score
    - expose SHAP explanations through ``explain_predictions``
    """

    def __init__(self, cb_model_path=None, sap_tables=None):
        """
        Store fallback locations; nothing is loaded here.

        Args:
            cb_model_path: Used as the CatBoost model path when the MLflow
                context has no ``"catboost_model"`` artifact.
            sap_tables: Used as the SAP tables directory when the MLflow
                context has no ``"sap_tables"`` artifact.
        """
        self.cb_model_path = cb_model_path
        self.sap_tables = sap_tables
        # Replaced by a shap.TreeExplainer once load_context has run.
        self.explainer = None

    def load_context(self, context):
        """
        Load the model, SAP tables, feature engineer and SHAP explainer.

        Artifact entries in ``context.artifacts`` take precedence; the
        constructor arguments are only used when a key is absent.
        """
        artifacts = context.artifacts
        catboost_file = artifacts.get("catboost_model", self.cb_model_path)
        sap_location = artifacts.get("sap_tables", self.sap_tables)

        self.model = CatBoostRegressor()
        self.model.load_model(catboost_file)

        self.sap = SAPTables.from_local_dir(sap_location)
        self.feature_engineer = EPCFeatureEngineer(self.sap)

        # The explainer is built from the already-loaded regressor.
        self.explainer = shap.TreeExplainer(self.model)

    def build_features(self, model_input):
        """Return ``model_input`` transformed by the feature engineer."""
        return self.feature_engineer.transform(model_input)

    def predict(self, context, model_input):
        """
        Predict for every row of ``model_input``.

        Non-DataFrame input is wrapped in ``pd.DataFrame`` first. The raw
        model output is passed through ``np.expm1`` before being returned
        (presumably the model was fitted on a log1p target - confirm).
        """
        frame = (
            model_input
            if isinstance(model_input, pd.DataFrame)
            else pd.DataFrame(model_input)
        )
        raw_output = self.model.predict(self.build_features(frame))
        return np.expm1(raw_output)

    def explain_predictions(self, model_input):
        """
        Compute SHAP values for ``model_input`` and package them in a dict.

        Returns a dict with:
        - ``"prediction"``: raw model output for the FIRST row only; unlike
          ``predict``, ``np.expm1`` is not applied here
        - ``"base_value"``: the explainer's expected value as a float
        - ``"shap_values"``: SHAP values for all rows as nested lists
        - ``"feature_names"``: column names of the engineered features
        - ``"data"``: engineered features as a list of row dicts
        """
        frame = (
            model_input
            if isinstance(model_input, pd.DataFrame)
            else pd.DataFrame(model_input)
        )
        engineered = self.build_features(frame)

        explanation = self.explainer(engineered)
        raw_output = self.model.predict(engineered)

        report = {}
        report["prediction"] = float(raw_output[0])
        report["base_value"] = float(self.explainer.expected_value)
        report["shap_values"] = explanation.values.tolist()
        report["feature_names"] = engineered.columns.tolist()
        report["data"] = engineered.to_dict(orient="records")
        return report