"""Tune and fit the base classifiers for the Engine_PM breakdown model.

This part of the script loads the pre-split train/test CSVs from the Hugging
Face dataset repo, tunes a decision tree, a random forest and an XGBoost
classifier with Optuna (objective: mean 5-fold CV recall), scores each tuned
pipeline on the test split and fits a soft-voting ensemble over the three.
The remainder of the file evaluates the ensembles, explains the XGBoost model
with SHAP and publishes the selected model.
"""
import os
import sys
from pprint import pprint

import joblib
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import shap
import sklearn
from huggingface_hub import HfApi, create_repo, login
from huggingface_hub.utils import HfHubHTTPError, RepositoryNotFoundError
from imblearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import make_column_transformer
from sklearn.ensemble import (
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import (
    StratifiedKFold,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.preprocessing import (
    LabelEncoder,
    OneHotEncoder,
    RobustScaler,
    StandardScaler,
)
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# custom class inheritance
# NOTE(review): both names are re-defined further down in this file, so the
# local classes shadow these imports. Decide which definition is canonical.
from featureengineer import FeatureEngineer
from outliercapper import OutlierCapper

api = HfApi()

Xtrain_path = "hf://datasets/sudhirpgcmma02/Engine_PM/Xtrain.csv"
Xtest_path = "hf://datasets/sudhirpgcmma02/Engine_PM/Xtest.csv"
ytrain_path = "hf://datasets/sudhirpgcmma02/Engine_PM/ytrain.csv"
ytest_path = "hf://datasets/sudhirpgcmma02/Engine_PM/ytest.csv"

X_train = pd.read_csv(Xtrain_path)
Xtest = pd.read_csv(Xtest_path)
y_train = pd.read_csv(ytrain_path)
ytest = pd.read_csv(ytest_path)

# Search budget and CV folds shared by all three Optuna studies.
N_TRIALS = 25
CV_FOLDS = 5


def _normalize_columns(columns, lower=True):
    """Return *columns* stripped, with spaces/non-word chars turned into "_".

    Args:
        columns: a pandas Index of string column labels.
        lower: also lower-case the labels when True.
    """
    cleaned = (
        columns.str.strip()
        .str.replace(" ", "_")
        .str.replace(r"[^\w]", "_", regex=True)
    )
    return cleaned.str.lower() if lower else cleaned


class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends derived columns to the input frame.

    For every numeric column it adds ``<col>_diff`` (row-to-row difference),
    ``<col>_roll5`` (5-row rolling mean) and ``<col>_anom`` (1 where the
    absolute difference exceeds three standard deviations of the column),
    followed by ``temp_gap`` and ``pressure_sum``. Missing values are
    filled with 0.

    NOTE(review): diff/rolling/std are computed over the rows of whatever
    frame is passed to ``transform``, in the order given — this presumably
    expects time-ordered rows; confirm against how the CSVs were split.
    """

    # Columns the aggregate features are built from (normalized names).
    _TEMP_COLS = ("lub_oil_temp", "coolant_temp")
    _PRESSURE_COLS = ("lub_oil_pressure", "fuel_pressure", "coolant_pressure")

    def fit(self, X, y=None):
        """Nothing is learned; return self for pipeline compatibility."""
        return self

    def transform(self, X):
        """Return a copy of *X* with the engineered columns appended.

        Args:
            X: DataFrame (or array-like) of raw features.

        Returns:
            DataFrame with the original columns followed by the diff,
            rolling-mean, anomaly and aggregate columns.

        Raises:
            KeyError: if a column needed for ``temp_gap`` or
                ``pressure_sum`` is absent after name normalization.
        """
        # The original left `df` unbound for non-DataFrame input.
        df = X.copy() if isinstance(X, pd.DataFrame) else pd.DataFrame(np.asarray(X))
        df.columns = _normalize_columns(df.columns.astype(str))

        required = self._TEMP_COLS + self._PRESSURE_COLS
        missing = [name for name in required if name not in df.columns]
        if missing:
            raise KeyError(f"FeatureEngineer requires columns {missing}")

        # Only numeric columns can be differenced / averaged.
        base = df[df.select_dtypes(include=np.number).columns.tolist()]
        step = base.diff()
        spread = base.std()

        # A (near-)constant or single-row column has no usable std; an
        # infinite threshold makes its anomaly flag always 0.
        threshold = (3 * spread).where(spread > 1e-9, np.inf)
        anomalies = step.abs().gt(threshold, axis="columns").astype(int)

        out = pd.concat(
            [
                df,
                step.add_suffix("_diff"),
                base.rolling(5).mean().add_suffix("_roll5"),
                anomalies.add_suffix("_anom"),
            ],
            axis=1,
        )
        out["temp_gap"] = out["lub_oil_temp"] - out["coolant_temp"]  # oil vs coolant
        out["pressure_sum"] = out[list(self._PRESSURE_COLS)].sum(axis=1)
        return out.fillna(0)


class OutlierCapper(BaseEstimator, TransformerMixin):
    """Clip every column to the Tukey fences ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``.

    ``fit`` stores one ``(low, high)`` tuple per column in ``self.bounds``;
    ``transform`` clips column *i* to ``self.bounds[i]``.
    """

    def fit(self, X, y=None):
        """Learn the per-column clipping bounds from *X* (2-D, numeric)."""
        values = np.asarray(X, dtype=float)
        q1, q3 = np.percentile(values, [25, 75], axis=0)
        iqr = q3 - q1
        self.bounds = list(zip(q1 - 1.5 * iqr, q3 + 1.5 * iqr))
        return self

    def transform(self, X):
        """Return a clipped copy of *X*; the input is never modified.

        Returns a DataFrame (same columns and index) when *X* is a DataFrame,
        otherwise a float ndarray.

        Raises:
            ValueError: if *X* has a different number of columns than were
                seen in ``fit``.
        """
        fences = np.asarray(self.bounds, dtype=float).reshape(-1, 2)
        # Work in float so fractional bounds are not truncated into an
        # integer array.
        values = np.array(X, dtype=float)
        if values.ndim != 2 or values.shape[1] != len(fences):
            raise ValueError(
                f"OutlierCapper was fitted on {len(fences)} columns, "
                f"got input of shape {values.shape}"
            )
        capped = np.clip(values, fences[:, 0], fences[:, 1])
        if isinstance(X, pd.DataFrame):
            return pd.DataFrame(capped, columns=X.columns, index=X.index)
        return capped


def create_pipe(model):
    """Wrap *model* in the shared preprocessing pipeline.

    Steps: feature engineering -> median imputation -> IQR outlier capping
    -> robust scaling -> *model*.
    """
    return Pipeline([
        ("feat", FeatureEngineer()),
        ("impute", SimpleImputer(strategy="median")),
        ("outlier", OutlierCapper()),
        ("scale", RobustScaler()),
        ("model", model),
    ])


# renaming columns for easy processing
df = X_train.copy()
df.columns = _normalize_columns(df.columns)
print("printing 10 row", df.head(10))

Xtrain = df.copy()
ytrain = y_train.copy()
ytrain.columns = _normalize_columns(ytrain.columns, lower=False)
Xtest.columns = _normalize_columns(Xtest.columns)
print("########################### independent, dependent varial split completed ################################")

# Extract column names as lists for the ColumnTransformer
num_feat_cols = Xtrain.select_dtypes(include=[np.number]).columns.tolist()
cat_feat_cols = Xtrain.select_dtypes(include=['object']).columns.tolist()
print("########################### test train split completed ################################")
print("########################### preprocessing creation completed ################################")

# Negative/positive ratio of the target. Taken from the target *Series* so
# the result is a scalar (DataFrame.value_counts() is MultiIndex-keyed).
_target_counts = ytrain.iloc[:, 0].value_counts()
class_weight = _target_counts.get(0, 0) / _target_counts.get(1, 1)
print("class_weight distribution", class_weight)

# Non-tuned parameters. They are shared by the Optuna objective and the final
# refit so the deployed model is the same configuration that was scored.
DT_FIXED_PARAMS = {"class_weight": "balanced", "random_state": 42}
RF_FIXED_PARAMS = {"class_weight": "balanced", "random_state": 42, "n_jobs": -1}
XGB_FIXED_PARAMS = {
    "eval_metric": "logloss",
    "random_state": 42,
    # XGBoost's counterpart of class_weight="balanced": neg/pos ratio.
    "scale_pos_weight": class_weight if class_weight > 0 else 1,
}


def _cv_recall(model):
    """Return the mean CV recall of *model* inside the shared pipeline."""
    scores = cross_val_score(
        create_pipe(model),
        Xtrain,
        ytrain.iloc[:, 0],  # 1-D target
        cv=CV_FOLDS,
        scoring="recall",
    )
    return scores.mean()


def _tune_and_fit(objective, model_cls, fixed_params):
    """Run an Optuna study on *objective*, then fit the best pipeline.

    Returns:
        (study, pipeline) where pipeline is fitted on the full training set
        with ``study.best_params`` plus *fixed_params*.
    """
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=N_TRIALS)
    pipeline = create_pipe(model_cls(**study.best_params, **fixed_params))
    pipeline.fit(Xtrain, ytrain.iloc[:, 0])
    return study, pipeline


def _evaluate_on_test(label, pipeline):
    """Score a fitted *pipeline* on the test split.

    Returns:
        (y_pred, y_proba, perf_df, classification_report, confusion_matrix)
        where perf_df is a one-row DataFrame of the headline metrics.
    """
    y_true = ytest.iloc[:, 0]
    y_pred = pipeline.predict(Xtest)
    y_proba = pipeline.predict_proba(Xtest)[:, 1]
    perf = pd.DataFrame([{
        "Model": label,
        "Accuracy": accuracy_score(y_true, y_pred),
        "f1_score": f1_score(y_true, y_pred),
        "recall": recall_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred),
        "roc": roc_auc_score(y_true, y_proba),
    }])
    return (
        y_pred,
        y_proba,
        perf,
        classification_report(y_true, y_pred),
        confusion_matrix(y_true, y_pred),
    )


# hyper parameter for DT
def objective_dt(trial):
    """Optuna objective: mean CV recall of a decision tree."""
    params = {
        "max_depth": trial.suggest_int("max_depth", 2, 15),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
        **DT_FIXED_PARAMS,
    }
    return _cv_recall(DecisionTreeClassifier(**params))


study_dt, best_dt_pipeline = _tune_and_fit(
    objective_dt, DecisionTreeClassifier, DT_FIXED_PARAMS
)
best_dt = best_dt_pipeline  # the fitted pipeline is what the ensembles use
print("Decision Tree best parameters", study_dt.best_params)

# prediction with test data for model performance
(y_pred_dt, y_pred_proba_dt, modelperf_dt,
 cl_rep_dt, con_rep_dt) = _evaluate_on_test("Decision Tree", best_dt_pipeline)
print(modelperf_dt)
print("########################### Decision tree completed ################################")


# rf hyper parameter tuning
def objective_rf(trial):
    """Optuna objective: mean CV recall of a random forest."""
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 5, 20),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 15),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        **RF_FIXED_PARAMS,
    }
    return _cv_recall(RandomForestClassifier(**params))


study_rf, best_rf_pipeline = _tune_and_fit(
    objective_rf, RandomForestClassifier, RF_FIXED_PARAMS
)
best_rf = best_rf_pipeline
print("Random Forest best parameters", study_rf.best_params)

(y_pred_rf, y_pred_proba_rf, modelperf_rf,
 cl_rep_rf, con_rep_rf) = _evaluate_on_test("Random Forest", best_rf_pipeline)
print(modelperf_rf)
print("########################### RandomForest completed ################################")


# XGB optuna hyperparameter tuning
def objective_xgb(trial):
    """Optuna objective: mean CV recall of an XGBoost classifier."""
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 600),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 0, 5),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 5),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 5),
        **XGB_FIXED_PARAMS,
    }
    return _cv_recall(XGBClassifier(**params))


study_xgb, best_xgb_pipeline = _tune_and_fit(
    objective_xgb, XGBClassifier, XGB_FIXED_PARAMS
)
best_xgb = best_xgb_pipeline
print("XGBoost best parameters", study_xgb.best_params)

(y_pred_xgb, y_pred_proba_xgb, modelperf_xgb,
 cl_rep_xgb, con_rep_xgb) = _evaluate_on_test("XGBoost", best_xgb_pipeline)
print(modelperf_xgb)
print("########################### XGboost completed completed ################################")

# voting model: soft vote over the three tuned pipelines, weighted 1:2:3
voting_model = VotingClassifier(
    estimators=[
        ("dt", best_dt),
        ("rf", best_rf),
        ("xgb", best_xgb),
    ],
    voting="soft",
    weights=[1, 2, 3],
)
voting_model.fit(Xtrain, ytrain.iloc[:, 0])
print("########################### voting completed ################################")
print("voting score")
# Positive-class probabilities of each fitted base pipeline inside the
# voting ensemble, for a quick side-by-side look.
for name, model in voting_model.named_estimators_.items():
    probs = model.predict_proba(Xtest)[:, 1]
    print(name, probs)

# evaluation of the voting ensemble on the test split
y_true = ytest.iloc[:, 0]
y_pred = voting_model.predict(Xtest)
y_pred_proba = voting_model.predict_proba(Xtest)[:, 1]
acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, pos_label=1)
rec = recall_score(y_true, y_pred, pos_label=1)
pre = precision_score(y_true, y_pred, pos_label=1)
# ROC-AUC needs scores, not hard labels; the original passed y_pred here.
roc = roc_auc_score(y_true, y_pred_proba)
pref_df = pd.DataFrame([{
    "Accuracy": acc,
    "f1_score": f1,
    "recall": rec,
    "precision": pre,
    "roc_auc": roc,
}])
print("performance\n", pref_df)

# stacking ensemble: logistic regression over the base pipelines' outputs
stack_model = StackingClassifier(
    estimators=[
        ("dt", best_dt),
        ("rf", best_rf),
        ("xgb", best_xgb),
    ],
    final_estimator=LogisticRegression(),
    passthrough=False,
    cv=5,
    verbose=1,
)
stack_model.fit(Xtrain, ytrain.iloc[:, 0])
print("########################### stacking completed ################################")

# prediction with test data for model performance
y_pred = stack_model.predict(Xtest)
y_pred_proba = stack_model.predict_proba(Xtest)[:, 1]
acc = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
rec = recall_score(y_true, y_pred)
pre = precision_score(y_true, y_pred)
roc = roc_auc_score(y_true, y_pred_proba)
cl_rep = classification_report(y_true, y_pred)
con_rep = confusion_matrix(y_true, y_pred)
print("accuracy score", acc)
print("f1 score", f1)
print("recall score", rec)
print("precision score", pre)
print("roc auc score", roc)
print("\n classification_report\n", cl_rep)
print("\nconfusion_matrix\n", con_rep)

# Weight the meta-learner assigns to each base estimator.
co_eff = pd.DataFrame(
    stack_model.final_estimator_.coef_,
    columns=[name for name, _ in stack_model.estimators],
)
print("stack estimator co-err \n", co_eff)

# comparing voting and stacking through CV on 5 metrics
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scoring = {
    "accuracy": "accuracy",
    "f1": "f1",
    "recall": "recall",
    "precision": "precision",
    "roc_auc": "roc_auc",
}
vote_cv = cross_validate(voting_model, Xtrain, ytrain.iloc[:, 0], cv=cv, scoring=scoring)
stack_cv = cross_validate(stack_model, Xtrain, ytrain.iloc[:, 0], cv=cv, scoring=scoring)

results = pd.DataFrame({
    "voting": {k: np.mean(vote_cv[f"test_{k}"]) for k in scoring},
    "stacking": {k: np.mean(stack_cv[f"test_{k}"]) for k in scoring},
})
print("model evaluation results \n", results)

# primary - recall, secondary - f1, tie-break - roc-auc; the higher-ranked
# model is selected for deployment. Tuples compare element by element, so
# f1 / roc-auc only matter when the earlier metrics are exactly equal.
# Voting is kept on a full tie.
ranking_metrics = ["recall", "f1", "roc_auc"]
stacking_wins = (
    tuple(results.loc[ranking_metrics, "stacking"])
    > tuple(results.loc[ranking_metrics, "voting"])
)
best_model = stack_model if stacking_wins else voting_model
best_model_name = "Stacking" if stacking_wins else "Voting"

# Both ensembles are already fitted on the full training set above, and
# cross_validate only fits clones, so no refit is needed here.
y_pred = best_model.predict(Xtest)
y_prob = best_model.predict_proba(Xtest)[:, 1]
print("selected model: ", best_model_name)

# getting the best model parameters for future deployment
params = best_model.get_params()
best_params_df = pd.DataFrame(params.items(), columns=['parameter', 'value'])

for name, model in best_model.named_estimators_.items():
    print(f"\n * Base model - {name}")
    pprint(model.get_params())

# printing the model performance (FP / FN evaluation)
print("best slected model | classification report \n", classification_report(y_true, y_pred))
print("best slected model | confusion matrix \n", confusion_matrix(y_true, y_pred))

### model conclusion: feature importance via SHAP on the tuned XGBoost model
# best_xgb_pipeline was fitted when it was tuned (the ensembles only fit
# clones of it), so its steps can be used directly.
xgb_mdl = best_xgb_pipeline.named_steps["model"]

# Replay the preprocessing steps so SHAP sees exactly what the model saw.
Xtrain_transformed_df = best_xgb_pipeline.named_steps["feat"].transform(Xtrain)
# Take the feature names from the transformer's real output instead of
# rebuilding them by hand, so names and columns cannot drift apart.
fea_name = Xtrain_transformed_df.columns.tolist()
Xtrain_transformed = Xtrain_transformed_df
for step_name in ("impute", "outlier", "scale"):
    Xtrain_transformed = best_xgb_pipeline.named_steps[step_name].transform(Xtrain_transformed)

explain = shap.TreeExplainer(xgb_mdl)
shap_values = explain.shap_values(Xtrain_transformed)
shap_frame = pd.DataFrame(Xtrain_transformed, columns=fea_name)

shap.summary_plot(shap_values, shap_frame, feature_names=fea_name)

## summary SHAP plot (bar), annotated with the value of every bar
shap.summary_plot(
    shap_values,
    shap_frame,
    feature_names=fea_name,
    plot_type="bar",
    show=False,
)
ax = plt.gca()
for p in ax.patches:
    ax.text(
        p.get_width(),
        p.get_y() + p.get_height() / 2,
        f"{p.get_width():.2f}",
        va="center",
    )
plt.show()

# Save the model locally
model_path = "Breakdown_prediction/best_engine_PM_prediction_v1.joblib"
# joblib.dump does not create parent directories.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
# lzma level 9 keeps the artifact small (original note: >110 MB -> 20~40 MB)
joblib.dump(best_model, model_path, compress=("lzma", 9))
# Upload to Hugging Face
repo_id = "sudhirpgcmma02/Engine_PM"
repo_type = "model"

# Step 1: make sure the target model repo exists, creating it if needed.
try:
    api.repo_info(repo_id=repo_id, repo_type=repo_type)
    print(f"Model repo '{repo_id}' already exists. Using it.")
except RepositoryNotFoundError:
    print(f"Model repo '{repo_id}' not found. Creating new model repo...")
    create_repo(repo_id=repo_id, repo_type=repo_type, private=False)
    print(f"Model repo '{repo_id}' created.")

# Step 2: upload the saved artifact under the same relative path it has
# locally (model_path is set where the model is dumped with joblib).
api.upload_file(
    path_or_fileobj=model_path,
    path_in_repo=model_path,
    repo_id=repo_id,
    repo_type=repo_type,
)