sudhirpgcmma02 committed on
Commit
4bd40a2
·
verified ·
1 Parent(s): 70146c3

Upload train.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train.py +568 -0
train.py ADDED
@@ -0,0 +1,568 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import sklearn
3
+ import seaborn as sns
4
+ import matplotlib.pyplot as plt
5
+ import sys
6
+ import os
7
+ import numpy as np
8
+ from sklearn.model_selection import train_test_split,cross_val_score
9
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder,LabelEncoder
10
+ from sklearn.tree import DecisionTreeClassifier
11
+ from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate
12
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix,classification_report
13
+ import optuna
14
+ from sklearn.linear_model import LogisticRegression
15
+ from sklearn.compose import make_column_transformer
16
+ from imblearn.pipeline import Pipeline
17
+ from imblearn.over_sampling import SMOTE
18
+ from sklearn.tree import DecisionTreeClassifier
19
+ from sklearn.ensemble import VotingClassifier
20
+ from sklearn.ensemble import StackingClassifier
21
+ from sklearn.base import BaseEstimator, TransformerMixin
22
+ from sklearn.impute import SimpleImputer
23
+ from sklearn.preprocessing import RobustScaler
24
+ import joblib
25
+ import shap
26
+ from huggingface_hub import login, HfApi, create_repo
27
+ from huggingface_hub.utils import RepositoryNotFoundError, HfHubHTTPError
28
+ from pprint import pprint
29
+ from xgboost import XGBClassifier # Added for XGBoost
30
+ from sklearn.ensemble import RandomForestClassifier # Added for RandomForest
31
+
32
# Hugging Face Hub client; reused at the end of the script to publish the model.
api = HfApi()

# Locations of the pre-split feature/target CSV files in the Hub dataset repo.
Xtrain_path = "hf://datasets/sudhirpgcmma02/Engine_PM/Xtrain.csv"
Xtest_path = "hf://datasets/sudhirpgcmma02/Engine_PM/Xtest.csv"
ytrain_path = "hf://datasets/sudhirpgcmma02/Engine_PM/ytrain.csv"
ytest_path = "hf://datasets/sudhirpgcmma02/Engine_PM/ytest.csv"

# Read the four files in the same order as the paths are declared above.
X_train, Xtest, y_train, ytest = (
    pd.read_csv(csv_path)
    for csv_path in (Xtrain_path, Xtest_path, ytrain_path, ytest_path)
)
43
+
44
+
45
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Append difference, rolling-mean, jump-flag and aggregate columns to the sensor table.

    Nothing is learned in ``fit``; every derived column is computed from the rows
    handed to ``transform``.

    NOTE(review): ``diff`` and ``rolling`` compare neighbouring rows, so the output
    depends on row order -- presumably rows are expected in time order; confirm.
    """

    def fit(self, X, y=None):
        """Return ``self`` unchanged; there is no state to learn."""
        return self

    def transform(self, X):
        """Return a new DataFrame holding the input columns plus the derived features.

        Parameters
        ----------
        X : pandas.DataFrame or array-like
            A DataFrame is copied as-is. Anything else is wrapped in a DataFrame
            using the six sensor column names listed below.

        Returns
        -------
        pandas.DataFrame
            Input columns followed by ``*_diff``, ``*_roll5``, ``*_anom``,
            ``temp_gap`` and ``pressure_sum``; all missing values replaced by 0.
        """
        sensor_cols = [
            "Engine_rpm", "Lub_oil_pressure", "Fuel_pressure",
            "Coolant_pressure", "lub_oil_temp", "Coolant_temp",
        ]

        if isinstance(X, pd.DataFrame):
            df = X.copy()
        else:
            df = pd.DataFrame(X, columns=sensor_cols)

        # Normalise headers: trim, spaces -> "_", any other non-word character -> "_".
        headers = df.columns.str.strip()
        headers = headers.str.replace(" ", "_")
        df.columns = headers.str.replace(r"[^\w]", "_", regex=True)

        # Row-to-row difference of every numeric column present at this point.
        numeric_cols = df.select_dtypes(include=np.number).columns
        for col in numeric_cols:
            df[f"{col}_diff"] = df[col].diff()

        # The derived names all carry a suffix, so sensor membership is stable.
        available = [col for col in sensor_cols if col in df.columns]

        # Rolling mean over a window of five rows.
        for col in available:
            df[f"{col}_roll5"] = df[col].rolling(5).mean()

        # Flag rows whose jump from the previous row exceeds three standard deviations.
        for col in available:
            spread = df[col].std()
            if spread > 1e-9:
                flags = (df[col].diff().abs() > 3 * spread).astype(int)
            else:
                # Constant (or single-row, where std is NaN) data: never flagged.
                flags = 0
            df[f"{col}_anom"] = flags

        # Aggregates: oil-vs-coolant temperature gap and the sum of the three pressures.
        df["temp_gap"] = df["lub_oil_temp"] - df["Coolant_temp"]
        pressure_cols = ["Lub_oil_pressure", "Fuel_pressure", "Coolant_pressure"]
        df["pressure_sum"] = df[pressure_cols].sum(axis=1)

        # diff/rolling leave NaN in the leading rows; replace them with 0.
        return df.fillna(0)
108
+
109
class OutlierCapper(BaseEstimator, TransformerMixin):
    """Clip each column to the IQR fences ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` learned in ``fit``.

    Attributes
    ----------
    bounds : list of (low, high) tuples
        One pair per input column, set by ``fit``.
    """

    def fit(self, X, y=None):
        """Learn the lower/upper fence of every column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame or 2-D array-like of numbers
        y : ignored

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional.
        """
        values = np.asarray(X, dtype=float)
        if values.ndim != 2:
            raise ValueError(
                f"OutlierCapper expects 2-D input, got {values.ndim}-D"
            )

        # Both quartiles for all columns in a single vectorised call.
        q1, q3 = np.percentile(values, [25, 75], axis=0)
        iqr = q3 - q1
        self.bounds = list(zip(q1 - 1.5 * iqr, q3 + 1.5 * iqr))
        return self

    def transform(self, X):
        """Return a clipped copy of ``X``; the input is never modified.

        A DataFrame input yields a DataFrame with the same columns *and index*;
        any other input yields a float NumPy array.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D or its column count differs from the fitted one.
        """
        # np.array always copies; float dtype keeps the (float) fences from being
        # truncated when the input happens to be integer-typed.
        values = np.array(X, dtype=float)
        if values.ndim != 2 or values.shape[1] != len(self.bounds):
            raise ValueError(
                f"OutlierCapper was fitted on {len(self.bounds)} columns, "
                f"got input of shape {values.shape}"
            )

        if self.bounds:
            lows, highs = (np.asarray(side) for side in zip(*self.bounds))
            np.clip(values, lows, highs, out=values)

        if isinstance(X, pd.DataFrame):
            # Keep the caller's index so rows stay aligned with their targets.
            return pd.DataFrame(values, columns=X.columns, index=X.index)
        return values
144
+
145
def create_pipe(model):
    """Wrap ``model`` in the shared preprocessing pipeline.

    Steps, in order: ``feat`` (FeatureEngineer), ``impute`` (median
    SimpleImputer), ``outlier`` (OutlierCapper), ``scale`` (RobustScaler) and
    finally ``model`` itself.

    Parameters
    ----------
    model : estimator
        The final estimator of the pipeline.

    Returns
    -------
    Pipeline
        A new, unfitted pipeline.
    """
    preprocessing = [
        ("feat", FeatureEngineer()),
        ("impute", SimpleImputer(strategy="median")),
        ("outlier", OutlierCapper()),
        ("scale", RobustScaler()),
    ]
    return Pipeline(preprocessing + [("model", model)])
154
+
155
# Preview copy with sanitised column names; it is only used for the printout below.
df = X_train.copy()
df.columns = (
    df.columns
    .str.strip()
    .str.replace(" ", "_")
    .str.replace(r"[^\w]", "_", regex=True)
)
print(df.head(10))

# Working copies of the training features and target.
Xtrain = X_train.copy()
ytrain = y_train.copy()
print("########################### independent, dependent varial split completed ################################")

# Numeric / categorical feature column names.
num_feat_cols = Xtrain.select_dtypes(include=[np.number]).columns.tolist()
cat_feat_cols = Xtrain.select_dtypes(include=['object']).columns.tolist()

print("########################### test train split completed ################################")

print("########################### preprocessing creation completed ################################")

# Ratio of class-0 to class-1 samples in the training target.
# The counts are taken from the target *column* (a Series): DataFrame.value_counts()
# is indexed by row tuples, so looking up the labels 0 / 1 on it is not reliable.
# The .get defaults keep the ratio defined when a class is absent.
target_counts = ytrain.iloc[:, 0].value_counts()
class_weight = target_counts.get(0, 0) / target_counts.get(1, 1)
print("class_weight distribution", class_weight)
181
+
182
+ # hyper parameter for DT
183
+
184
def objective_dt(trial):
    """Optuna objective: mean 5-fold recall of a decision-tree pipeline.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies ``max_depth``, ``min_samples_split``, ``min_samples_leaf``
        and ``criterion``.

    Returns
    -------
    float
        Mean recall over the five cross-validation folds.
    """
    params = {
        "max_depth": trial.suggest_int("max_depth", 2, 15),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
        "class_weight": "balanced",
        "random_state": 42,
    }
    pipeline = create_pipe(DecisionTreeClassifier(**params))

    # Pass the target as a 1-D Series, exactly like the other objectives and the
    # final fit do; the original handed over the whole one-column DataFrame.
    fold_recalls = cross_val_score(
        pipeline, Xtrain, ytrain.iloc[:, 0],
        cv=5, scoring="recall",
    )
    return fold_recalls.mean()
203
+
204
# Search the decision-tree hyper-parameters, maximising cross-validated recall.
study_dt = optuna.create_study(direction="maximize")
study_dt.optimize(objective_dt, n_trials=25)

# Refit on the full training set. best_params only holds the *suggested* values,
# so the fixed settings used during tuning (class_weight, random_state) are passed
# explicitly; without random_state the refit would not reproduce the tuned model.
best_dt = DecisionTreeClassifier(
    **study_dt.best_params,
    class_weight="balanced",
    random_state=42,
)
best_dt_pipeline = create_pipe(best_dt)
best_dt_pipeline.fit(Xtrain, ytrain.iloc[:, 0])  # target as a 1-D Series
best_dt = best_dt_pipeline  # from here on best_dt is the fitted pipeline
print("Decision Tree best parameters", study_dt.best_params)

# Predictions on the held-out test set.
y_pred_dt = best_dt_pipeline.predict(Xtest)
y_pred_proba_dt = best_dt_pipeline.predict_proba(Xtest)[:, 1]

acc_dt = accuracy_score(ytest, y_pred_dt)
f1_dt = f1_score(ytest, y_pred_dt)
rec_dt = recall_score(ytest, y_pred_dt)
pre_dt = precision_score(ytest, y_pred_dt)
roc_dt = roc_auc_score(ytest, y_pred_proba_dt)  # scored on probabilities
cl_rep_dt = classification_report(ytest, y_pred_dt)
con_rep_dt = confusion_matrix(ytest, y_pred_dt)

# One-row summary table ("f1score" repeats "f1_score"; kept so the printed
# columns stay the same).
modelperf_dt = pd.DataFrame([{
    "Model": "Decision Tree",
    "Accuracy": acc_dt,
    "f1_score": f1_dt,
    "recall": rec_dt,
    "precision": pre_dt,
    "f1score": f1_dt,
    "roc": roc_dt,
}])
print(modelperf_dt)
print("########################### Decision tree completed ################################")
237
+
238
+ # rf hyper parameter tuning
239
+
240
def objective_rf(trial):
    """Optuna objective: mean 5-fold recall of a random-forest pipeline.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies ``n_estimators``, ``max_depth``, ``min_samples_split``,
        ``min_samples_leaf`` and ``max_features``.

    Returns
    -------
    float
        Mean recall over the five cross-validation folds.
    """
    # Keyword arguments are evaluated left to right, so the trial is sampled in
    # the same order as the parameters are listed.
    forest = RandomForestClassifier(
        n_estimators=trial.suggest_int("n_estimators", 100, 500),
        max_depth=trial.suggest_int("max_depth", 5, 20),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 15),
        min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 10),
        max_features=trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        class_weight="balanced",
        random_state=42,
        n_jobs=-1,
    )

    fold_recalls = cross_val_score(
        create_pipe(forest),
        Xtrain,
        ytrain.iloc[:, 0],  # target as a 1-D Series
        cv=5,
        scoring="recall",
    )
    return fold_recalls.mean()
261
+
262
# Search the random-forest hyper-parameters, maximising cross-validated recall.
study_rf = optuna.create_study(direction="maximize")
study_rf.optimize(objective_rf, n_trials=25)

# Refit on the full training set. best_params only holds the *suggested* values,
# so the fixed settings used during tuning are repeated here; without
# random_state the refit would not reproduce the tuned model.
best_rf = RandomForestClassifier(
    **study_rf.best_params,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
)
best_rf_pipeline = create_pipe(best_rf)
best_rf_pipeline.fit(Xtrain, ytrain.iloc[:, 0])  # target as a 1-D Series
best_rf = best_rf_pipeline  # from here on best_rf is the fitted pipeline
print("Random Forest best parameters", study_rf.best_params)

# Predictions on the held-out test set.
y_pred_rf = best_rf_pipeline.predict(Xtest)
y_pred_proba_rf = best_rf_pipeline.predict_proba(Xtest)[:, 1]

acc_rf = accuracy_score(ytest, y_pred_rf)
f1_rf = f1_score(ytest, y_pred_rf)
rec_rf = recall_score(ytest, y_pred_rf)
pre_rf = precision_score(ytest, y_pred_rf)
roc_rf = roc_auc_score(ytest, y_pred_proba_rf)  # scored on probabilities
cl_rep_rf = classification_report(ytest, y_pred_rf)
con_rep_rf = confusion_matrix(ytest, y_pred_rf)
con_rep_rr = con_rep_rf  # original (misspelt) name kept as an alias

# One-row summary table ("f1score" repeats "f1_score"; kept so the printed
# columns stay the same).
modelperf_rf = pd.DataFrame([{
    "Model": "Random Forest",
    "Accuracy": acc_rf,
    "f1_score": f1_rf,
    "recall": rec_rf,
    "precision": pre_rf,
    "f1score": f1_rf,
    "roc": roc_rf,
}])
print(modelperf_rf)

print("########################### RandomForest completed ################################")
295
+
296
+ # XGB optuna hyperparameter tuning
297
+
298
+
299
def objective_xgb(trial):
    """Optuna objective: mean 5-fold recall of an XGBoost pipeline.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies ``n_estimators``, ``max_depth``, ``learning_rate``,
        ``subsample``, ``colsample_bytree``, ``gamma``, ``reg_alpha`` and
        ``reg_lambda``.

    Returns
    -------
    float
        Mean recall over the five cross-validation folds.
    """
    # Keyword arguments are evaluated left to right, so the trial is sampled in
    # the same order as the parameters are listed.
    booster = XGBClassifier(
        n_estimators=trial.suggest_int("n_estimators", 200, 600),
        max_depth=trial.suggest_int("max_depth", 3, 10),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        subsample=trial.suggest_float("subsample", 0.6, 1.0),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.6, 1.0),
        gamma=trial.suggest_float("gamma", 0, 5),
        reg_alpha=trial.suggest_float("reg_alpha", 0, 5),
        reg_lambda=trial.suggest_float("reg_lambda", 0, 5),
        eval_metric="logloss",
        random_state=42,
    )

    fold_recalls = cross_val_score(
        create_pipe(booster),
        Xtrain,
        ytrain.iloc[:, 0],  # target as a 1-D Series
        cv=5,
        scoring="recall",
    )
    return fold_recalls.mean()
322
+
323
# Search the XGBoost hyper-parameters, maximising cross-validated recall.
study_xgb = optuna.create_study(direction="maximize")
study_xgb.optimize(objective_xgb, n_trials=25)

# Refit on the full training set. best_params only holds the *suggested* values,
# so the fixed settings used during tuning (eval_metric, random_state) are
# repeated here to keep the final model consistent with the tuned one.
best_xgb = XGBClassifier(
    **study_xgb.best_params,
    eval_metric="logloss",
    random_state=42,
)
best_xgb_pipeline = create_pipe(best_xgb)
best_xgb_pipeline.fit(Xtrain, ytrain.iloc[:, 0])  # target as a 1-D Series
best_xgb = best_xgb_pipeline  # from here on best_xgb is the fitted pipeline
print("XGBoost best parameters", study_xgb.best_params)

# Predictions on the held-out test set.
y_pred_xgb = best_xgb_pipeline.predict(Xtest)
y_pred_proba_xgb = best_xgb_pipeline.predict_proba(Xtest)[:, 1]

acc_xgb = accuracy_score(ytest, y_pred_xgb)
f1_xgb = f1_score(ytest, y_pred_xgb)
rec_xgb = recall_score(ytest, y_pred_xgb)
pre_xgb = precision_score(ytest, y_pred_xgb)
roc_xgb = roc_auc_score(ytest, y_pred_proba_xgb)  # scored on probabilities
cl_rep_xgb = classification_report(ytest, y_pred_xgb)
con_rep_xgb = confusion_matrix(ytest, y_pred_xgb)

# One-row summary table ("f1score" repeats "f1_score"; kept so the printed
# columns stay the same).
modelperf_xgb = pd.DataFrame([{
    "Model": "XGBoost",
    "Accuracy": acc_xgb,
    "f1_score": f1_xgb,
    "recall": rec_xgb,
    "precision": pre_xgb,
    "f1score": f1_xgb,
    "roc": roc_xgb,
}])
print(modelperf_xgb)

print("########################### XGboost completed completed ################################")
356
+
357
+
358
+ # voting model
359
# Soft-voting ensemble over the three tuned pipelines, weighted 1 : 2 : 3
# (decision tree : random forest : XGBoost).
voting_model = VotingClassifier(
    estimators=[
        ("dt", best_dt),
        ("rf", best_rf),
        ("xgb", best_xgb),
    ],
    voting="soft",
    weights=[1, 2, 3],
)

voting_model.fit(Xtrain, ytrain.iloc[:, 0])  # target as a 1-D Series
print("########################### voting completed ################################")
print("voting score")

# Positive-class probabilities of every fitted base pipeline on the test set.
for name, model in voting_model.named_estimators_.items():
    probs = model.predict_proba(Xtest)[:, 1]
    print(name, probs)

# Evaluation of the ensemble itself.
y_pred = voting_model.predict(Xtest)
y_pred_proba = voting_model.predict_proba(Xtest)[:, 1]

acc = accuracy_score(ytest, y_pred)
f1 = f1_score(ytest, y_pred, pos_label=1)
rec = recall_score(ytest, y_pred, pos_label=1)
pre = precision_score(ytest, y_pred, pos_label=1)
# ROC-AUC ranks samples by score, so it needs the predicted probabilities; the
# original passed the hard 0/1 labels, which collapses the curve to one point.
roc = roc_auc_score(ytest, y_pred_proba)

pref_df = pd.DataFrame([{
    "Accuracy": acc,
    "f1_score": f1,
    "recall": rec,
    "precision": pre,
    "roc_auc": roc,
}])
print("performance\n", pref_df)
397
+
398
+
399
# Stacked ensemble: the three tuned pipelines feed a logistic-regression meta
# model; passthrough is off, so the meta model only sees the base predictions.
stack_model = StackingClassifier(
    estimators=[("dt", best_dt), ("rf", best_rf), ("xgb", best_xgb)],
    final_estimator=LogisticRegression(),
    passthrough=False,
    cv=5,
    verbose=1,
)
stack_model.fit(Xtrain, ytrain.iloc[:, 0])  # target as a 1-D Series
print("########################### stacking completed ################################")

# Predictions on the held-out test set.
y_pred = stack_model.predict(Xtest)
y_pred_proba = stack_model.predict_proba(Xtest)[:, 1]

acc = accuracy_score(ytest, y_pred)
f1 = f1_score(ytest, y_pred)
rec = recall_score(ytest, y_pred)
pre = precision_score(ytest, y_pred)
roc = roc_auc_score(ytest, y_pred_proba)
cl_rep = classification_report(ytest, y_pred)
con_rep = confusion_matrix(ytest, y_pred)
f1_scr = f1_score(ytest, y_pred)

# Report every metric in the same order and with the same labels as before.
for _label, _value in (
    ("accuracy score", acc),
    ("f1 score", f1),
    ("recall score", rec),
    ("precision score", pre),
    ("roc auc score", roc),
    ("\n classification_report\n", cl_rep),
    ("\nconfusion_matrix\n", con_rep),
    ("f1_score", f1_scr),
):
    print(_label, _value)

# Meta-model coefficients, one column per base estimator name.
base_names = [label for label, _ in stack_model.estimators]
co_eff = pd.DataFrame(stack_model.final_estimator_.coef_, columns=base_names)
print("stack estimator co-err \n", co_eff)
440
+
441
+ # comparing voiting and stacking
442
# Compare voting and stacking with the same shuffled, stratified 5-fold split.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scoring = {
    "accuracy": "accuracy",
    "f1": "f1",
    "recall": "recall",
    "precision": "precision",
    "roc_auc": "roc_auc",
}
vote_cv = cross_validate(voting_model, Xtrain, ytrain.iloc[:, 0], cv=cv, scoring=scoring)
stack_cv = cross_validate(stack_model, Xtrain, ytrain.iloc[:, 0], cv=cv, scoring=scoring)

# Mean of every metric across the folds: rows = metrics, columns = ensembles.
results = pd.DataFrame({
    "voting": {k: np.mean(vote_cv[f"test_{k}"]) for k in scoring},
    "stacking": {k: np.mean(stack_cv[f"test_{k}"]) for k in scoring},
})
print("model evaluation results \n", results)

# Selection rule: primary recall, secondary f1, tie-break roc-auc. Tuples compare
# element by element, so later metrics only matter when the earlier ones are
# equal. Voting is kept when the two ensembles tie on all three.
selection_metrics = ["recall", "f1", "roc_auc"]
stacking_wins = (
    tuple(results.loc[selection_metrics, "stacking"])
    > tuple(results.loc[selection_metrics, "voting"])
)
best_model = stack_model if stacking_wins else voting_model
best_model_name = "Stacking" if stacking_wins else "Voting"

best_model.fit(Xtrain, ytrain.iloc[:, 0])  # target as a 1-D Series
y_pred = best_model.predict(Xtest)
y_prob = best_model.predict_proba(Xtest)[:, 1]
print("selected model: ", best_model_name)

# Parameters of the selected model, kept as a table for later inspection.
params = best_model.get_params()
best_model_params = pd.DataFrame(params.items(), columns=['parameter', 'value'])
for name, model in best_model.named_estimators_.items():
    print(f"\n * Base model - {name}")
    pprint(model.get_params())

# Only the stacking ensemble has a meta model; reading final_estimator_ on the
# voting ensemble would raise AttributeError.
final_estimator = getattr(best_model, "final_estimator_", None)
if final_estimator is not None:
    print("\n final estimator (meta model) ")
    pprint(final_estimator.get_params())

# Test-set performance of the selected model (FP / FN evaluation).
print("best slected model | classification report \n", classification_report(ytest, y_pred))
print("best slected model | confusion matrix \n", confusion_matrix(ytest, y_pred))
488
+
489
### Feature-importance inputs for the tuned XGBoost pipeline
best_xgb_pipeline.fit(Xtrain, ytrain.iloc[:, 0])  # target as a 1-D Series
fitted_steps = best_xgb_pipeline.named_steps

# The bare XGBoost estimator at the end of the pipeline.
xgb_mdl = fitted_steps["model"]

# Replay the preprocessing steps on the training data, one step at a time,
# stopping just before the scaler ...
Xtrain_transformed_df = Xtrain
for step_name in ("feat", "impute", "outlier"):
    Xtrain_transformed_df = fitted_steps[step_name].transform(Xtrain_transformed_df)

# ... then scale, which yields the matrix the XGBoost step is trained on.
Xtrain_transformed = fitted_steps["scale"].transform(Xtrain_transformed_df)
499
+
500
+ # Corrected: Generate feature names explicitly after FeatureEngineer and other steps
501
def get_feature_names(original_cols):
    """Return the feature names in the order this function has always produced.

    The result is the original names, then ``<col>_diff`` for every column,
    then ``<col>_roll5``, then ``<col>_anom``, and finally ``temp_gap`` and
    ``pressure_sum``.

    Parameters
    ----------
    original_cols : iterable of str
        Input column names. Any iterable is accepted (list, tuple, pandas
        Index, generator); it is consumed once and never modified.

    Returns
    -------
    list of str
        A new list of feature names.
    """
    # list(...) instead of original_cols[:]: slicing followed by .append only
    # works for real lists (a tuple has no append, an Index.append is not in-place).
    base = list(original_cols)
    derived = [
        f"{col}_{suffix}"
        for suffix in ("diff", "roll5", "anom")
        for col in base
    ]
    return base + derived + ["temp_gap", "pressure_sum"]
512
+
513
# Names for the columns of the transformed training matrix.
original_feature_cols = Xtrain.columns.tolist()
fea_name = get_feature_names(original_feature_cols)

# SHAP values of the XGBoost model on the transformed training data.
explain = shap.TreeExplainer(xgb_mdl)
shap_values = explain.shap_values(Xtrain_transformed)

# Both plots use the same labelled view of the transformed matrix.
shap_frame = pd.DataFrame(Xtrain_transformed, columns=fea_name)

# Default summary plot.
shap.summary_plot(shap_values, shap_frame, feature_names=fea_name)

## Bar summary plot; drawing is deferred so the bars can be annotated first.
shap.summary_plot(
    shap_values,
    shap_frame,
    feature_names=fea_name,
    plot_type="bar",
    show=False,
)
ax = plt.gca()
for bar in ax.patches:
    bar_length = bar.get_width()
    bar_middle = bar.get_y() + bar.get_height() / 2
    # Write each bar's length, to two decimals, at the end of the bar.
    ax.text(bar_length, bar_middle, f"{bar_length:.2f}", va="center")
plt.show()
539
+
540
+
541
# Serialise the selected model to a local joblib file.
model_path = "best_engine_PM_prediction_v1.joblib"
joblib.dump(best_model, model_path)

# Target repository on the Hugging Face Hub.
repo_id = "sudhirpgcmma02/Engine_PM"
repo_type = "model"

# Step 1: make sure the repository exists, creating it when the lookup fails.
try:
    api.repo_info(repo_id=repo_id, repo_type=repo_type)
except RepositoryNotFoundError:
    print(f"Space '{repo_id}' not found. Creating new space...")
    create_repo(repo_id=repo_id, repo_type=repo_type, private=False)
    print(f"Space '{repo_id}' created.")
else:
    print(f"Space '{repo_id}' already exists. Using it.")

# Step 2: upload the file under the same name it has locally.
api.upload_file(
    path_or_fileobj=model_path,
    path_in_repo=model_path,
    repo_id=repo_id,
    repo_type=repo_type,
)