Spaces:

sudhirpgcmma02
/

Engine_PM

Running

App Files Files Community

Engine_PM / train.py

sudhirpgcmma02

Upload train.py with huggingface_hub

a7c6674 verified about 8 hours ago

raw

history blame contribute delete

20.6 kB

	import pandas as pd
	import sklearn
	import seaborn as sns
	import matplotlib.pyplot as plt
	import sys
	import os
	import numpy as np
	from sklearn.model_selection import train_test_split,cross_val_score
	from sklearn.preprocessing import StandardScaler, OneHotEncoder,LabelEncoder
	from sklearn.tree import DecisionTreeClassifier
	from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate
	from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix,classification_report
	import optuna
	from sklearn.linear_model import LogisticRegression
	from sklearn.compose import make_column_transformer
	from imblearn.pipeline import Pipeline
	from sklearn.tree import DecisionTreeClassifier
	from sklearn.ensemble import VotingClassifier
	from sklearn.ensemble import StackingClassifier
	from sklearn.base import BaseEstimator, TransformerMixin
	from sklearn.impute import SimpleImputer
	from sklearn.preprocessing import RobustScaler
	import joblib
	import shap
	from huggingface_hub import login, HfApi, create_repo
	from huggingface_hub.utils import RepositoryNotFoundError, HfHubHTTPError
	from pprint import pprint
	from xgboost import XGBClassifier # Added for XGBoost
	from sklearn.ensemble import RandomForestClassifier # Added for RandomForest
	# custom class inheritance
	from featureengineer import FeatureEngineer
	from outliercapper import OutlierCapper


	# Hugging Face Hub client; reused at the end of the script to publish the model.
	api = HfApi()

	# Training split (features, then labels), read straight from the Hub dataset.
	Xtrain_path = "hf://datasets/sudhirpgcmma02/Engine_PM/Xtrain.csv"
	ytrain_path = "hf://datasets/sudhirpgcmma02/Engine_PM/ytrain.csv"
	X_train = pd.read_csv(Xtrain_path)
	y_train = pd.read_csv(ytrain_path)

	# Hold-out split (features, then labels) from the same dataset repo.
	Xtest_path = "hf://datasets/sudhirpgcmma02/Engine_PM/Xtest.csv"
	ytest_path = "hf://datasets/sudhirpgcmma02/Engine_PM/ytest.csv"
	Xtest = pd.read_csv(Xtest_path)
	ytest = pd.read_csv(ytest_path)


	class FeatureEngineer(BaseEstimator, TransformerMixin):
	    """Stateless transformer that appends derived features to the input frame.

	    ``transform`` normalises the column names (strip, spaces and non-word
	    characters replaced by ``_``, lower-cased) and then appends, in this order:

	    * ``<col>_diff``  - row-to-row difference of every numeric column
	    * ``<col>_roll5`` - 5-row rolling mean of every numeric column
	    * ``<col>_anom``  - 1 where the absolute row-to-row difference exceeds
	      three standard deviations of the column, else 0
	    * ``temp_gap``     - ``lub_oil_temp - coolant_temp``
	    * ``pressure_sum`` - sum of the three pressure columns

	    Remaining NaNs (first rows of diff / rolling windows) are replaced by 0.

	    NOTE(review): this class shadows the ``FeatureEngineer`` imported from
	    ``featureengineer`` at the top of the file - confirm which definition the
	    pickled model is supposed to reference.
	    """

	    def fit(self, X, y=None):
	        """Nothing is learned; return ``self`` so the step works in a pipeline."""
	        return self

	    def transform(self, X):
	        """Return a new DataFrame with the engineered columns appended.

	        Parameters
	        ----------
	        X : pandas.DataFrame or array-like
	            Input features. Array-like input is wrapped in a DataFrame whose
	            column labels are the stringified positions.

	        Raises
	        ------
	        KeyError
	            If the normalised columns do not include ``lub_oil_temp``,
	            ``coolant_temp``, ``lub_oil_pressure``, ``fuel_pressure`` and
	            ``coolant_pressure``.
	        """
	        if isinstance(X, pd.DataFrame):
	            df = X.copy()
	        else:
	            # The original branch was empty, leaving ``df`` undefined.
	            df = pd.DataFrame(X)
	        # The ``.str`` accessor below only works on string labels.
	        df.columns = df.columns.astype(str)

	        print("columna names #######################\n",df.columns)
	        df.columns = (df.columns
	                      .str.strip()
	                      .str.replace(" ","_")
	                      .str.replace(r"[^\w]","_",regex=True)
	                      .str.lower()
	                      )
	        print("columna names #######################\n",df.columns)

	        # Snapshot the numeric input columns before any derived column is
	        # added, so all three feature families cover the same columns.
	        core_sensor_cols = df.select_dtypes(include=np.number).columns.tolist()

	        # ===== diff features
	        for col_name in core_sensor_cols:
	            df[f"{col_name}_diff"] = df[col_name].diff()

	        # ===== rolling mean
	        for col_name in core_sensor_cols:
	            df[f"{col_name}_roll5"] = df[col_name].rolling(5).mean()

	        # ===== anomaly flag (3-sigma)
	        for col_name in core_sensor_cols:
	            std = df[col_name].std()
	            if std > 1e-9:  # small epsilon: treat (near-)constant columns as constant
	                df[f"{col_name}_anom"] = (df[col_name].diff().abs() > 3 * std).astype(int)
	            else:
	                df[f"{col_name}_anom"] = 0  # no anomaly if data is constant

	        # ===== aggregates
	        df["temp_gap"] = df['lub_oil_temp'] - df['coolant_temp']  # oil vs coolant
	        df["pressure_sum"] = df[['lub_oil_pressure','fuel_pressure','coolant_pressure']].sum(axis=1)

	        df = df.fillna(0)

	        # A DataFrame is returned so the new column names stay available.
	        return df

	class OutlierCapper(BaseEstimator, TransformerMixin):
	    """Clip every column to the IQR fences learned during ``fit``.

	    For each column the fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``
	    with ``IQR = Q3 - Q1``; they are stored in ``self.bounds`` as a list of
	    ``(low, high)`` tuples, one per column.

	    NOTE(review): this class shadows the ``OutlierCapper`` imported from
	    ``outliercapper`` at the top of the file - confirm which definition the
	    pickled model is supposed to reference.
	    """

	    def fit(self, X, y=None):
	        """Compute the per-column clipping fences from ``X`` and return ``self``."""
	        # Work on a plain float array (also sidesteps pandas percentile warnings).
	        values = np.asarray(X, dtype=float)

	        q1, q3 = np.percentile(values, [25, 75], axis=0)
	        iqr = q3 - q1
	        # The original read ``1.5IQR`` (multiplication sign lost), a syntax error.
	        self.bounds = list(zip(q1 - 1.5 * iqr, q3 + 1.5 * iqr))

	        return self

	    def transform(self, X):
	        """Return a clipped copy of ``X``.

	        A DataFrame input yields a DataFrame with the same columns and index;
	        any other input yields a float numpy array.

	        Raises
	        ------
	        ValueError
	            If ``X`` does not have one column per fitted fence.
	        """
	        # Float copy: never mutate the caller's data, and avoid truncating the
	        # float fences when the input holds integers.
	        values = np.array(X, dtype=float)

	        if values.ndim != 2 or values.shape[1] != len(self.bounds):
	            raise ValueError(
	                f"X has shape {values.shape}, but OutlierCapper was fitted "
	                f"on {len(self.bounds)} columns"
	            )

	        for i, (low, high) in enumerate(self.bounds):
	            values[:, i] = np.clip(values[:, i], low, high)

	        if isinstance(X, pd.DataFrame):
	            # Keep both the column names and the original row index.
	            return pd.DataFrame(values, columns=X.columns, index=X.index)
	        return values

	def create_pipe(model):
	    """Wrap ``model`` in the shared preprocessing pipeline.

	    The returned ``Pipeline`` runs, in order: ``feat`` (FeatureEngineer),
	    ``impute`` (median SimpleImputer), ``outlier`` (OutlierCapper),
	    ``scale`` (RobustScaler) and finally ``model``.
	    """
	    preprocessing_steps = [
	        ("feat", FeatureEngineer()),
	        ("impute", SimpleImputer(strategy="median")),
	        ("outlier", OutlierCapper()),
	        ("scale", RobustScaler()),
	    ]
	    return Pipeline(preprocessing_steps + [("model", model)])

	# Work on a copy of the training features and normalise the column names:
	# strip, spaces / non-word characters -> "_", lower-case.
	df = X_train.copy()
	df.columns = (df.columns
	              .str.strip()
	              .str.replace(" ","_")
	              .str.replace(r"[^\w]","_",regex=True)
	              .str.lower()
	              )
	print("printing 10 row",df.head(10))

	# Features (X) and target (y) used by every model below.
	Xtrain = df.copy()
	ytrain = y_train.copy()

	# Label column names get the same clean-up, except that case is kept.
	ytrain.columns = (ytrain.columns
	                  .str.strip()
	                  .str.replace(" ","_")
	                  .str.replace(r"[^\w]","_",regex=True)
	                  )
	# Test features must carry exactly the same column names as Xtrain.
	Xtest.columns = (Xtest.columns
	                 .str.strip()
	                 .str.replace(" ","_")
	                 .str.replace(r"[^\w]","_",regex=True)
	                 .str.lower()
	                 )
	print("########################### independent, dependent varial split completed ################################")

	# Column-name lists by dtype (not referenced again in the visible part of the script).
	num_feat_cols = Xtrain.select_dtypes(include=[np.number]).columns.tolist()
	cat_feat_cols = Xtrain.select_dtypes(include=['object']).columns.tolist()


	print("########################### test train split completed ################################")

	print("########################### preprocessing creation completed ################################")

	# Negative-to-positive class ratio. Count on the 1-D label column: calling
	# value_counts() on the one-column DataFrame yields a MultiIndex-keyed
	# Series, so .get(0) / .get(1) would not be plain label lookups.
	# The .get defaults keep this safe when a class is missing.
	class_counts = ytrain.iloc[:, 0].value_counts()
	class_weight = class_counts.get(0, 0) / class_counts.get(1, 1)
	print("class_weight distribution",class_weight)

	# hyper parameter for DT

	def objective_dt(trial):
	    """Optuna objective for the decision tree.

	    Samples tree hyper-parameters from ``trial``, wraps the model in the
	    shared preprocessing pipeline and returns the mean recall over a
	    5-fold cross-validation on ``Xtrain`` / ``ytrain``.
	    """
	    params = {
	        "max_depth": trial.suggest_int("max_depth", 2, 15),
	        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
	        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
	        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy"]),
	        "class_weight": 'balanced',
	        "random_state": 42
	    }

	    model = DecisionTreeClassifier(**params)

	    pipeline = create_pipe(model)
	    # Pass the label column as a 1-D Series, as the RF and XGB objectives do;
	    # the original handed over the whole one-column DataFrame.
	    score = cross_val_score(
	        pipeline, Xtrain, ytrain.iloc[:, 0],
	        cv=5, scoring="recall"
	    ).mean()

	    return score

	# Search the decision-tree hyper-parameters (maximising CV recall).
	study_dt = optuna.create_study(direction="maximize")
	study_dt.optimize(objective_dt, n_trials=25)

	# best_params only holds the sampled values, so the fixed settings used
	# during tuning (class_weight, random_state) must be repeated here; the
	# original omitted random_state, so the refit was not the tuned configuration.
	best_dt = DecisionTreeClassifier(
	    **study_dt.best_params, class_weight="balanced", random_state=42
	)
	best_dt_pipeline = create_pipe(best_dt)
	best_dt_pipeline.fit(Xtrain, ytrain.iloc[:,0])  # labels as a 1-D Series
	best_dt = best_dt_pipeline  # from here on best_dt is the fitted pipeline
	print("Decision Tree best parameters",study_dt.best_params)

	# Evaluate on the hold-out set.
	y_pred_dt = best_dt_pipeline.predict(Xtest)
	y_pred_proba_dt = best_dt_pipeline.predict_proba(Xtest)[:,1]

	acc_dt = accuracy_score(ytest, y_pred_dt)
	f1_dt = f1_score(ytest, y_pred_dt)
	rec_dt = recall_score(ytest, y_pred_dt)
	pre_dt = precision_score(ytest, y_pred_dt)
	roc_dt = roc_auc_score(ytest, y_pred_proba_dt)  # AUC needs scores, not labels
	cl_rep_dt = classification_report(ytest, y_pred_dt)
	con_rep_dt = confusion_matrix(ytest, y_pred_dt)


	# One-row summary table ("f1_score" and "f1score" both carry the F1 value).
	modelperf_dt = pd.DataFrame([{
	    "Model":"Decision Tree",
	    "Accuracy":acc_dt,
	    "f1_score":f1_dt,
	    "recall":rec_dt,
	    "precision":pre_dt,
	    "f1score":f1_dt,
	    "roc":roc_dt

	}])
	print(modelperf_dt)
	print("########################### Decision tree completed ################################")

	# rf hyper parameter tuning

	def objective_rf(trial):
	    """Optuna objective for the random forest.

	    Draws the forest hyper-parameters from ``trial`` and returns the mean
	    recall of the preprocessing + forest pipeline over 5-fold CV.
	    """
	    forest = RandomForestClassifier(
	        n_estimators=trial.suggest_int("n_estimators", 100, 500),
	        max_depth=trial.suggest_int("max_depth", 5, 20),
	        min_samples_split=trial.suggest_int("min_samples_split", 2, 15),
	        min_samples_leaf=trial.suggest_int("min_samples_leaf", 1, 10),
	        max_features=trial.suggest_categorical("max_features", ["sqrt", "log2"]),
	        class_weight="balanced",
	        random_state=42,
	        n_jobs=-1,
	    )

	    # Labels are passed as a 1-D Series (first column of ytrain).
	    fold_recalls = cross_val_score(
	        create_pipe(forest),
	        Xtrain,
	        ytrain.iloc[:,0],
	        cv=5,
	        scoring="recall",
	    )
	    return fold_recalls.mean()

	# Search the random-forest hyper-parameters (maximising CV recall).
	study_rf = optuna.create_study(direction="maximize")
	study_rf.optimize(objective_rf, n_trials=25)

	# Repeat the fixed settings used during tuning: best_params only contains
	# the sampled values, and the original refit dropped random_state / n_jobs.
	best_rf = RandomForestClassifier(
	    **study_rf.best_params, class_weight="balanced", random_state=42, n_jobs=-1
	)
	best_rf_pipeline = create_pipe(best_rf)
	best_rf_pipeline.fit(Xtrain, ytrain.iloc[:,0])  # labels as a 1-D Series
	best_rf = best_rf_pipeline  # from here on best_rf is the fitted pipeline
	print("Random Forest best parameters",study_rf.best_params)

	# Evaluate on the hold-out set.
	y_pred_rf = best_rf_pipeline.predict(Xtest)
	y_pred_proba_rf = best_rf_pipeline.predict_proba(Xtest)[:,1]

	acc_rf = accuracy_score(ytest, y_pred_rf)
	f1_rf = f1_score(ytest, y_pred_rf)
	rec_rf = recall_score(ytest, y_pred_rf)
	pre_rf = precision_score(ytest, y_pred_rf)
	roc_rf = roc_auc_score(ytest, y_pred_proba_rf)  # AUC needs scores, not labels
	cl_rep_rf = classification_report(ytest, y_pred_rf)
	con_rep_rr = confusion_matrix(ytest, y_pred_rf)

	# One-row summary table ("f1_score" and "f1score" both carry the F1 value).
	modelperf_rf = pd.DataFrame([{
	    "Model":"Random Forest",
	    "Accuracy":acc_rf,
	    "f1_score":f1_rf,
	    "recall":rec_rf,
	    "precision":pre_rf,
	    "f1score":f1_rf,
	    "roc":roc_rf

	}])
	print(modelperf_rf)

	print("########################### RandomForest completed ################################")

	# XGB optuna hyperparameter tuning


	def objective_xgb(trial):
	    """Optuna objective for XGBoost.

	    Draws the booster hyper-parameters from ``trial`` and returns the mean
	    recall of the preprocessing + XGBoost pipeline over 5-fold CV.
	    """
	    booster = XGBClassifier(
	        n_estimators=trial.suggest_int("n_estimators", 200, 600),
	        max_depth=trial.suggest_int("max_depth", 3, 10),
	        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
	        subsample=trial.suggest_float("subsample", 0.6, 1.0),
	        colsample_bytree=trial.suggest_float("colsample_bytree", 0.6, 1.0),
	        gamma=trial.suggest_float("gamma", 0, 5),
	        reg_alpha=trial.suggest_float("reg_alpha", 0, 5),
	        reg_lambda=trial.suggest_float("reg_lambda", 0, 5),
	        eval_metric="logloss",
	        random_state=42,
	    )

	    # Labels are passed as a 1-D Series (first column of ytrain).
	    fold_recalls = cross_val_score(
	        create_pipe(booster),
	        Xtrain,
	        ytrain.iloc[:,0],
	        cv=5,
	        scoring="recall",
	    )
	    return fold_recalls.mean()

	# Search the XGBoost hyper-parameters (maximising CV recall).
	study_xgb = optuna.create_study(direction="maximize")
	study_xgb.optimize(objective_xgb, n_trials=25)

	# Repeat the fixed settings used during tuning: best_params only contains
	# the sampled values, and the original refit dropped eval_metric / random_state.
	best_xgb = XGBClassifier(
	    **study_xgb.best_params, eval_metric="logloss", random_state=42
	)
	best_xgb_pipeline = create_pipe(best_xgb)
	best_xgb_pipeline.fit(Xtrain, ytrain.iloc[:,0])  # labels as a 1-D Series
	best_xgb = best_xgb_pipeline  # from here on best_xgb is the fitted pipeline
	print("XGBoost best parameters",study_xgb.best_params)

	# Evaluate on the hold-out set.
	y_pred_xgb = best_xgb_pipeline.predict(Xtest)
	y_pred_proba_xgb = best_xgb_pipeline.predict_proba(Xtest)[:,1]

	acc_xgb = accuracy_score(ytest, y_pred_xgb)
	f1_xgb = f1_score(ytest, y_pred_xgb)
	rec_xgb = recall_score(ytest, y_pred_xgb)
	pre_xgb = precision_score(ytest, y_pred_xgb)
	roc_xgb = roc_auc_score(ytest, y_pred_proba_xgb)  # AUC needs scores, not labels
	cl_rep_xgb = classification_report(ytest, y_pred_xgb)
	con_rep_xgb = confusion_matrix(ytest, y_pred_xgb)

	# One-row summary table ("f1_score" and "f1score" both carry the F1 value).
	modelperf_xgb = pd.DataFrame([{
	    "Model":"XGBoost",
	    "Accuracy":acc_xgb,
	    "f1_score":f1_xgb,
	    "recall":rec_xgb,
	    "precision":pre_xgb,
	    "f1score":f1_xgb,
	    "roc":roc_xgb

	}])
	print(modelperf_xgb)

	print("########################### XGboost completed completed ################################")


	# Soft-voting ensemble over the three tuned pipelines; the weights favour
	# XGBoost (3) over the random forest (2) over the decision tree (1).
	voting_model = VotingClassifier(
	    estimators=[
	        ("dt", best_dt),
	        ("rf", best_rf),
	        ("xgb", best_xgb)
	    ],
	    voting="soft",
	    weights=[1, 2, 3]
	)

	voting_model.fit(Xtrain, ytrain.iloc[:,0])  # labels as a 1-D Series
	print("########################### voting completed ################################")
	print("voting score")
	# Print each base pipeline's positive-class probabilities on the test set.
	# Each estimator is a full pipeline, so Xtest goes through its preprocessing.
	for name, model in voting_model.named_estimators_.items():
	    probs = model.predict_proba(Xtest)[:,1]
	    print(name,probs)

	# Evaluate the ensemble on the hold-out set.
	y_pred = voting_model.predict(Xtest)
	y_pred_proba = voting_model.predict_proba(Xtest)[:,1]
	acc = accuracy_score(ytest, y_pred)
	f1 = f1_score(ytest, y_pred,pos_label=1)
	rec = recall_score(ytest, y_pred,pos_label=1)
	pre = precision_score(ytest, y_pred,pos_label=1)
	# ROC AUC is a ranking metric: feed it probabilities, as the other model
	# sections do. The original passed the hard 0/1 predictions.
	roc = roc_auc_score(ytest, y_pred_proba)

	pref_df = pd.DataFrame([{
	    "Accuracy":acc,
	    "f1_score":f1,
	    "recall":rec,
	    "precision":pre
	    ,"roc_auc":roc
	}])
	print("performance\n",pref_df)


	# Stacked ensemble: the three tuned pipelines feed a logistic-regression
	# meta-learner (5-fold internal CV, base features not passed through).
	stack_model = StackingClassifier(
	    estimators=[("dt", best_dt), ("rf", best_rf), ("xgb", best_xgb)],
	    final_estimator=LogisticRegression(),
	    passthrough=False,
	    cv=5,
	    verbose=1,
	)
	stack_model.fit(Xtrain, ytrain.iloc[:,0])  # labels as a 1-D Series
	print("########################### stacking completed ################################")

	# Hold-out predictions: hard labels and positive-class probabilities.
	y_pred = stack_model.predict(Xtest)
	y_pred_proba = stack_model.predict_proba(Xtest)[:,1]

	# Hold-out metrics.
	acc = accuracy_score(ytest, y_pred)
	f1 = f1_score(ytest, y_pred)
	rec = recall_score(ytest, y_pred)
	pre = precision_score(ytest, y_pred)
	roc = roc_auc_score(ytest, y_pred_proba)
	cl_rep = classification_report(ytest, y_pred)
	con_rep = confusion_matrix(ytest, y_pred)
	f1_scr = f1  # same F1 value, reported a second time at the end

	print("accuracy score",acc)
	print("f1 score",f1)
	print("recall score",rec)
	print("precision score",pre)
	print("roc auc score",roc)
	print("\n classification_report\n", cl_rep)
	print("\nconfusion_matrix\n", con_rep)
	print("f1_score",f1_scr)

	# Meta-learner coefficients, one column per base estimator.
	base_estimator_names = [entry[0] for entry in stack_model.estimators]
	co_eff = pd.DataFrame(
	    stack_model.final_estimator_.coef_,
	    columns=base_estimator_names,
	)
	print("stack estimator co-err \n",co_eff)

	# Compare voting and stacking with the same stratified 5-fold CV on five metrics.
	cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

	scoring = {
	    "accuracy":"accuracy",
	    "f1":"f1",
	    "recall":"recall",
	    "precision":"precision",
	    "roc_auc":"roc_auc"
	}
	vote_cv = cross_validate(voting_model,Xtrain,ytrain.iloc[:,0],cv=cv,scoring=scoring)
	stack_cv = cross_validate(stack_model,Xtrain,ytrain.iloc[:,0],cv=cv,scoring=scoring)

	# Rows are metrics, columns are the two ensembles; values are fold means.
	results = pd.DataFrame({
	    "voting":{
	        k: np.mean(vote_cv[f"test_{k}"]) for k in scoring
	    },
	    "stacking":{
	        k: np.mean(stack_cv[f"test_{k}"]) for k in scoring
	    }}
	)

	print("model evaluation results \n",results)

	# Selection rule: primary recall, secondary f1, tie-break roc-auc.
	# Comparing the metric tuples applies the criteria in that order; the
	# original compared recall only. A complete tie keeps the voting model.
	ranking_metrics = ["recall", "f1", "roc_auc"]
	stacking_rank = tuple(results.loc[ranking_metrics, "stacking"])
	voting_rank = tuple(results.loc[ranking_metrics, "voting"])
	stacking_wins = stacking_rank > voting_rank
	best_model = stack_model if stacking_wins else voting_model
	best_model_name = "Stacking" if stacking_wins else "Voting"

	# Refit the winner on the full training set and score the hold-out set.
	best_model.fit(Xtrain,ytrain.iloc[:,0])  # labels as a 1-D Series
	y_pred = best_model.predict(Xtest)
	y_prob = best_model.predict_proba(Xtest)[:,1]
	print("selected model: ",best_model_name)

	# Keep the winner's parameters in a table (the original built this frame
	# and discarded it), then dump each base model's parameters.
	params = best_model.get_params()
	best_model_params = pd.DataFrame(params.items(),columns=['parameter','value'])
	for name,model in best_model.named_estimators_.items():
	    print(f"\n * Base model - {name}")
	    pprint(model.get_params())


	# Hold-out report for the selected model (FP / FN evaluation).
	# "\|" was an invalid escape sequence; a plain "|" is intended.
	print("best slected model | classification report \n",classification_report(ytest, y_pred))
	print("best slected model | confusion matrix \n",confusion_matrix(ytest, y_pred))

	### Feature-importance analysis on the tuned XGBoost pipeline
	best_xgb_pipeline.fit(Xtrain, ytrain.iloc[:,0])  # labels as a 1-D Series
	fitted_steps = best_xgb_pipeline.named_steps
	# The bare XGBoost estimator is what the SHAP tree explainer needs.
	xgb_mdl = fitted_steps["model"]

	# Replay the preprocessing steps by hand so the model's exact input matrix
	# is available: feature engineering first, then imputation and outlier
	# capping, and finally scaling.
	Xtrain_transformed_df = fitted_steps["feat"].transform(Xtrain)
	for step_name in ("impute", "outlier"):
	    Xtrain_transformed_df = fitted_steps[step_name].transform(Xtrain_transformed_df)
	Xtrain_transformed = fitted_steps["scale"].transform(Xtrain_transformed_df)

	# Corrected: Generate feature names explicitly after FeatureEngineer and other steps
	def get_feature_names(original_cols):
	    """Return the feature names in the order the feature-engineering step emits them.

	    The result lists the original columns, then ``<col>_diff`` for each
	    column, then ``<col>_roll5``, then ``<col>_anom``, followed by
	    ``temp_gap`` and ``pressure_sum``.

	    Parameters
	    ----------
	    original_cols : iterable of str
	        Input column names. Any iterable is accepted (list, tuple, pandas
	        Index, generator); it is never modified.

	    Returns
	    -------
	    list of str
	    """
	    # Materialise once: the original sliced and appended, which only worked
	    # for real lists, and a generator could not be replayed per suffix.
	    base_cols = list(original_cols)

	    feature_names = list(base_cols)
	    for suffix in ("diff", "roll5", "anom"):
	        feature_names.extend(f"{col}_{suffix}" for col in base_cols)
	    feature_names.extend(["temp_gap", "pressure_sum"])
	    return feature_names

	# Names of the columns of the transformed training matrix.
	original_feature_cols = Xtrain.columns.tolist()
	fea_name = get_feature_names(original_feature_cols)

	# SHAP values of the XGBoost model on the transformed training matrix.
	explain = shap.TreeExplainer(xgb_mdl)
	shap_values = explain.shap_values(Xtrain_transformed)

	# Default summary plot; the data is wrapped in a DataFrame so the
	# feature names travel with it.
	shap.summary_plot(
	    shap_values,
	    pd.DataFrame(Xtrain_transformed, columns=fea_name),
	    feature_names=fea_name,
	)

	## Bar summary plot, kept open (show=False) so each bar can be annotated.
	shap.summary_plot(
	    shap_values,
	    pd.DataFrame(Xtrain_transformed, columns=fea_name),
	    feature_names=fea_name,
	    plot_type="bar",
	    show=False,
	)
	ax = plt.gca()
	for bar in ax.patches:
	    bar_length = bar.get_width()
	    bar_middle = bar.get_y()+bar.get_height()/2
	    # Write the bar's value at its tip, vertically centred on the bar.
	    ax.text(bar_length, bar_middle, f"{bar_length:.2f}", va="center")
	plt.show()


	# Save the model locally
	model_path = "Breakdown_prediction/best_engine_PM_prediction_v1.joblib"
	# joblib.dump does not create missing directories, so make sure the target
	# folder exists before writing.
	os.makedirs(os.path.dirname(model_path), exist_ok=True)
	# Maximum lzma compression; per the original note the uncompressed file is
	# over ~110 MB and this brings it down to roughly 20-40 MB.
	joblib.dump(best_model, model_path,compress=("lzma",9))

	# Log the model artifact
	#mlflow.log_artifact(model_path, artifact_path="model")
	#print(f"Model saved as artifact at: {model_path}")

	# Upload to Hugging Face
	repo_id = "sudhirpgcmma02/Engine_PM"
	repo_type = "model"

	# Step 1: make sure the target repo exists (it is a model repo, not a
	# Space, so the messages now say so).
	try:
	    api.repo_info(repo_id=repo_id, repo_type=repo_type)
	    print(f"Model repo '{repo_id}' already exists. Using it.")
	except RepositoryNotFoundError:
	    print(f"Model repo '{repo_id}' not found. Creating new model repo...")
	    create_repo(repo_id=repo_id, repo_type=repo_type, private=False)
	    print(f"Model repo '{repo_id}' created.")

	# Step 2: upload the file under the same relative path it has locally.
	api.upload_file(
	    path_or_fileobj=model_path,
	    path_in_repo=model_path,
	    repo_id=repo_id,
	    repo_type=repo_type,
	)