# Diabetes/src/model_trainer.py
# Author: Naveen-2007
# Initial deploy: Diabetes Risk Predictor with Enhanced Chatbot (commit 048f639)
import os
import sys
from dataclasses import dataclass
from catboost import CatBoostClassifier
from sklearn.ensemble import (
AdaBoostClassifier,
GradientBoostingClassifier,
RandomForestClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from src.exception import CustomException
from src.logger import logging
from src.utils import save_object,evaluate_models
@dataclass
class ModelTrainerConfig:
    """Configuration for ModelTrainer.

    Note: the field needs a type annotation — without one, @dataclass
    treats it as a plain class variable and generates no __init__
    parameter for it (dataclass fields require annotations).
    """

    # Destination path where the best trained model is pickled.
    trained_model_file_path: str = os.path.join("artifacts", "model.pkl")
class ModelTrainer:
    """Trains several candidate classifiers with hyperparameter search,
    selects the one with the best reported score, persists it to disk,
    and returns its accuracy on the test split."""

    def __init__(self):
        # Holds the artifact path where the chosen model is pickled.
        self.model_trainer_config = ModelTrainerConfig()

    def initiate_model_trainer(self, train_array, test_array):
        """Fit candidate models, save the best one, and return its accuracy.

        Args:
            train_array: 2-D array whose last column is the target label.
            test_array: 2-D array with the same column layout as train_array.

        Returns:
            float: accuracy of the best model on the test split.

        Raises:
            CustomException: wraps any exception raised during training,
                selection, or persistence.
        """
        try:
            logging.info("Split training and test input data")
            # Last column is the label; everything before it is features.
            X_train, y_train, X_test, y_test = (
                train_array[:, :-1],
                train_array[:, -1],
                test_array[:, :-1],
                test_array[:, -1],
            )
            models = {
                "Random Forest": RandomForestClassifier(),
                "Decision Tree": DecisionTreeClassifier(),
                "Gradient Boosting": GradientBoostingClassifier(),
                "Logistic Regression": LogisticRegression(max_iter=1000),
                # NOTE(review): use_label_encoder is deprecated (ignored with
                # a warning in xgboost >= 1.3, removed in 2.x); kept for
                # compatibility with older pins — confirm installed version.
                "XGBClassifier": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
                "CatBoosting Classifier": CatBoostClassifier(verbose=False),
                "AdaBoost Classifier": AdaBoostClassifier(),
            }
            # Hyperparameter grids, keyed by the same names as `models`.
            params = {
                "Decision Tree": {
                    'criterion': ['gini', 'entropy'],
                },
                "Random Forest": {
                    'n_estimators': [16, 32, 64],
                },
                "Gradient Boosting": {
                    'learning_rate': [.1, .01, .05],
                    'subsample': [0.7, 0.8, 0.9],
                    'n_estimators': [16, 32, 64],
                },
                "Logistic Regression": {
                    'C': [0.01, 0.1, 1, 10],
                },
                "XGBClassifier": {
                    'learning_rate': [.1, .01, .05],
                    'n_estimators': [16, 32, 64],
                },
                "CatBoosting Classifier": {
                    'depth': [4, 6],
                    'learning_rate': [0.01, 0.05, 0.1],
                    'iterations': [50, 100],
                },
                "AdaBoost Classifier": {
                    'learning_rate': [.1, .01, 0.5],
                    'n_estimators': [16, 32, 64],
                },
            }
            # evaluate_models is expected to return {model_name: score};
            # presumably it also fits each model in place — confirm in utils.
            model_report: dict = evaluate_models(
                X_train=X_train, y_train=y_train,
                X_test=X_test, y_test=y_test,
                models=models, param=params,
            )
            # Pick the highest-scoring model name directly in one pass;
            # replaces the redundant max(sorted(values)) plus the fragile
            # list(values).index(score) reverse lookup.
            best_model_name = max(model_report, key=model_report.get)
            best_model_score = model_report[best_model_name]
            best_model = models[best_model_name]
            # If no model meets a high threshold, log a warning but still
            # save the best one found rather than aborting the pipeline.
            if best_model_score < 0.5:
                logging.warning(f"Best model score {best_model_score:.3f} is below 0.5; saving best found model anyway.")
            else:
                logging.info("Best found model on both training and testing dataset")
            save_object(
                file_path=self.model_trainer_config.trained_model_file_path,
                obj=best_model,
            )
            # Re-score on the test split to report the final accuracy.
            predicted = best_model.predict(X_test)
            accuracy = accuracy_score(y_test, predicted)
            return accuracy
        except Exception as e:
            raise CustomException(e, sys)