# Diabetes/src/model_trainer.py
# Author: Naveen-2007
# Initial deploy: Diabetes Risk Predictor with Enhanced Chatbot (commit 048f639)
import os
import sys
from dataclasses import dataclass
from catboost import CatBoostClassifier
from sklearn.ensemble import (
AdaBoostClassifier,
GradientBoostingClassifier,
RandomForestClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from src.exception import CustomException
from src.logger import logging
from src.utils import save_object,evaluate_models
@dataclass
class ModelTrainerConfig:
    """Configuration for ModelTrainer.

    Note: the field needs a type annotation — without one, @dataclass
    treats it as a plain class variable and generates no __init__
    parameter for it (dataclass fields require annotations).
    """

    # Destination path where the best trained model is pickled.
    trained_model_file_path: str = os.path.join("artifacts", "model.pkl")
class ModelTrainer:
    """Trains several candidate classifiers with hyperparameter search,
    selects the one with the best reported score, persists it to disk,
    and returns its accuracy on the test split."""

    def __init__(self):
        # Holds the artifact path where the chosen model is pickled.
        self.model_trainer_config = ModelTrainerConfig()

    def initiate_model_trainer(self, train_array, test_array):
        """Fit candidate models, save the best one, and return its accuracy.

        Args:
            train_array: 2-D array whose last column is the target label.
            test_array: 2-D array with the same column layout as train_array.

        Returns:
            float: accuracy of the best model on the test split.

        Raises:
            CustomException: wraps any exception raised during training,
                selection, or persistence.
        """
        try:
            logging.info("Split training and test input data")
            # Last column is the label; everything before it is features.
            X_train, y_train, X_test, y_test = (
                train_array[:, :-1],
                train_array[:, -1],
                test_array[:, :-1],
                test_array[:, -1],
            )
            models = {
                "Random Forest": RandomForestClassifier(),
                "Decision Tree": DecisionTreeClassifier(),
                "Gradient Boosting": GradientBoostingClassifier(),
                "Logistic Regression": LogisticRegression(max_iter=1000),
                # NOTE(review): use_label_encoder is deprecated (ignored with
                # a warning in xgboost >= 1.3, removed in 2.x); kept for
                # compatibility with older pins — confirm installed version.
                "XGBClassifier": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
                "CatBoosting Classifier": CatBoostClassifier(verbose=False),
                "AdaBoost Classifier": AdaBoostClassifier(),
            }
            # Hyperparameter grids, keyed by the same names as `models`.
            params = {
                "Decision Tree": {
                    'criterion': ['gini', 'entropy'],
                },
                "Random Forest": {
                    'n_estimators': [16, 32, 64],
                },
                "Gradient Boosting": {
                    'learning_rate': [.1, .01, .05],
                    'subsample': [0.7, 0.8, 0.9],
                    'n_estimators': [16, 32, 64],
                },
                "Logistic Regression": {
                    'C': [0.01, 0.1, 1, 10],
                },
                "XGBClassifier": {
                    'learning_rate': [.1, .01, .05],
                    'n_estimators': [16, 32, 64],
                },
                "CatBoosting Classifier": {
                    'depth': [4, 6],
                    'learning_rate': [0.01, 0.05, 0.1],
                    'iterations': [50, 100],
                },
                "AdaBoost Classifier": {
                    'learning_rate': [.1, .01, 0.5],
                    'n_estimators': [16, 32, 64],
                },
            }
            # evaluate_models is expected to return {model_name: score};
            # presumably it also fits each model in place — confirm in utils.
            model_report: dict = evaluate_models(
                X_train=X_train, y_train=y_train,
                X_test=X_test, y_test=y_test,
                models=models, param=params,
            )
            # Pick the highest-scoring model name directly in one pass;
            # replaces the redundant max(sorted(values)) plus the fragile
            # list(values).index(score) reverse lookup.
            best_model_name = max(model_report, key=model_report.get)
            best_model_score = model_report[best_model_name]
            best_model = models[best_model_name]
            # If no model meets a high threshold, log a warning but still
            # save the best one found rather than aborting the pipeline.
            if best_model_score < 0.5:
                logging.warning(f"Best model score {best_model_score:.3f} is below 0.5; saving best found model anyway.")
            else:
                logging.info("Best found model on both training and testing dataset")
            save_object(
                file_path=self.model_trainer_config.trained_model_file_path,
                obj=best_model,
            )
            # Re-score on the test split to report the final accuracy.
            predicted = best_model.predict(X_test)
            accuracy = accuracy_score(y_test, predicted)
            return accuracy
        except Exception as e:
            raise CustomException(e, sys)