import os
import sys
from dataclasses import dataclass

from catboost import CatBoostClassifier
from sklearn.ensemble import (
    AdaBoostClassifier,
    GradientBoostingClassifier,
    RandomForestClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from src.exception import CustomException
from src.logger import logging
from src.utils import save_object, evaluate_models


@dataclass
class ModelTrainerConfig:
    # Where the serialized best model is written.
    trained_model_file_path: str = os.path.join("artifacts", "model.pkl")


class ModelTrainer:
    def __init__(self):
        self.model_trainer_config = ModelTrainerConfig()

    def initiate_model_trainer(self, train_array, test_array):
        try:
            logging.info("Split training and test input data")
            # The target label is the last column of each array.
            X_train, y_train, X_test, y_test = (
                train_array[:, :-1],
                train_array[:, -1],
                test_array[:, :-1],
                test_array[:, -1],
            )
            # Candidate classifiers; hyperparameters are tuned via `params` below.
            models = {
                "Random Forest": RandomForestClassifier(),
                "Decision Tree": DecisionTreeClassifier(),
                "Gradient Boosting": GradientBoostingClassifier(),
                "Logistic Regression": LogisticRegression(max_iter=1000),
                # use_label_encoder is deprecated in recent xgboost and omitted here.
                "XGBClassifier": XGBClassifier(eval_metric="logloss"),
                "CatBoosting Classifier": CatBoostClassifier(verbose=False),
                "AdaBoost Classifier": AdaBoostClassifier(),
            }
            # Hyperparameter search space per model; keys must match `models`.
            params = {
                "Decision Tree": {
                    "criterion": ["gini", "entropy"],
                },
                "Random Forest": {
                    "n_estimators": [16, 32, 64],
                },
                "Gradient Boosting": {
                    "learning_rate": [0.1, 0.01, 0.05],
                    "subsample": [0.7, 0.8, 0.9],
                    "n_estimators": [16, 32, 64],
                },
                "Logistic Regression": {
                    "C": [0.01, 0.1, 1, 10],
                },
                "XGBClassifier": {
                    "learning_rate": [0.1, 0.01, 0.05],
                    "n_estimators": [16, 32, 64],
                },
                "CatBoosting Classifier": {
                    "depth": [4, 6],
                    "learning_rate": [0.01, 0.05, 0.1],
                    "iterations": [50, 100],
                },
                "AdaBoost Classifier": {
                    "learning_rate": [0.1, 0.01, 0.5],
                    "n_estimators": [16, 32, 64],
                },
            }
            model_report: dict = evaluate_models(
                X_train=X_train, y_train=y_train,
                X_test=X_test, y_test=y_test,
                models=models, param=params,
            )
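
            # `evaluate_models` (from src.utils, not shown here) is assumed --
            # based on how its output is consumed below -- to grid-search each
            # model over `param`, refit each entry of `models` in place, and
            # return a dict mapping model name to its test-set score.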

            # Pick the model with the highest reported test score.
            best_model_score = max(model_report.values())
            best_model_name = max(model_report, key=model_report.get)
            best_model = models[best_model_name]

            # If no model clears the threshold, log a warning but still save
            # the best model found, rather than aborting the pipeline over a
            # modest accuracy score. Adjust the threshold as needed.
            if best_model_score < 0.5:
                logging.warning(
                    f"Best model score {best_model_score:.3f} is below 0.5; "
                    "saving best found model anyway."
                )
            else:
                logging.info(
                    f"Best model: {best_model_name} "
                    f"(test score {best_model_score:.3f})"
                )
            save_object(
                file_path=self.model_trainer_config.trained_model_file_path,
                obj=best_model,
            )

            predicted = best_model.predict(X_test)
            accuracy = accuracy_score(y_test, predicted)
            return accuracy

        except Exception as e:
            raise CustomException(e, sys)
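

# A minimal smoke test, assuming the file is run as a script from the project
# root with the src package importable. The synthetic arrays are illustrative
# only; in the real pipeline they would come from the data-transformation
# step, with the target label in the last column.
if __name__ == "__main__":
    import numpy as np

    rng = np.random.default_rng(42)
    X = rng.normal(size=(200, 5))
    y = (X[:, 0] + X[:, 1] > 0).astype(int).reshape(-1, 1)
    data = np.hstack([X, y])  # features plus target column

    trainer = ModelTrainer()
    print("Test accuracy:", trainer.initiate_model_trainer(data[:150], data[150:]))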