SURESHBEEKHANI's picture
Upload 29 files
46d8269 verified
import os
import sys
from dataclasses import dataclass
# Importing necessary libraries for machine learning models and evaluation
from sklearn.linear_model import LogisticRegression # type: ignore
from sklearn.tree import DecisionTreeClassifier # type: ignore
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, VotingClassifier) # type: ignore
from sklearn.neighbors import KNeighborsClassifier # type: ignore
from xgboost import XGBClassifier # type: ignore
from catboost import CatBoostClassifier # type: ignore
from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV) # type: ignore
import pandas as pd # type: ignore
import matplotlib.pyplot as plt # type: ignore
import numpy as np
from ..exception import CustomException
from src.logger import logging
from src.utils import save_object, evaluate_models, model_metrics, print_evaluated_results
@dataclass
class ModelTrainerConfig:
    """Configuration for Model Trainer.

    Holds the filesystem location used when the trained model is persisted.
    """

    # Relative path (joined with the platform separator) of the pickled model
    # written at the end of training.
    trained_model_file_path: str = os.path.join('artifacts', 'model.pkl')
class ModelTrainer:
    """Class for training machine learning models.

    Evaluates a set of baseline classifiers, tunes CatBoost and KNN with
    cross-validated search, fits a weighted voting ensemble from the tuned
    estimators plus a default XGBoost classifier, and saves that ensemble.
    """

    def __init__(self):
        self.model_trainer_config = ModelTrainerConfig()

    def initiate_model_training(self, train_array: np.ndarray, test_array: np.ndarray,
                                tuning_scoring: str = 'r2'):
        """Initiates model training process.

        Args:
            train_array: 2-D array whose last column is the target and whose
                remaining columns are the features.
            test_array: 2-D array laid out like ``train_array``.
            tuning_scoring: Scoring name passed to the CatBoost and KNN
                hyperparameter searches. Defaults to ``'r2'`` to keep the
                historical behaviour.
                NOTE(review): R2 is a regression metric while every estimator
                here is a classifier; a classification scorer such as
                ``'accuracy'`` is probably more appropriate - confirm and
                consider changing the default.

        Returns:
            The ``(mae, rmse, r2)`` triple produced by ``model_metrics`` for
            the voting classifier's predictions on the test features.

        Raises:
            CustomException: Wraps any exception raised during training.
        """
        try:
            logging.info('Splitting dependent and independent variables from train and test data')
            x_train, y_train = train_array[:, :-1], train_array[:, -1]
            x_test, y_test = test_array[:, :-1], test_array[:, -1]

            models = {
                "Logistic Regression": LogisticRegression(),
                "K-Neighbors Classifier": KNeighborsClassifier(),
                "Decision Tree": DecisionTreeClassifier(),
                "Random Forest Classifier": RandomForestClassifier(),
                "XGB Classifier": XGBClassifier(),
                "CatBoost Classifier": CatBoostClassifier(verbose=False),
                "AdaBoost Classifier": AdaBoostClassifier(),
            }

            # evaluate_models is a project helper; its result is consumed as a
            # {model name: score} mapping by get_best_model below.
            model_report = evaluate_models(x_train, y_train, x_test, y_test, models)
            logging.info(f'Model Report: {model_report}')

            best_model_name, best_model_score = self.get_best_model(model_report)

            # Check if the best model score is satisfactory. A low score is
            # only reported; training continues regardless.
            if best_model_score < 0.6:
                logging.info('Best model has R2 Score less than 60%')

            print(f'Best Model Found: Model Name: {best_model_name}, R2 Score: {best_model_score}')
            print('\n====================================================================================\n')
            logging.info(f'Best Model Found: Model Name: {best_model_name}, R2 Score: {best_model_score}')

            best_cbr = self._tune_catboost(x_train, y_train, tuning_scoring)
            best_knn = self._tune_knn(x_train, y_train, tuning_scoring)

            # Create and train Voting Classifier. The weights favour the tuned
            # CatBoost model, then the default XGBoost model, then tuned KNN.
            logging.info('Voting Classifier training started')
            voting_classifier = VotingClassifier(
                estimators=[('catboost', best_cbr), ('xgb', XGBClassifier()), ('knn', best_knn)],
                weights=[3, 2, 1]
            )
            voting_classifier.fit(x_train, y_train)

            print('Final Model Evaluation:\n')
            print_evaluated_results(x_train, y_train, x_test, y_test, voting_classifier)
            logging.info('Voting Classifier training completed')

            # Save the trained model
            save_object(file_path=self.model_trainer_config.trained_model_file_path, obj=voting_classifier)
            logging.info('Model pickle file saved')

            # Evaluate final model on test data
            y_test_pred = voting_classifier.predict(x_test)
            mae, rmse, r2 = model_metrics(y_test, y_test_pred)
            logging.info(f'Test MAE: {mae}')
            logging.info(f'Test RMSE: {rmse}')
            logging.info(f'Test R2 Score: {r2}')
            logging.info('Final Model Training Completed')

            return mae, rmse, r2
        except Exception as e:
            # Log at ERROR level with the traceback, and chain the original
            # exception so the root cause is not lost.
            logging.exception('Exception occurred during model training')
            raise CustomException(e, sys) from e

    def _tune_catboost(self, x_train, y_train, scoring):
        """Run a randomized hyperparameter search for CatBoost and return the best estimator."""
        logging.info('Hyperparameter tuning started for CatBoost')
        cbr = CatBoostClassifier(verbose=False)
        param_dist = {
            'depth': [4, 5, 6, 7, 8, 9, 10],
            'learning_rate': [0.01, 0.02, 0.03, 0.04],
            'iterations': [300, 400, 500, 600]
        }
        rscv = RandomizedSearchCV(cbr, param_dist, scoring=scoring, cv=5, n_jobs=-1)
        rscv.fit(x_train, y_train)

        # Print the tuned parameters and score
        print(f'Best CatBoost Parameters: {rscv.best_params_}')
        print(f'Best CatBoost Score: {rscv.best_score_}')
        print('\n====================================================================================\n')

        logging.info('Hyperparameter tuning complete for CatBoost')
        return rscv.best_estimator_

    def _tune_knn(self, x_train, y_train, scoring):
        """Run an exhaustive grid search over n_neighbors for KNN and return the best estimator."""
        logging.info('Hyperparameter tuning started for KNN')
        knn = KNeighborsClassifier()
        param_grid = {'n_neighbors': list(range(2, 31))}
        grid = GridSearchCV(knn, param_grid, cv=5, scoring=scoring, n_jobs=-1)
        grid.fit(x_train, y_train)

        # Print the tuned parameters and score
        print(f'Best KNN Parameters: {grid.best_params_}')
        print(f'Best KNN Score: {grid.best_score_}')
        print('\n====================================================================================\n')

        logging.info('Hyperparameter tuning complete for KNN')
        return grid.best_estimator_

    def get_best_model(self, model_report: dict):
        """Get the best model based on the evaluation report.

        Args:
            model_report: Mapping of model name to its score.

        Returns:
            A ``(best_model_name, best_model_score)`` tuple for the entry with
            the highest score. On ties the first such entry in the mapping's
            iteration order is returned.

        Raises:
            ValueError: If ``model_report`` is empty or ``None``.
        """
        if not model_report:
            # Fail with an explicit message instead of the opaque
            # "max() arg is an empty sequence".
            raise ValueError('model_report is empty; no models were evaluated')
        best_model_name = max(model_report, key=model_report.get)
        return best_model_name, model_report[best_model_name]