Spaces:

SURESHBEEKHANI
/

Heart_Attack_Prediction

Build error

File size: 6,262 Bytes

46d8269

import os
import sys
from dataclasses import dataclass

# Importing necessary libraries for machine learning models and evaluation
from sklearn.linear_model import LogisticRegression  # type: ignore
from sklearn.tree import DecisionTreeClassifier  # type: ignore
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, VotingClassifier)  # type: ignore
from sklearn.neighbors import KNeighborsClassifier  # type: ignore
from xgboost import XGBClassifier  # type: ignore
from catboost import CatBoostClassifier  # type: ignore
from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV)  # type: ignore
import pandas as pd # type: ignore
import matplotlib.pyplot as plt  # type: ignore
import numpy as np

from ..exception import CustomException
from src.logger import logging
from src.utils import save_object, evaluate_models, model_metrics, print_evaluated_results

@dataclass
class ModelTrainerConfig:
    """Settings consumed by the model trainer.

    Holds the location where the trained model pickle is written.
    """

    # Default output location: the 'artifacts' directory, file 'model.pkl'.
    trained_model_file_path: str = os.path.join("artifacts", "model.pkl")

class ModelTrainer:
    """Train, tune and persist the classifier ensemble.

    The trainer compares a set of baseline classifiers, tunes CatBoost and
    KNN with cross-validated searches, combines them with XGBoost in a
    weighted voting classifier, saves that ensemble and reports its metrics
    on the test data.
    """

    # Metric used to rank hyperparameter candidates. Every tuned estimator is
    # a classifier, so a classification metric is used here rather than the
    # regression-oriented 'r2' scorer.
    _TUNING_SCORING = 'accuracy'

    def __init__(self):
        self.model_trainer_config = ModelTrainerConfig()

    def initiate_model_training(self, train_array: np.ndarray, test_array: np.ndarray):
        """Run the full training pipeline and return the test metrics.

        Args:
            train_array: 2-D array of training data; every column except the
                last is used as a feature and the last column as the target.
            test_array: 2-D array of test data with the same column layout.

        Returns:
            The ``(mae, rmse, r2)`` tuple produced by ``model_metrics`` for
            the voting classifier's predictions on the test data.

        Raises:
            CustomException: Wraps any exception raised during training; the
                original exception is attached as ``__cause__``.
        """
        try:
            logging.info('Splitting dependent and independent variables from train and test data')
            x_train, y_train, x_test, y_test = (
                train_array[:, :-1],
                train_array[:, -1],
                test_array[:, :-1],
                test_array[:, -1]
            )

            models = {
                "Logistic Regression": LogisticRegression(),
                "K-Neighbors Classifier": KNeighborsClassifier(),
                "Decision Tree": DecisionTreeClassifier(),
                "Random Forest Classifier": RandomForestClassifier(),
                "XGB Classifier": XGBClassifier(),
                "CatBoost Classifier": CatBoostClassifier(verbose=False),
                "AdaBoost Classifier": AdaBoostClassifier(),
            }

            # presumably a {model name: score} mapping -- verify against
            # src.utils.evaluate_models
            model_report = evaluate_models(x_train, y_train, x_test, y_test, models)
            logging.info(f'Model Report: {model_report}')

            best_model_name, best_model_score = self.get_best_model(model_report)

            # A weak baseline is only logged; training still continues.
            # NOTE(review): the messages below label the score "R2", but the
            # actual metric is whatever evaluate_models reports -- confirm.
            if best_model_score < 0.6:
                logging.info('Best model has R2 Score less than 60%')

            print(f'Best Model Found: Model Name: {best_model_name}, R2 Score: {best_model_score}')
            print('\n====================================================================================\n')
            logging.info(f'Best Model Found: Model Name: {best_model_name}, R2 Score: {best_model_score}')

            best_cbr = self._tune_catboost(x_train, y_train)
            best_knn = self._tune_knn(x_train, y_train)

            # Create and train Voting Classifier
            logging.info('Voting Classifier training started')
            voting_classifier = VotingClassifier(
                estimators=[('catboost', best_cbr), ('xgb', XGBClassifier()), ('knn', best_knn)],
                weights=[3, 2, 1]
            )
            voting_classifier.fit(x_train, y_train)

            print('Final Model Evaluation:\n')
            print_evaluated_results(x_train, y_train, x_test, y_test, voting_classifier)
            logging.info('Voting Classifier training completed')

            # Save the trained model
            save_object(file_path=self.model_trainer_config.trained_model_file_path, obj=voting_classifier)
            logging.info('Model pickle file saved')

            # Evaluate final model on test data
            y_test_pred = voting_classifier.predict(x_test)
            mae, rmse, r2 = model_metrics(y_test, y_test_pred)
            logging.info(f'Test MAE: {mae}')
            logging.info(f'Test RMSE: {rmse}')
            logging.info(f'Test R2 Score: {r2}')
            logging.info('Final Model Training Completed')

            return mae, rmse, r2

        except Exception as e:
            # Log at error level with the traceback, and chain the original
            # exception so the root cause stays visible to callers.
            logging.exception('Exception occurred during model training')
            raise CustomException(e, sys) from e

    def _tune_catboost(self, x_train, y_train):
        """Run a randomized search over CatBoost settings; return the best estimator."""
        logging.info('Hyperparameter tuning started for CatBoost')
        cbr = CatBoostClassifier(verbose=False)
        param_dist = {
            'depth': [4, 5, 6, 7, 8, 9, 10],
            'learning_rate': [0.01, 0.02, 0.03, 0.04],
            'iterations': [300, 400, 500, 600]
        }
        rscv = RandomizedSearchCV(
            cbr, param_dist, scoring=self._TUNING_SCORING, cv=5, n_jobs=-1
        )
        rscv.fit(x_train, y_train)

        # Print the tuned parameters and score
        print(f'Best CatBoost Parameters: {rscv.best_params_}')
        print(f'Best CatBoost Score: {rscv.best_score_}')
        print('\n====================================================================================\n')

        best_cbr = rscv.best_estimator_
        logging.info('Hyperparameter tuning complete for CatBoost')
        return best_cbr

    def _tune_knn(self, x_train, y_train):
        """Run a grid search over ``n_neighbors`` for KNN; return the best estimator."""
        logging.info('Hyperparameter tuning started for KNN')
        knn = KNeighborsClassifier()
        param_grid = {'n_neighbors': list(range(2, 31))}
        grid = GridSearchCV(
            knn, param_grid, cv=5, scoring=self._TUNING_SCORING, n_jobs=-1
        )
        grid.fit(x_train, y_train)

        # Print the tuned parameters and score
        print(f'Best KNN Parameters: {grid.best_params_}')
        print(f'Best KNN Score: {grid.best_score_}')
        print('\n====================================================================================\n')

        best_knn = grid.best_estimator_
        logging.info('Hyperparameter tuning complete for KNN')
        return best_knn

    def get_best_model(self, model_report: dict) -> tuple:
        """Return the ``(name, score)`` pair with the highest score.

        Args:
            model_report: Mapping of model name to its score.

        Returns:
            A tuple of the best model's name and its score. When several
            models share the top score, the first one in the mapping's
            iteration order is returned.

        Raises:
            ValueError: If ``model_report`` is empty.
        """
        if not model_report:
            raise ValueError('model_report is empty; no models were evaluated')
        best_model_name = max(model_report, key=model_report.get)
        best_model_score = model_report[best_model_name]
        return best_model_name, best_model_score