"""Model trainer module.

Fits a set of candidate classifiers, tunes CatBoost and KNN via
hyperparameter search, combines them into a weighted VotingClassifier,
and saves the trained ensemble to the artifacts directory.
"""
import os
import sys
from dataclasses import dataclass
# Importing necessary libraries for machine learning models and evaluation
from sklearn.linear_model import LogisticRegression # type: ignore
from sklearn.tree import DecisionTreeClassifier # type: ignore
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier, VotingClassifier) # type: ignore
from sklearn.neighbors import KNeighborsClassifier # type: ignore
from xgboost import XGBClassifier # type: ignore
from catboost import CatBoostClassifier # type: ignore
from sklearn.model_selection import (GridSearchCV, RandomizedSearchCV) # type: ignore
import pandas as pd # type: ignore
import matplotlib.pyplot as plt # type: ignore
import numpy as np
from ..exception import CustomException
from src.logger import logging
from src.utils import save_object, evaluate_models, model_metrics, print_evaluated_results
@dataclass
class ModelTrainerConfig:
    """Configuration values for the model-training stage."""

    # Path where the trained (pickled) model artifact is written.
    trained_model_file_path: str = os.path.join('artifacts', 'model.pkl')
class ModelTrainer:
    """Trains candidate classifiers, tunes the strongest ones, and persists
    a weighted VotingClassifier ensemble built from them."""

    def __init__(self):
        # Holds the artifact path the trained model is pickled to.
        self.model_trainer_config = ModelTrainerConfig()

    def initiate_model_training(self, train_array: np.ndarray, test_array: np.ndarray):
        """Run the full training pipeline and persist the final model.

        Args:
            train_array: 2-D array whose last column is the target.
            test_array: 2-D array whose last column is the target.

        Returns:
            Tuple ``(mae, rmse, r2)`` computed on the test split.

        Raises:
            CustomException: wraps any exception raised during training.
        """
        try:
            logging.info('Splitting dependent and independent variables from train and test data')
            # The label is stored in the last column of both arrays.
            x_train, y_train, x_test, y_test = (
                train_array[:, :-1],
                train_array[:, -1],
                test_array[:, :-1],
                test_array[:, -1],
            )

            # Candidate models evaluated with default hyperparameters.
            models = {
                "Logistic Regression": LogisticRegression(),
                "K-Neighbors Classifier": KNeighborsClassifier(),
                "Decision Tree": DecisionTreeClassifier(),
                "Random Forest Classifier": RandomForestClassifier(),
                "XGB Classifier": XGBClassifier(),
                "CatBoost Classifier": CatBoostClassifier(verbose=False),
                "AdaBoost Classifier": AdaBoostClassifier(),
            }
            model_report = evaluate_models(x_train, y_train, x_test, y_test, models)
            logging.info(f'Model Report: {model_report}')

            best_model_name, best_model_score = self.get_best_model(model_report)
            # NOTE(review): a low score is only logged — the pipeline still
            # proceeds to tuning. Confirm this is intended; the common pattern
            # aborts here instead of continuing.
            if best_model_score < 0.6:
                logging.info('Best model has R2 Score less than 60%')
            print(f'Best Model Found: Model Name: {best_model_name}, R2 Score: {best_model_score}')
            print('\n====================================================================================\n')
            logging.info(f'Best Model Found: Model Name: {best_model_name}, R2 Score: {best_model_score}')

            # Tune the two base learners that feed the final ensemble.
            best_cbr = self._tune_catboost(x_train, y_train)
            best_knn = self._tune_knn(x_train, y_train)

            # Build and train the weighted soft ensemble.
            logging.info('Voting Classifier training started')
            # Weights favour the tuned CatBoost, then a fresh XGB, then KNN.
            voting_classifier = VotingClassifier(
                estimators=[('catboost', best_cbr), ('xgb', XGBClassifier()), ('knn', best_knn)],
                weights=[3, 2, 1],
            )
            voting_classifier.fit(x_train, y_train)
            print('Final Model Evaluation:\n')
            print_evaluated_results(x_train, y_train, x_test, y_test, voting_classifier)
            logging.info('Voting Classifier training completed')

            # Persist the trained ensemble for downstream prediction.
            save_object(file_path=self.model_trainer_config.trained_model_file_path, obj=voting_classifier)
            logging.info('Model pickle file saved')

            # Final held-out evaluation of the persisted model.
            y_test_pred = voting_classifier.predict(x_test)
            mae, rmse, r2 = model_metrics(y_test, y_test_pred)
            logging.info(f'Test MAE: {mae}')
            logging.info(f'Test RMSE: {rmse}')
            logging.info(f'Test R2 Score: {r2}')
            logging.info('Final Model Training Completed')

            return mae, rmse, r2
        except Exception as e:
            # Error-level logging so failures stand out in the log stream
            # (was info-level, which buried the failure).
            logging.error('Exception occurred during model training')
            raise CustomException(e, sys)

    def _tune_catboost(self, x_train, y_train):
        """Randomized search over CatBoost depth/learning-rate/iterations;
        returns the best estimator found."""
        logging.info('Hyperparameter tuning started for CatBoost')
        cbr = CatBoostClassifier(verbose=False)
        param_dist = {
            'depth': [4, 5, 6, 7, 8, 9, 10],
            'learning_rate': [0.01, 0.02, 0.03, 0.04],
            'iterations': [300, 400, 500, 600]
        }
        # NOTE(review): 'r2' scoring on a classifier is unusual — confirm the
        # labels are meant to be scored as continuous values.
        rscv = RandomizedSearchCV(cbr, param_dist, scoring='r2', cv=5, n_jobs=-1)
        rscv.fit(x_train, y_train)
        print(f'Best CatBoost Parameters: {rscv.best_params_}')
        print(f'Best CatBoost Score: {rscv.best_score_}')
        print('\n====================================================================================\n')
        logging.info('Hyperparameter tuning complete for CatBoost')
        return rscv.best_estimator_

    def _tune_knn(self, x_train, y_train):
        """Exhaustive grid search over the KNN neighbour count (2..30);
        returns the best estimator found."""
        logging.info('Hyperparameter tuning started for KNN')
        knn = KNeighborsClassifier()
        param_grid = {'n_neighbors': list(range(2, 31))}
        grid = GridSearchCV(knn, param_grid, cv=5, scoring='r2', n_jobs=-1)
        grid.fit(x_train, y_train)
        print(f'Best KNN Parameters: {grid.best_params_}')
        print(f'Best KNN Score: {grid.best_score_}')
        print('\n====================================================================================\n')
        logging.info('Hyperparameter tuning complete for KNN')
        return grid.best_estimator_

    def get_best_model(self, model_report: dict):
        """Return ``(name, score)`` of the highest-scoring model in the report."""
        best_model_name = max(model_report, key=model_report.get)
        best_model_score = model_report[best_model_name]
        return best_model_name, best_model_score
|