import gc
import sys
from pathlib import Path
from typing import Optional

import pandas as pd
from xgboost import XGBClassifier

from src.core.constants import PARAMS_FILE
from src.core.logger import logging
from src.core.exception import AppException
from src.core.configuration import AppConfiguration
from src.utils import read_yaml, save_obj


class ModelTrainer:
    """Train an XGBoost classifier and persist it to the configured models directory."""

    def __init__(self, config: Optional[AppConfiguration] = None):
        """
        Initializes the ModelTrainer object by creating a model training configuration.

        Args:
            config (AppConfiguration, optional): The application configuration object.
                When omitted, a fresh ``AppConfiguration()`` is built per instance.

        Raises:
            AppException: If the configuration cannot be created.
        """
        try:
            # Build the default lazily: a default of ``AppConfiguration()`` in the
            # signature would be evaluated once at import time, outside this
            # ``try`` block, and shared by every ModelTrainer instance.
            if config is None:
                config = AppConfiguration()
            self.model_training_config = config.model_training_config()
        except Exception as e:
            logging.error(f"Failed to create model training configuration: {e}", exc_info=True)
            raise AppException(e, sys) from e

    @staticmethod
    def _build_classifier(hyperparameters):
        """Create an unfitted XGBClassifier from the ``hyperparameters`` config node."""
        return XGBClassifier(
            n_estimators=hyperparameters.n_estimators,
            learning_rate=hyperparameters.learning_rate,
            max_depth=hyperparameters.max_depth,
            tree_method=hyperparameters.tree_method,
            max_bin=hyperparameters.max_bin,
            scale_pos_weight=hyperparameters.scale_pos_weight,
            gamma=hyperparameters.gamma,
            reg_lambda=hyperparameters.reg_lambda,
            subsample=hyperparameters.subsample,
            colsample_bytree=hyperparameters.colsample_bytree,
            n_jobs=-1,
            random_state=42,
            use_label_encoder=False,
            predictor="cpu_predictor",
        )

    def train(self, df):
        """
        Trains ML model on the given training data and saves the model.

        The target is read from the ``Label`` column; every other column is used
        as a feature. Three artifacts are written to the configured models
        directory: ``booster.json`` (the raw booster), ``model.joblib`` (written
        through ``save_obj``) and ``model_meta.txt`` (the parameters used).

        Args:
            df: The training dataframe. Must contain a ``Label`` column.

        Raises:
            AppException: If reading parameters, fitting or saving fails.
        """
        y_train = df['Label'].values
        X_train = df.drop(columns='Label')

        try:
            params = read_yaml(PARAMS_FILE).model_training
            model = self._build_classifier(params.hyperparameters)

            save_model_path = self.model_training_config.models_dir
            models_dir = Path(save_model_path)
            # Ensure the output directory exists before the fit, so a bad
            # location fails fast rather than after the whole training run.
            models_dir.mkdir(parents=True, exist_ok=True)

            logging.info("Model training started")
            model.fit(X_train, y_train)

            # save models: the raw booster as JSON plus the sklearn-style wrapper
            model.get_booster().save_model(models_dir / "booster.json")
            save_obj(location_path=save_model_path, obj=model, obj_name="model.joblib")
            logging.info(f"Model trained and saved at: {save_model_path}")

            # Explicit encoding keeps the metadata file independent of the locale.
            with open(models_dir / "model_meta.txt", 'w', encoding="utf-8") as f:
                f.write(f"{params.model_name} model has been trained successfully.\n\n {params}")

            # free memory
            del X_train, y_train, model
            gc.collect()

        except Exception as e:
            logging.error(f"Failed to train model: {e}", exc_info=True)
            raise AppException(e, sys) from e


def initiate_model_training():
    """
    Main function to initiate the model training workflow.

    It reads the training dataset (feather format), drops every row that
    contains a missing value, trains an ML model and saves the model. If the
    configured training data path is empty, an error is logged and the function
    returns without training.

    Raises:
        AppException: If an error occurs during model training.
    """
    obj = ModelTrainer()
    try:
        logging.info(f"{'='*20}Model Training{'='*20}")

        train_data_path = obj.model_training_config.training_data_path
        if not train_data_path:
            logging.error("Training dataset path not found")
            return

        df = pd.read_feather(train_data_path)
        df = df.dropna(how='any')

        obj.train(df)

        del df
        gc.collect()
        logging.info(f"{'='*20}Model Training Completed Successfully{'='*20} \n\n")

    except Exception as e:
        logging.error(f"Error during model training: {e}", exc_info=True)
        raise AppException(e, sys) from e


if __name__ == "__main__":
    initiate_model_training()