# Spaces: Sleeping
# File size: 4,221 Bytes
# Revision: 4c01182
import sys
import pandas as pd
from pathlib import Path
import pandas as pd
from xgboost import XGBClassifier
from src.core.constants import PARAMS_FILE
from src.core.logger import logging
from src.core.exception import AppException
from src.core.configuration import AppConfiguration
from src.utils import read_yaml, save_obj
import gc
class ModelTrainer:
    """Train an XGBoost classifier and persist the resulting artifacts.

    The trainer keeps the model-training configuration returned by
    ``config.model_training_config()``; ``train`` reads ``models_dir`` from it
    to decide where the artifacts are written.
    """

    def __init__(self, config=None):
        """
        Initializes the ModelTrainer object by creating a model training configuration.

        Args:
            config (AppConfiguration | None): The configuration object containing
                the application configuration. When omitted, a new
                ``AppConfiguration`` is created at call time.

        Raises:
            AppException: If the model training configuration cannot be created.
        """
        try:
            # Build the default lazily. A default of ``AppConfiguration()`` in
            # the signature is evaluated once, when the class body is executed,
            # so it would run at import time and be shared by every instance.
            if config is None:
                config = AppConfiguration()
            self.model_training_config = config.model_training_config()
        except Exception as e:
            logging.error(f"Failed to create model training configuration: {e}", exc_info=True)
            raise AppException(e, sys) from e

    def train(self, df):
        """
        Trains ML model on the given training data and saves the model.

        The hyperparameters are read from ``PARAMS_FILE`` (section
        ``model_training.hyperparameters``). Three artifacts are written to
        ``self.model_training_config.models_dir``: the raw booster as
        ``booster.json``, the fitted estimator via ``save_obj`` under the name
        ``model.joblib``, and a short ``model_meta.txt`` summary.

        Args:
            df: The training dataframe. It must contain a ``Label`` column,
                which is used as the target; all other columns are features.

        Raises:
            AppException: If reading the parameters, fitting, or saving fails.
        """
        # Split target and features before entering the guarded section, as the
        # original did: a missing 'Label' column surfaces as a plain KeyError.
        y_train = df['Label'].values
        X_train = df.drop(columns='Label')

        try:
            config_params = read_yaml(PARAMS_FILE)
            params = config_params.model_training
            hyperparameters = params.hyperparameters

            model = XGBClassifier(
                n_estimators=hyperparameters.n_estimators,
                learning_rate=hyperparameters.learning_rate,
                max_depth=hyperparameters.max_depth,
                tree_method=hyperparameters.tree_method,
                max_bin=hyperparameters.max_bin,
                scale_pos_weight=hyperparameters.scale_pos_weight,
                gamma=hyperparameters.gamma,
                reg_lambda=hyperparameters.reg_lambda,
                subsample=hyperparameters.subsample,
                colsample_bytree=hyperparameters.colsample_bytree,
                n_jobs=-1,
                random_state=42,
                use_label_encoder=False,
                predictor="cpu_predictor",
            )

            logging.info("Model training started")
            model.fit(X_train, y_train)

            # get XGB booster
            booster = model.get_booster()
            save_model_path = self.model_training_config.models_dir

            # save models
            booster.save_model(Path(save_model_path, "booster.json"))
            save_obj(location_path=save_model_path, obj=model, obj_name="model.joblib")
            logging.info(f"Model trained and saved at: {save_model_path}")

            # Explicit encoding keeps the metadata file identical across
            # platforms instead of depending on the locale default.
            with open(Path(save_model_path, "model_meta.txt"), 'w', encoding="utf-8") as f:
                f.write(f"{params.model_name} model has been trained successfully.\n\n {params}")

            # free memory
            del X_train, y_train, model
            gc.collect()
        except Exception as e:
            logging.error(f"Failed to train model: {e}", exc_info=True)
            raise AppException(e, sys) from e
def initiate_model_training():
    """
    Main function to initiate the model training workflow. It reads the training dataset,
    trains an ML model and saves the model.

    The dataset is loaded with ``pd.read_feather`` from the
    ``training_data_path`` of the trainer's configuration, rows containing any
    missing value are dropped, and the result is handed to
    ``ModelTrainer.train``. If the configured path is empty, an error is logged
    and the function returns without training.

    Raises:
        AppException: If an error occurs during model training.
    """
    obj = ModelTrainer()
    try:
        logging.info(f"{'='*20}Model Training{'='*20}")

        train_data_path = obj.model_training_config.training_data_path
        if not train_data_path:
            logging.error("Training dataset path not found")
            return

        df = pd.read_feather(train_data_path)
        df.dropna(how='any', inplace=True)
        obj.train(df)

        # Release the dataframe before returning to keep peak memory down.
        del df
        gc.collect()

        logging.info(f"{'='*20}Model Training Completed Successfully{'='*20} \n\n")
    except AppException:
        # train() has already logged the failure and wrapped it in an
        # AppException; pass it through instead of logging and wrapping twice.
        raise
    except Exception as e:
        logging.error(f"Error during model training: {e}", exc_info=True)
        raise AppException(e, sys) from e
# Run the training workflow only when this file is executed as a script.
if __name__ == "__main__":
    initiate_model_training()