File size: 5,370 Bytes
51f7cb3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import sys
from typing import Tuple

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

from src.exception import MyException
from src.logger import logging
from src.utils.main_utils import load_numpy_array_data, load_object, save_object
from src.entity.config_entity import ModelTrainerConfig
from src.entity.artifact_entity import DataTransformationArtifact, ModelTrainerArtifact,RegressionMetricArtifact
from src.entity.estimator import MyModel
import mlflow
import mlflow.sklearn
from src.constants import REPO_OWNER,REPO_NAME

import dagshub
# Executed at import time: initialise the DagsHub client for the repository
# named by REPO_OWNER / REPO_NAME with its MLflow integration enabled.
# NOTE(review): per the DagsHub client docs this presumably points MLflow's
# tracking URI at that repo and may need credentials/network access — confirm
# before importing this module in offline or test environments.
dagshub.init(repo_owner=REPO_OWNER, repo_name=REPO_NAME, mlflow=True)



class ModelTrainer:
    """Train a ``RandomForestRegressor`` and persist it with its preprocessor.

    Configuration is two-step: construct the object with no arguments, then
    await :meth:`init_config` before calling any other method.
    """

    def __init__(self):
        # Declare the attributes up front so every instance has them; they
        # remain ``None`` until ``init_config`` has been awaited.
        self.data_transformation_artifact = None
        self.model_trainer_config = None

    async def init_config(
        self,
        data_transformation_artifact: DataTransformationArtifact,
        model_trainer_config: ModelTrainerConfig
    ) -> None:
        """Store the artifacts/config the training methods read from.

        Args:
            data_transformation_artifact: Provides the transformed train/test
                array paths and the preprocessing object path.
            model_trainer_config: Provides the random-forest hyper-parameters
                and the output path for the trained model.
        """
        self.data_transformation_artifact = data_transformation_artifact
        self.model_trainer_config = model_trainer_config

    def _require_config(self, *names: str) -> None:
        """Raise ``RuntimeError`` if any of the named attributes is unset."""
        # ``getattr`` with a default also covers instances built without
        # running ``__init__``.
        missing = [name for name in names if getattr(self, name, None) is None]
        if missing:
            raise RuntimeError(
                "ModelTrainer is not configured (missing: "
                + ", ".join(missing)
                + "); await init_config() first."
            )

    async def get_model_object_and_report(
        self, train: np.ndarray, test: np.ndarray
    ) -> Tuple[RandomForestRegressor, RegressionMetricArtifact]:
        """Fit the regressor on ``train``, score it on ``test`` and log to MLflow.

        Both arrays are sliced as 2-D: every column but the last is used as
        features and the last column as the target.

        Args:
            train: Training array (features + target in the last column).
            test: Test array with the same column layout.

        Returns:
            The fitted model and a ``RegressionMetricArtifact`` holding the
            R2, MSE and MAE computed on the test split.

        Raises:
            MyException: Wraps any error raised while training, evaluating
                or logging.
        """
        try:
            logging.info("Training RandomForestRegressor with specified parameters")
            self._require_config("model_trainer_config")
            config = self.model_trainer_config

            x_train, y_train = train[:, :-1], train[:, -1]
            x_test, y_test = test[:, :-1], test[:, -1]

            logging.info("Train-test split done.")

            # Single source of truth: the same mapping builds the model and is
            # logged to MLflow, so the two can never drift apart.
            params = {
                "n_estimators": config.n_estimators,
                "min_samples_split": config.min_samples_split,
                "min_samples_leaf": config.min_samples_leaf,
                "max_depth": config.max_depth,
                "criterion": config.criterion,
                "random_state": config.random_state,
            }
            model = RandomForestRegressor(**params)

            logging.info("Model training started...")
            model.fit(x_train, y_train)
            logging.info("Model training completed.")

            # Predictions and evaluation metrics
            y_pred = model.predict(x_test)
            r2 = r2_score(y_test, y_pred)
            mse = mean_squared_error(y_test, y_pred)
            mae = mean_absolute_error(y_test, y_pred)
            metric_artifact = RegressionMetricArtifact(r2_score=r2, mse=mse, mae=mae)

            # MLflow logging: batch calls instead of one call per key.
            with mlflow.start_run():
                mlflow.log_metrics({"r2_score": r2, "mse": mse, "mae": mae})
                mlflow.log_params(params)
                mlflow.sklearn.log_model(model, artifact_path="model")

            return model, metric_artifact

        except Exception as e:
            raise MyException(e, sys) from e

    async def initiate_model_trainer(self) -> ModelTrainerArtifact:
        """Run the full training step and return its artifact.

        Loads the transformed train/test arrays, trains and evaluates the
        model, loads the preprocessing object, bundles both into a
        ``MyModel`` and saves it to ``trained_model_file_path``.

        Returns:
            A ``ModelTrainerArtifact`` with the saved model path and the
            evaluation metrics.

        Raises:
            MyException: On any failure. An already-wrapped ``MyException``
                is propagated unchanged rather than wrapped a second time.
        """
        logging.info("Entered initiate_model_trainer method of ModelTrainer class")

        try:
            logging.info("Started Model Training Component")
            self._require_config("data_transformation_artifact", "model_trainer_config")

            # Load transformed train-test data
            train_arr = await load_numpy_array_data(
                file_path=self.data_transformation_artifact.transformed_train_file_path
            )
            test_arr = await load_numpy_array_data(
                file_path=self.data_transformation_artifact.transformed_test_file_path
            )
            logging.info("Train-test data loaded successfully.")

            # Train model and get metrics
            trained_model, metric_artifact = await self.get_model_object_and_report(
                train=train_arr, test=test_arr
            )
            logging.info("Model object and metrics obtained.")

            # Load preprocessing object
            preprocessing_obj = await load_object(
                file_path=self.data_transformation_artifact.transformed_object_file_path
            )
            logging.info("Preprocessing object loaded.")

            # No comparison against a previous model happens here, so the log
            # message only states what is actually done.
            logging.info("Saving trained model together with the preprocessing object.")
            my_model = MyModel()
            await my_model.init_config(
                preprocessing_object=preprocessing_obj, trained_model_object=trained_model
            )
            await save_object(self.model_trainer_config.trained_model_file_path, my_model)
            logging.info("Saved final model object including preprocessing and trained model.")

            model_trainer_artifact = ModelTrainerArtifact(
                trained_model_file_path=self.model_trainer_config.trained_model_file_path,
                metric_artifact=metric_artifact
            )

            logging.info(f"Model trainer artifact: {model_trainer_artifact}")
            return model_trainer_artifact

        except MyException:
            # Already wrapped further down the call chain; do not wrap twice.
            raise
        except Exception as e:
            raise MyException(e, sys) from e