Inder-26 committed on
Commit
c1c191a
·
1 Parent(s): ef7b899

Integrate MLflow and DagsHub experiment tracking: every candidate model is logged as its own run so you can compare them side by side and pick the best one; test_f1 is used as the decision metric.

Browse files
confusion_matrix.png ADDED
networksecurity/components/model_trainer.py CHANGED
@@ -1,126 +1,238 @@
1
- import os,sys,mlflow
 
 
 
 
 
 
2
  from networksecurity.exception.exception import NetworkSecurityException
3
  from networksecurity.logging.logger import logging
4
 
5
  from networksecurity.entity.config_entity import ModelTrainerConfig
6
- from networksecurity.entity.artifact_entity import DataTransformationArtifact,ModelTrainerArtifact
7
-
8
- from networksecurity.utils.ml_utils.model.estimator import NetworkModel
9
- from networksecurity.utils.main_utils.utils import save_object,load_object,load_numpy_array_data,evaluate_models
10
- from networksecurity.utils.ml_utils.metric.classfication_metric import get_classification_score
 
 
 
 
 
 
 
 
 
 
11
 
12
  from sklearn.linear_model import LogisticRegression
13
- from sklearn.metrics import r2_score
14
  from sklearn.tree import DecisionTreeClassifier
15
- from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier
16
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  class ModelTrainer:
18
- def __init__(self,model_trainer_config:ModelTrainerConfig,
19
- data_transformation_artifact:DataTransformationArtifact):
 
 
 
20
  try:
21
  logging.info(f"{'>>'*20} Model Trainer {'<<'*20}")
22
  self.model_trainer_config = model_trainer_config
23
  self.data_transformation_artifact = data_transformation_artifact
24
  except Exception as e:
25
- raise NetworkSecurityException(e,sys)
26
-
27
- def track_model_mlflow(self, best_model, classification_metric):
28
- with mlflow.start_run():
29
- f1_score = classification_metric.f1_score
30
- precision_score = classification_metric.precision_score
31
- recall_score = classification_metric.recall_score
32
-
33
- mlflow.log_metric("F1_score", f1_score)
34
- mlflow.log_metric("Precision Score", precision_score)
35
- mlflow.log_metric("Recall Score", recall_score)
36
- mlflow.sklearn.log_model(best_model, "is the best model")
37
-
38
- def train_model(self,X_train,X_test,y_train,y_test):
39
- model = {
40
  "Logistic Regression": LogisticRegression(),
41
  "Decision Tree": DecisionTreeClassifier(),
42
  "Random Forest": RandomForestClassifier(),
43
  "Gradient Boosting": GradientBoostingClassifier(),
44
- "AdaBoost": AdaBoostClassifier()
45
  }
 
46
  params = {
47
  "Decision Tree": {
48
- 'criterion':['gini','entropy','log_loss'],
49
- #'splitter':['best','random'],
50
- #'max_features':['sqrt','log2']
51
  },
52
  "Random Forest": {
53
- #'criterion':['gini','entropy','log_loss'],
54
- #'max_features':['sqrt','log2'],
55
- 'n_estimators':[8,16,32,128,256]
56
  },
57
  "Gradient Boosting": {
58
- 'learning_rate':[.1,.01,.05,.001],
59
- 'subsample':[0.6,0.7,0.75,0.85,0.9],
60
- 'n_estimators':[8,16,32,64,128,256]
61
  },
62
  "AdaBoost": {
63
- 'learning_rate':[.1,.01,.001],
64
- 'n_estimators':[8,16,32,64,128,256]
65
  },
66
  "Logistic Regression": {},
67
  }
68
 
69
- model_report: dict = evaluate_models(X_train=X_train,y_train=y_train,
70
- X_test=X_test,y_test=y_test,models=model,params=params)
71
-
72
- ## To get the best model score from dict
73
- best_model_score = max(sorted(model_report.values()))
74
-
75
- ## To get the best model name from dict
76
- best_model_name = list(model_report.keys())[
77
- list(model_report.values()).index(best_model_score)]
78
- best_model = model[best_model_name]
79
- logging.info(f"Best model found , Model Name : {best_model_name} , R2 Score : {best_model_score}")
80
-
81
- y_train_pred = best_model.predict(X_train)
82
- y_test_pred = best_model.predict(X_test)
83
-
84
- classification_train_metric=get_classification_score(y_true=y_train, y_pred=y_train_pred)
85
- classification_test_metric=get_classification_score(y_true=y_test, y_pred=y_test_pred)
86
-
87
-
88
- ## Track with experiments with mlflow
89
- self.track_model_mlflow(best_model,classification_train_metric)
90
- self.track_model_mlflow(best_model,classification_test_metric)
91
-
92
-
93
-
94
- preprocessor = load_object(file_path=self.data_transformation_artifact.transformed_object_file_path)
95
- model_dir_path = os.path.dirname(self.model_trainer_config.trained_model_file_path)
96
- os.makedirs(model_dir_path, exist_ok=True)
97
-
98
- Network_model = NetworkModel(preprocessor=preprocessor, model=best_model)
99
- save_object(file_path=self.model_trainer_config.trained_model_file_path, obj=Network_model)
100
- logging.info(f"Trained model saved at : {self.model_trainer_config.trained_model_file_path}")
101
-
102
- model_trainer_artifact=ModelTrainerArtifact(trained_model_file_path=self.model_trainer_config.trained_model_file_path,
103
- train_metric_artifact=classification_train_metric,
104
- test_metric_artifact=classification_test_metric)
105
- logging.info(f"Model Trainer Artifact : {model_trainer_artifact}")
106
- return model_trainer_artifact
107
-
108
- def initiate_model_trainer(self)->ModelTrainerArtifact:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  try:
110
- logging.info("Loading transformed training array and transformed test array")
111
- train_file_path = self.data_transformation_artifact.transformed_train_file_path
112
- test_file_path = self.data_transformation_artifact.transformed_test_file_path
 
 
 
113
 
114
- ## Load numpy array
115
- train_array = load_numpy_array_data(file_path=train_file_path)
116
- test_array = load_numpy_array_data(file_path=test_file_path)
117
- logging.info("Splitting training and test input and target feature")
118
- X_train,y_train = train_array[:,:-1],train_array[:,-1]
119
- X_test,y_test = test_array[:,:-1],test_array[:,-1]
120
 
121
- model_trainer_artifact = self.train_model(X_train=X_train, X_test=X_test,
122
- y_train=y_train, y_test=y_test)
123
- return model_trainer_artifact
124
 
125
  except Exception as e:
126
- raise NetworkSecurityException(e,sys)
 
1
+ import os
2
+ import sys
3
+ import mlflow
4
+ import dagshub
5
+ import matplotlib.pyplot as plt
6
+ import seaborn as sns
7
+
8
  from networksecurity.exception.exception import NetworkSecurityException
9
  from networksecurity.logging.logger import logging
10
 
11
  from networksecurity.entity.config_entity import ModelTrainerConfig
12
+ from networksecurity.entity.artifact_entity import (
13
+ DataTransformationArtifact,
14
+ ModelTrainerArtifact,
15
+ )
16
+
17
+ from networksecurity.utils.main_utils.utils import (
18
+ save_object,
19
+ load_object,
20
+ load_numpy_array_data,
21
+ evaluate_models,
22
+ )
23
+
24
+ from networksecurity.utils.ml_utils.metric.classfication_metric import (
25
+ get_classification_score,
26
+ )
27
 
28
  from sklearn.linear_model import LogisticRegression
 
29
  from sklearn.tree import DecisionTreeClassifier
30
+ from sklearn.ensemble import (
31
+ RandomForestClassifier,
32
+ GradientBoostingClassifier,
33
+ AdaBoostClassifier,
34
+ )
35
+ from sklearn.metrics import (
36
+ confusion_matrix,
37
+ roc_curve,
38
+ precision_recall_curve,
39
+ )
40
+
41
# ---------------- Dagshub + MLflow ----------------
# NOTE: executes at import time — requires network access and DagsHub
# credentials; all subsequent mlflow calls log to the DagsHub-hosted server.
dagshub.init(repo_owner="Inder-26", repo_name="NetworkSecurity", mlflow=True)
48
+
49
# ---------------- Helper: log visual artifacts ----------------
def log_classification_artifacts(y_true, y_pred, y_proba):
    """Save and log the standard classification plots for the active
    MLflow run: confusion matrix, ROC curve and precision-recall curve.

    Args:
        y_true: ground-truth binary labels.
        y_pred: hard predictions aligned with y_true.
        y_proba: positive-class probabilities aligned with y_true.

    Side effects:
        Writes three PNG files into the current working directory and
        logs each as an artifact of the active MLflow run.
    """
    # Confusion Matrix
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(4, 4))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.title("Confusion Matrix")
    _finish_figure("confusion_matrix.png")

    # ROC Curve
    fpr, tpr, _ = roc_curve(y_true, y_proba)
    plt.figure()
    plt.plot(fpr, tpr)
    plt.plot([0, 1], [0, 1], "--")  # chance diagonal for reference
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve")
    _finish_figure("roc_curve.png")

    # Precision-Recall Curve
    precision, recall, _ = precision_recall_curve(y_true, y_proba)
    plt.figure()
    plt.plot(recall, precision)
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("Precision-Recall Curve")
    _finish_figure("precision_recall_curve.png")


def _finish_figure(filename):
    """Tighten layout, save the current figure to `filename` (cwd),
    log it as an MLflow artifact, and close the figure to free memory."""
    plt.tight_layout()
    plt.savefig(filename)
    mlflow.log_artifact(filename)
    plt.close()
87
+
88
+
89
+ # ---------------- Model Trainer ----------------
90
# ---------------- Model Trainer ----------------
class ModelTrainer:
    """Train candidate classifiers, track each one as a separate MLflow run
    (via DagsHub), pick the best model by test F1, and persist it alongside
    its preprocessor for deployment."""

    def __init__(
        self,
        model_trainer_config: ModelTrainerConfig,
        data_transformation_artifact: DataTransformationArtifact,
    ):
        try:
            logging.info(f"{'>>'*20} Model Trainer {'<<'*20}")
            self.model_trainer_config = model_trainer_config
            self.data_transformation_artifact = data_transformation_artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def train_model(self, X_train, X_test, y_train, y_test):
        """Fit every candidate model, log its metrics and evaluation plots to
        MLflow, select the winner by test_f1, and save model + preprocessor.

        Returns:
            ModelTrainerArtifact holding the saved model path and the BEST
            model's train/test classification metrics.
        """
        models = {
            "Logistic Regression": LogisticRegression(),
            "Decision Tree": DecisionTreeClassifier(),
            "Random Forest": RandomForestClassifier(),
            "Gradient Boosting": GradientBoostingClassifier(),
            "AdaBoost": AdaBoostClassifier(),
        }

        params = {
            "Decision Tree": {
                "criterion": ["gini", "entropy", "log_loss"]
            },
            "Random Forest": {
                "n_estimators": [8, 16, 32, 128, 256]
            },
            "Gradient Boosting": {
                "learning_rate": [0.1, 0.01, 0.05, 0.001],
                "subsample": [0.6, 0.7, 0.75, 0.85, 0.9],
                "n_estimators": [8, 16, 32, 64, 128, 256],
            },
            "AdaBoost": {
                "learning_rate": [0.1, 0.01, 0.001],
                "n_estimators": [8, 16, 32, 64, 128, 256],
            },
            "Logistic Regression": {},
        }

        # ---------- Hyperparameter search ----------
        # NOTE(review): assumes evaluate_models tunes each estimator in
        # `models` in place (sets best params); if it only returns scores,
        # the refit below runs with default hyperparameters — confirm in
        # networksecurity.utils.main_utils.utils.
        model_report = evaluate_models(
            X_train=X_train,
            y_train=y_train,
            X_test=X_test,
            y_test=y_test,
            models=models,
            params=params,
        )

        # ---------- MLflow logging ----------
        model_scores = {}  # model_name -> test F1 (decision metric)
        run_id_map = {}    # model_name -> MLflow run id
        metric_map = {}    # model_name -> (train_metric, test_metric)

        for model_name, model in models.items():
            with mlflow.start_run(run_name=model_name) as run:
                model.fit(X_train, y_train)

                y_train_pred = model.predict(X_train)
                y_test_pred = model.predict(X_test)
                # positive-class probability, needed for ROC / PR curves
                y_test_proba = model.predict_proba(X_test)[:, 1]

                train_metric = get_classification_score(y_train, y_train_pred)
                test_metric = get_classification_score(y_test, y_test_pred)

                # Params & tags
                mlflow.log_params(model.get_params())
                mlflow.set_tag("model_name", model_name)
                mlflow.set_tag("stage", "experiment")

                # Metrics (decision metric = test_f1)
                mlflow.log_metric("train_f1", train_metric.f1_score)
                mlflow.log_metric("test_f1", test_metric.f1_score)
                mlflow.log_metric("train_precision", train_metric.precision_score)
                mlflow.log_metric("test_precision", test_metric.precision_score)
                mlflow.log_metric("train_recall", train_metric.recall_score)
                mlflow.log_metric("test_recall", test_metric.recall_score)

                # Visual evaluation (artifacts)
                log_classification_artifacts(
                    y_true=y_test,
                    y_pred=y_test_pred,
                    y_proba=y_test_proba,
                )

                model_scores[model_name] = test_metric.f1_score
                run_id_map[model_name] = run.info.run_id
                metric_map[model_name] = (train_metric, test_metric)

        # ---------- Best model selection ----------
        best_model_name = max(model_scores, key=model_scores.get)
        best_model = models[best_model_name]
        # BUG FIX: the artifact previously carried the metrics of whichever
        # model happened to be LAST in the loop, not the best model's.
        best_train_metric, best_test_metric = metric_map[best_model_name]

        logging.info(
            f"Best Model: {best_model_name} | "
            f"Test F1: {model_scores[best_model_name]}"
        )

        # ---------- Tag best model ----------
        # Reopen the winning run so the tag lands on it, not on a new run.
        with mlflow.start_run(run_id=run_id_map[best_model_name]):
            mlflow.set_tag("best_model", "true")

        # ---------- Save final model for deployment ----------
        preprocessor = load_object(
            self.data_transformation_artifact.transformed_object_file_path
        )

        # NOTE(review): model_trainer_config.trained_model_file_path is
        # ignored in favour of a hard-coded final_models/ dir — confirm
        # downstream consumers expect this location.
        final_model_dir = os.path.join(os.getcwd(), "final_models")
        os.makedirs(final_model_dir, exist_ok=True)

        save_object(
            os.path.join(final_model_dir, "model.pkl"),
            best_model,
        )
        save_object(
            os.path.join(final_model_dir, "preprocessor.pkl"),
            preprocessor,
        )

        # message fixed to match the actual directory name (final_models)
        logging.info("Final model and preprocessor saved in final_models/")

        return ModelTrainerArtifact(
            trained_model_file_path=os.path.join(
                final_model_dir, "model.pkl"
            ),
            train_metric_artifact=best_train_metric,
            test_metric_artifact=best_test_metric,
        )

    def initiate_model_trainer(self) -> ModelTrainerArtifact:
        """Load the transformed train/test arrays (last column = target),
        split features from labels, and delegate to train_model.

        Raises:
            NetworkSecurityException: wrapping any failure in the pipeline.
        """
        try:
            train_array = load_numpy_array_data(
                self.data_transformation_artifact.transformed_train_file_path
            )
            test_array = load_numpy_array_data(
                self.data_transformation_artifact.transformed_test_file_path
            )

            X_train, y_train = train_array[:, :-1], train_array[:, -1]
            X_test, y_test = test_array[:, :-1], test_array[:, -1]

            return self.train_model(X_train, X_test, y_train, y_test)

        except Exception as e:
            raise NetworkSecurityException(e, sys)
precision_recall_curve.png ADDED
requirements.txt CHANGED
@@ -8,4 +8,6 @@ pymongo[srv]==3.11
8
  scikit-learn
9
  pyaml
10
  mlflow
 
 
11
  #-e .
 
8
  scikit-learn
9
  pyaml
10
  mlflow
11
+ dagshub
12
+ seaborn
13
  #-e .
roc_curve.png ADDED