Inder-26 committed on
Commit
8268752
·
1 Parent(s): da68771

Cloud push and S3 data storage implemented

Browse files
app.py CHANGED
@@ -4,12 +4,12 @@ from networksecurity.utils.ml_utils.model.estimator import NetworkModel
4
 
5
  ca = certifi.where()
6
 
7
- from dotenv import load_dotenv
8
  load_dotenv()
9
  mong_db_url = os.getenv("MONGODB_URL_KEY")
10
  print(mong_db_url)
11
 
12
- import pymongo
13
  from networksecurity.exception.exception import NetworkSecurityException
14
  from networksecurity.logging.logger import logging
15
  from networksecurity.pipeline.training_pipeline import TraningPipeline
 
4
 
5
  ca = certifi.where()
6
 
7
+ from dotenv import load_dotenv
8
  load_dotenv()
9
  mong_db_url = os.getenv("MONGODB_URL_KEY")
10
  print(mong_db_url)
11
 
12
+ import pymongo
13
  from networksecurity.exception.exception import NetworkSecurityException
14
  from networksecurity.logging.logger import logging
15
  from networksecurity.pipeline.training_pipeline import TraningPipeline
confusion_matrix.png CHANGED
networksecurity/cloud/s3_syncer.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
import subprocess


class S3Sync:
    """Thin wrapper around the AWS CLI ``aws s3 sync`` command.

    Requires the ``aws`` executable on PATH and credentials configured in
    the environment (env vars / ~/.aws) — presumably provided by the
    deployment environment; TODO confirm.
    """

    def sync_folder_to_s3(self, folder, aws_bucket_url):
        """Upload the contents of *folder* to *aws_bucket_url*.

        Returns the AWS CLI exit code (0 on success, 127 if the ``aws``
        executable is not installed).
        """
        # Pass arguments as a list with shell=False so paths/URLs containing
        # spaces or shell metacharacters cannot be injected — unlike the
        # previous os.system(f"aws s3 sync {folder} {aws_bucket_url}").
        return self._run_sync(folder, aws_bucket_url)

    def sync_folder_from_s3(self, folder, aws_bucket_url):
        """Download the contents of *aws_bucket_url* into *folder*.

        Returns the AWS CLI exit code (0 on success, 127 if the ``aws``
        executable is not installed).
        """
        return self._run_sync(aws_bucket_url, folder)

    @staticmethod
    def _run_sync(src, dst):
        # Best-effort, mirroring the old os.system behaviour: never raise,
        # just report the exit status to the caller.
        try:
            result = subprocess.run(["aws", "s3", "sync", src, dst], check=False)
            return result.returncode
        except FileNotFoundError:
            # aws CLI not installed; 127 is the shell's "command not found".
            return 127
networksecurity/components/model_trainer.py CHANGED
@@ -2,6 +2,9 @@ import os
2
  import sys
3
  import mlflow
4
  import dagshub
 
 
 
5
  import matplotlib.pyplot as plt
6
  import seaborn as sns
7
 
@@ -110,38 +113,12 @@ class ModelTrainer:
110
  "AdaBoost": AdaBoostClassifier(),
111
  }
112
 
113
- params = {
114
- "Decision Tree": {
115
- "criterion": ["gini", "entropy", "log_loss"]
116
- },
117
- "Random Forest": {
118
- "n_estimators": [8, 16, 32, 128, 256]
119
- },
120
- "Gradient Boosting": {
121
- "learning_rate": [0.1, 0.01, 0.05, 0.001],
122
- "subsample": [0.6, 0.7, 0.75, 0.85, 0.9],
123
- "n_estimators": [8, 16, 32, 64, 128, 256],
124
- },
125
- "AdaBoost": {
126
- "learning_rate": [0.1, 0.01, 0.001],
127
- "n_estimators": [8, 16, 32, 64, 128, 256],
128
- },
129
- "Logistic Regression": {},
130
- }
131
-
132
- # ---------- Hyperparameter search ----------
133
- model_report = evaluate_models(
134
- X_train=X_train,
135
- y_train=y_train,
136
- X_test=X_test,
137
- y_test=y_test,
138
- models=models,
139
- params=params,
140
- )
141
-
142
  # ---------- MLflow logging ----------
143
- model_scores = {}
144
- run_id_map = {}
 
 
 
145
 
146
  for model_name, model in models.items():
147
 
@@ -176,22 +153,21 @@ class ModelTrainer:
176
  y_proba=y_test_proba,
177
  )
178
 
179
- model_scores[model_name] = test_metric.f1_score
180
- run_id_map[model_name] = run.info.run_id
 
 
 
181
 
182
- # ---------- Best model selection ----------
183
- best_model_name = max(model_scores, key=model_scores.get)
184
- best_model = model_report[best_model_name]["model"]
185
 
186
  logging.info(
187
  f"Best Model: {best_model_name} | "
188
- f"Test F1: {model_scores[best_model_name]}"
189
  )
190
 
191
- # ---------- Tag best model ----------
192
- mlflow.start_run(run_id=run_id_map[best_model_name])
193
- mlflow.set_tag("best_model", "true")
194
- mlflow.end_run()
195
 
196
  # ---------- Save final model for deployment ----------
197
  preprocessor = load_object(
@@ -210,16 +186,20 @@ class ModelTrainer:
210
  preprocessor,
211
  )
212
 
213
- logging.info("Final model and preprocessor saved in final_model/")
 
 
 
 
 
214
 
215
  return ModelTrainerArtifact(
216
- trained_model_file_path=os.path.join(
217
- final_model_dir, "model.pkl"
218
- ),
219
- train_metric_artifact=train_metric,
220
- test_metric_artifact=test_metric,
221
  )
222
 
 
223
  def initiate_model_trainer(self) -> ModelTrainerArtifact:
224
  try:
225
  train_array = load_numpy_array_data(
 
2
  import sys
3
  import mlflow
4
  import dagshub
5
+ import matplotlib
6
+ matplotlib.use("Agg")
7
+
8
  import matplotlib.pyplot as plt
9
  import seaborn as sns
10
 
 
113
  "AdaBoost": AdaBoostClassifier(),
114
  }
115
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  # ---------- MLflow logging ----------
117
+ best_f1 = -1
118
+ best_model = None
119
+ best_model_name = None
120
+ best_run_id = None
121
+
122
 
123
  for model_name, model in models.items():
124
 
 
153
  y_proba=y_test_proba,
154
  )
155
 
156
+ if test_metric.f1_score > best_f1:
157
+ best_f1 = test_metric.f1_score
158
+ best_model = model
159
+ best_model_name = model_name
160
+ best_run_id = run.info.run_id
161
 
 
 
 
162
 
163
  logging.info(
164
  f"Best Model: {best_model_name} | "
165
+ f"Test F1: {best_f1}"
166
  )
167
 
168
+
169
+ with mlflow.start_run(run_id=best_run_id):
170
+ mlflow.set_tag("best_model", "true")
 
171
 
172
  # ---------- Save final model for deployment ----------
173
  preprocessor = load_object(
 
186
  preprocessor,
187
  )
188
 
189
+ logging.info(f"Final model and preprocessor saved in final_models")
190
+ y_train_pred = best_model.predict(X_train)
191
+ y_test_pred = best_model.predict(X_test)
192
+
193
+ best_train_metric = get_classification_score(y_train, y_train_pred)
194
+ best_test_metric = get_classification_score(y_test, y_test_pred)
195
 
196
  return ModelTrainerArtifact(
197
+ trained_model_file_path=os.path.join(final_model_dir, "model.pkl"),
198
+ train_metric_artifact=best_train_metric,
199
+ test_metric_artifact=best_test_metric,
 
 
200
  )
201
 
202
+
203
  def initiate_model_trainer(self) -> ModelTrainerArtifact:
204
  try:
205
  train_array = load_numpy_array_data(
networksecurity/constant/training_pipeline/__init__.py CHANGED
@@ -64,3 +64,5 @@ MODEL_TRAINER_TRAINED_MODEL_DIR: str = "trained_model"
64
  MODEL_TRAINER_TRAINED_MODEL_NAME: str = "model.pkl"
65
  MODEL_TRAINER_EXPECTED_SCORE: float = 0.6
66
  MODEL_TRAINER_OVER_FITTING_UNDER_FITTING_THRESHOLD: float = 0.05
 
 
 
64
  MODEL_TRAINER_TRAINED_MODEL_NAME: str = "model.pkl"
65
  MODEL_TRAINER_EXPECTED_SCORE: float = 0.6
66
  MODEL_TRAINER_OVER_FITTING_UNDER_FITTING_THRESHOLD: float = 0.05
67
+
68
+ TRAINING_BUCKET_NAME = "awsnetworksecuritybucket"
networksecurity/entity/config_entity.py CHANGED
@@ -12,6 +12,7 @@ class TrainingPipelineConfig:
12
  self.pipeline_name = training_pipeline.PIPELINE_NAME
13
  self.artifact_name = training_pipeline.ARTIFACT_DIR
14
  self.artifact_dir = os.path.join(self.artifact_name,timestamp)
 
15
  self.timestamp: str = timestamp
16
 
17
  class DataIngestionConfig:
 
12
  self.pipeline_name = training_pipeline.PIPELINE_NAME
13
  self.artifact_name = training_pipeline.ARTIFACT_DIR
14
  self.artifact_dir = os.path.join(self.artifact_name,timestamp)
15
+ self.model_dir = os.path.join("final_model")
16
  self.timestamp: str = timestamp
17
 
18
  class DataIngestionConfig:
networksecurity/pipeline/training_pipeline.py CHANGED
@@ -6,6 +6,8 @@ from networksecurity.components.data_ingestion import DataIngestion
6
  from networksecurity.components.data_validation import DataValidation
7
  from networksecurity.components.data_transformation import DataTransformation
8
  from networksecurity.components.model_trainer import ModelTrainer
 
 
9
 
10
  from networksecurity.entity.config_entity import (
11
  TrainingPipelineConfig,
@@ -26,6 +28,7 @@ class TraningPipeline:
26
  def __init__(self):
27
  try:
28
  self.training_pipeline_config = TrainingPipelineConfig()
 
29
  except Exception as e:
30
  raise NetworkSecurityException(e, sys)
31
 
@@ -92,6 +95,38 @@ class TraningPipeline:
92
  except Exception as e:
93
  raise NetworkSecurityException(e, sys)
94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  def run_pipeline(self):
96
  try:
97
  data_ingestion_artifact = self.start_data_ingestion()
@@ -99,6 +134,10 @@ class TraningPipeline:
99
  data_transformation_artifact = self.start_data_transformation(data_validation_artifact=data_validation_artifact)
100
  model_trainer_artifact = self.start_model_trainer(data_transformation_artifact=data_transformation_artifact)
101
  logging.info("Training pipeline completed successfully")
 
 
 
 
102
  return model_trainer_artifact
103
  except Exception as e:
104
  raise NetworkSecurityException(e, sys)
 
6
  from networksecurity.components.data_validation import DataValidation
7
  from networksecurity.components.data_transformation import DataTransformation
8
  from networksecurity.components.model_trainer import ModelTrainer
9
+ from networksecurity.constant.training_pipeline import TRAINING_BUCKET_NAME
10
+ from networksecurity.cloud.s3_syncer import S3Sync
11
 
12
  from networksecurity.entity.config_entity import (
13
  TrainingPipelineConfig,
 
28
  def __init__(self):
29
  try:
30
  self.training_pipeline_config = TrainingPipelineConfig()
31
+ self.s3_sync = S3Sync()
32
  except Exception as e:
33
  raise NetworkSecurityException(e, sys)
34
 
 
95
  except Exception as e:
96
  raise NetworkSecurityException(e, sys)
97
 
98
+ ## Local artifact is pushed to S3
99
+ def sync_artifact_dir_to_s3(self):
100
+ try:
101
+ aws_bucket_url = (
102
+ f"s3://{TRAINING_BUCKET_NAME}/artifact/"
103
+ f"{self.training_pipeline_config.timestamp}"
104
+ )
105
+
106
+ self.s3_sync.sync_folder_to_s3(
107
+ folder=self.training_pipeline_config.artifact_dir,
108
+ aws_bucket_url=aws_bucket_url
109
+ )
110
+ except Exception as e:
111
+ raise NetworkSecurityException(e, sys)
112
+
113
+
114
+ ## Local final_models is pushed to S3
115
+ def sync_saved_model_dir_to_s3(self):
116
+ try:
117
+ aws_bucket_url = (
118
+ f"s3://{TRAINING_BUCKET_NAME}/final_model/"
119
+ f"{self.training_pipeline_config.timestamp}"
120
+ )
121
+
122
+ self.s3_sync.sync_folder_to_s3(
123
+ folder=self.training_pipeline_config.model_dir,
124
+ aws_bucket_url=aws_bucket_url
125
+ )
126
+ except Exception as e:
127
+ raise NetworkSecurityException(e, sys)
128
+
129
+
130
  def run_pipeline(self):
131
  try:
132
  data_ingestion_artifact = self.start_data_ingestion()
 
134
  data_transformation_artifact = self.start_data_transformation(data_validation_artifact=data_validation_artifact)
135
  model_trainer_artifact = self.start_model_trainer(data_transformation_artifact=data_transformation_artifact)
136
  logging.info("Training pipeline completed successfully")
137
+
138
+ self.sync_artifact_dir_to_s3()
139
+ self.sync_saved_model_dir_to_s3()
140
+
141
  return model_trainer_artifact
142
  except Exception as e:
143
  raise NetworkSecurityException(e, sys)
precision_recall_curve.png CHANGED
requirements.txt CHANGED
@@ -1,16 +1,118 @@
1
- python-dotenv
2
- numpy
3
- pandas
4
- setuptools
5
- pymongo
6
- certifi
7
- pymongo[srv]==3.11
8
- scikit-learn
9
- pyaml
10
- mlflow
11
- dagshub
12
- seaborn
13
- fastapi
14
- uvicorn
15
- python-multipart
16
- #-e .
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ alembic==1.17.2
2
+ annotated-doc==0.0.4
3
+ annotated-types==0.7.0
4
+ anyio==4.12.0
5
+ appdirs==1.4.4
6
+ backoff==2.2.1
7
+ blinker==1.9.0
8
+ boto3==1.42.18
9
+ botocore==1.42.18
10
+ cachetools==6.2.4
11
+ certifi==2025.11.12
12
+ cffi==2.0.0
13
+ charset-normalizer==3.4.4
14
+ click==8.3.1
15
+ cloudpickle==3.1.2
16
+ colorama==0.4.6
17
+ contourpy==1.3.3
18
+ cryptography==46.0.3
19
+ cycler==0.12.1
20
+ dacite==1.6.0
21
+ dagshub==0.6.4
22
+ dagshub-annotation-converter==0.1.15
23
+ databricks-sdk==0.76.0
24
+ dataclasses-json==0.6.7
25
+ dill==0.4.0
26
+ dnspython==1.16.0
27
+ docker==7.1.0
28
+ fastapi==0.128.0
29
+ flask==3.1.2
30
+ flask-cors==6.0.2
31
+ fonttools==4.61.1
32
+ gitdb==4.0.12
33
+ gitpython==3.1.45
34
+ google-auth==2.45.0
35
+ gql==4.0.0
36
+ graphene==3.4.3
37
+ graphql-core==3.2.7
38
+ graphql-relay==3.2.0
39
+ greenlet==3.3.0
40
+ h11==0.16.0
41
+ httpcore==1.0.9
42
+ httpx==0.28.1
43
+ huey==2.5.5
44
+ idna==3.11
45
+ importlib-metadata==8.7.1
46
+ itsdangerous==2.2.0
47
+ jinja2==3.1.6
48
+ jmespath==1.0.1
49
+ joblib==1.5.3
50
+ kiwisolver==1.4.9
51
+ lxml==6.0.2
52
+ mako==1.3.10
53
+ markdown-it-py==4.0.0
54
+ markupsafe==3.0.3
55
+ marshmallow==3.26.2
56
+ matplotlib==3.10.8
57
+ mdurl==0.1.2
58
+ mlflow==3.8.1
59
+ mlflow-skinny==3.8.1
60
+ mlflow-tracing==3.8.1
61
+ multidict==6.7.0
62
+ mypy-extensions==1.1.0
63
+ # -e file:///D:/Coding%20Central/NetworkSecurity
64
+ numpy==2.4.0
65
+ opentelemetry-api==1.39.1
66
+ opentelemetry-proto==1.39.1
67
+ opentelemetry-sdk==1.39.1
68
+ opentelemetry-semantic-conventions==0.60b1
69
+ packaging==25.0
70
+ pandas==2.3.3
71
+ pathvalidate==3.3.1
72
+ pillow==12.0.0
73
+ propcache==0.4.1
74
+ protobuf==6.33.2
75
+ pyaml==25.7.0
76
+ pyarrow==22.0.0
77
+ pyasn1==0.6.1
78
+ pyasn1-modules==0.4.2
79
+ pycparser==2.23
80
+ pydantic==2.12.5
81
+ pydantic-core==2.41.5
82
+ pygments==2.19.2
83
+ pymongo==3.11.0
84
+ pyparsing==3.3.1
85
+ python-dateutil==2.9.0.post0
86
+ python-dotenv==1.2.1
87
+ python-multipart==0.0.21
88
+ pytz==2025.2
89
+ pywin32==311
90
+ pyyaml==6.0.3
91
+ requests==2.32.5
92
+ requests-toolbelt==1.0.0
93
+ rich==14.2.0
94
+ rsa==4.9.1
95
+ s3transfer==0.16.0
96
+ scikit-learn==1.8.0
97
+ scipy==1.16.3
98
+ seaborn==0.13.2
99
+ semver==3.0.4
100
+ setuptools==80.9.0
101
+ six==1.17.0
102
+ smmap==5.0.2
103
+ sqlalchemy==2.0.45
104
+ sqlparse==0.5.5
105
+ starlette==0.50.0
106
+ tenacity==9.1.2
107
+ threadpoolctl==3.6.0
108
+ treelib==1.8.0
109
+ typing-extensions==4.15.0
110
+ typing-inspect==0.9.0
111
+ typing-inspection==0.4.2
112
+ tzdata==2025.3
113
+ urllib3==2.6.2
114
+ uvicorn==0.40.0
115
+ waitress==3.0.2
116
+ werkzeug==3.1.4
117
+ yarl==1.22.0
118
+ zipp==3.23.0
roc_curve.png CHANGED