Inder-26 committed on
Commit
eae2854
·
1 Parent(s): f228efb

Data Transformation done

Browse files
main.py CHANGED
@@ -1,8 +1,10 @@
1
  from networksecurity.components.data_ingestion import DataIngestion
2
  from networksecurity.components.data_validation import DataValidation
 
 
3
  from networksecurity.exception.exception import NetworkSecurityException
4
  from networksecurity.logging.logger import logging
5
- from networksecurity.entity.config_entity import DataIngestionConfig,DataValidationConfig
6
  from networksecurity.entity.config_entity import TraningPipelineConfig
7
  import sys
8
 
@@ -10,16 +12,23 @@ if __name__ == "__main__":
10
  try:
11
  traningpipelineconfig=TraningPipelineConfig()
12
  dataingestionconfig=DataIngestionConfig(traningpipelineconfig)
13
- dataingestion=DataIngestion(dataingestionconfig)
14
  logging.info("Initiate the data ingestion")
15
- dataingestionartifact=dataingestion.initiate_data_ingestion()
 
16
  print(dataingestionartifact)
17
  data_validation_config=DataValidationConfig(traningpipelineconfig)
18
- datavalidation=DataValidation(dataingestionartifact,data_validation_config)
19
  logging.info("Initiate the data validation")
20
- data_validation_artifact=datavalidation.initiate_data_validation()
21
  logging.info(f"Data validation completed {data_validation_artifact}")
22
  print(data_validation_artifact)
 
 
 
 
 
 
23
 
24
  except Exception as e:
25
  raise NetworkSecurityException(e, sys)
 
1
  from networksecurity.components.data_ingestion import DataIngestion
2
  from networksecurity.components.data_validation import DataValidation
3
+ from networksecurity.components.data_transformation import DataTransformation
4
+
5
  from networksecurity.exception.exception import NetworkSecurityException
6
  from networksecurity.logging.logger import logging
7
+ from networksecurity.entity.config_entity import DataIngestionConfig,DataValidationConfig,DataTransformationConfig
8
  from networksecurity.entity.config_entity import TraningPipelineConfig
9
  import sys
10
 
 
12
  try:
13
  traningpipelineconfig=TraningPipelineConfig()
14
  dataingestionconfig=DataIngestionConfig(traningpipelineconfig)
15
+ data_ingestion=DataIngestion(dataingestionconfig)
16
  logging.info("Initiate the data ingestion")
17
+ dataingestionartifact=data_ingestion.initiate_data_ingestion()
18
+ logging.info(f"Data ingestion completed {dataingestionartifact}")
19
  print(dataingestionartifact)
20
  data_validation_config=DataValidationConfig(traningpipelineconfig)
21
+ data_validation=DataValidation(dataingestionartifact,data_validation_config)
22
  logging.info("Initiate the data validation")
23
+ data_validation_artifact=data_validation.initiate_data_validation()
24
  logging.info(f"Data validation completed {data_validation_artifact}")
25
  print(data_validation_artifact)
26
+ data_transformation_config=DataTransformationConfig(traningpipelineconfig)
27
+ data_transformation=DataTransformation(data_transformation_config,data_validation_artifact)
28
+ logging.info("Initiate the data transformation")
29
+ data_transformation_artifact=data_transformation.initiate_data_transformation()
30
+ logging.info(f"Data transformation completed {data_transformation_artifact}")
31
+ print(data_transformation_artifact)
32
 
33
  except Exception as e:
34
  raise NetworkSecurityException(e, sys)
networksecurity/components/data_transformation.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os,sys
2
+ import numpy as np
3
+ import pandas as pd
4
+ from sklearn.impute import KNNImputer
5
+ from sklearn.pipeline import Pipeline
6
+ from networksecurity.constant.training_pipeline import TARGET_COLUMN, DATA_TRANSFORMATION_IMPUTER_PARAMS
7
+ from networksecurity.entity.artifact_entity import DataTransformationArtifact, DataValidationArtifact
8
+ from networksecurity.entity.config_entity import DataTransformationConfig
9
+ from networksecurity.exception.exception import NetworkSecurityException
10
+ from networksecurity.logging.logger import logging
11
+ from networksecurity.utils.main_utils.utils import save_numpy_array_data, save_object
12
+
13
class DataTransformation:
    """Impute missing feature values and persist the transformed datasets.

    The stage reads the validated train/test CSV files referenced by a
    ``DataValidationArtifact``, fits a KNN-imputer pipeline on the training
    features, applies it to both splits, and writes the resulting arrays and
    the fitted pipeline to the paths held by a ``DataTransformationConfig``.
    """

    def __init__(self,
                 data_transformation_config: DataTransformationConfig,
                 data_validation_artifact: DataValidationArtifact):
        """Store the stage configuration and the upstream validation artifact.

        Args:
            data_transformation_config: Output paths for this stage.
            data_validation_artifact: Provides ``valid_train_file_path`` and
                ``valid_test_file_path`` of the validated CSV files.

        Raises:
            NetworkSecurityException: If initialisation fails.
        """
        try:
            logging.info(f"{'>>'*20} Data Transformation {'<<'*20}")
            self.data_transformation_config = data_transformation_config
            self.data_validation_artifact = data_validation_artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    @staticmethod
    def read_data(file_path) -> pd.DataFrame:
        """Load a CSV file into a DataFrame.

        Args:
            file_path: Path of the CSV file to read.

        Returns:
            The parsed DataFrame.

        Raises:
            NetworkSecurityException: If the file cannot be read or parsed.
        """
        try:
            return pd.read_csv(file_path)
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    @staticmethod
    def _split_features_and_target(df: pd.DataFrame):
        """Split a DataFrame into (features, target), mapping target -1 to 0."""
        features = df.drop(columns=[TARGET_COLUMN])
        target = df[TARGET_COLUMN].replace(-1, 0)
        return features, target

    def get_data_transformer_object(cls) -> Pipeline:
        """Build the preprocessing pipeline.

        Creates a ``KNNImputer`` from ``DATA_TRANSFORMATION_IMPUTER_PARAMS``
        and wraps it as the single ``"imputer"`` step of a ``Pipeline``.

        Args:
            cls: The receiver. The method carries no ``@classmethod``
                decorator and is invoked on an instance, so this is the
                instance itself; it is not used.

        Returns:
            Pipeline: Unfitted pipeline whose first step is the KNNImputer.

        Raises:
            NetworkSecurityException: If the pipeline cannot be constructed.
        """
        logging.info("Entered the get_data_transformer_object method of Data_Transformation class")
        try:
            imputer = KNNImputer(**DATA_TRANSFORMATION_IMPUTER_PARAMS)
            logging.info("Created KNNImputer object with the parameters specified in the training_pipeline.py file")
            preprocessor = Pipeline(steps=[("imputer", imputer)])
            return preprocessor
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def initiate_data_transformation(self) -> DataTransformationArtifact:
        """Run the transformation stage end to end.

        Returns:
            DataTransformationArtifact: Paths of the saved preprocessor
            object and of the transformed train and test arrays.

        Raises:
            NetworkSecurityException: Wraps any error raised while reading,
                transforming, or saving.
        """
        logging.info("Entered initiate_data_transformation method of DataTransformation class")
        try:
            logging.info("Starting data transformation")
            train_df = DataTransformation.read_data(self.data_validation_artifact.valid_train_file_path)
            test_df = DataTransformation.read_data(self.data_validation_artifact.valid_test_file_path)

            input_feature_train_df, target_feature_train_df = self._split_features_and_target(train_df)
            input_feature_test_df, target_feature_test_df = self._split_features_and_target(test_df)

            preprocessor = self.get_data_transformer_object()

            # Fit on the training features only; the test split is transformed
            # with the statistics learned from the training data.
            preprocessor_object = preprocessor.fit(input_feature_train_df)
            transformed_input_train_feature = preprocessor_object.transform(input_feature_train_df)
            transformed_input_test_feature = preprocessor_object.transform(input_feature_test_df)

            # Append the target as the last column of each transformed array.
            train_arr = np.c_[transformed_input_train_feature, np.array(target_feature_train_df)]
            test_arr = np.c_[transformed_input_test_feature, np.array(target_feature_test_df)]

            # Persist both arrays and the fitted preprocessor.
            save_numpy_array_data(
                file_path=self.data_transformation_config.transformed_train_file_path,
                array=train_arr,
            )
            save_numpy_array_data(
                file_path=self.data_transformation_config.transformed_test_file_path,
                array=test_arr,
            )
            save_object(
                file_path=self.data_transformation_config.transformed_object_file_path,
                obj=preprocessor_object,
            )

            data_transformation_artifact = DataTransformationArtifact(
                transformed_object_file_path=self.data_transformation_config.transformed_object_file_path,
                transformed_train_file_path=self.data_transformation_config.transformed_train_file_path,
                transformed_test_file_path=self.data_transformation_config.transformed_test_file_path,
            )
            # Log before returning; placed after the return this line never ran.
            logging.info("Data transformation completed")
            return data_transformation_artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys)
networksecurity/constant/training_pipeline/__init__.py CHANGED
@@ -36,3 +36,16 @@ DATA_VALIDATION_VALID_DIR: str ="validated"
36
  DATA_VALIDATION_INVALID_DIR: str ="invalid"
37
  DATA_VALIDATION_DRIFT_REPORT_DIR: str ="drift_report"
38
  DATA_VALIDATION_DRIFT_REPORT_FILE_NAME: str ="report.yaml"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
  DATA_VALIDATION_INVALID_DIR: str ="invalid"
37
  DATA_VALIDATION_DRIFT_REPORT_DIR: str ="drift_report"
38
  DATA_VALIDATION_DRIFT_REPORT_FILE_NAME: str ="report.yaml"
39
+
40
+ """
41
+ Data Transformation related constant start with DATA_TRANSFORMATION VAR NAME
42
+ """
43
# Directory names used under the artifact directory by the data-transformation stage.
DATA_TRANSFORMATION_DIR_NAME: str = "data_transformation"
DATA_TRANSFORMATION_TRANSFORMED_DATA_DIR: str = "transformed"
DATA_TRANSFORMATION_TRANSFORMED_OBJECT_DIR: str = "transformed_object"

# Keyword arguments unpacked into the KNNImputer constructor.
DATA_TRANSFORMATION_IMPUTER_PARAMS: dict = dict(
    missing_values=np.nan,
    n_neighbors=3,
    weights="uniform",
)

# File name under which the fitted preprocessing object is pickled.
PREPROCESSING_OBJECT_FILE_NAME: str = "preprocessing_object.pkl"
networksecurity/entity/artifact_entity.py CHANGED
@@ -12,4 +12,10 @@ class DataValidationArtifact:
12
  valid_test_file_path: str
13
  invalid_train_file_path: str
14
  invalid_test_file_path: str
15
- drift_report_file_path: str
 
 
 
 
 
 
 
12
  valid_test_file_path: str
13
  invalid_train_file_path: str
14
  invalid_test_file_path: str
15
+ drift_report_file_path: str
16
+
17
@dataclass
class DataTransformationArtifact:
    """Paths of the files written by the data-transformation stage."""

    # Saved transformed training array.
    transformed_train_file_path: str
    # Saved transformed test array.
    transformed_test_file_path: str
    # Pickled, fitted preprocessing object.
    transformed_object_file_path: str
networksecurity/entity/config_entity.py CHANGED
@@ -83,3 +83,28 @@ class DataValidationConfig:
83
  training_pipeline.DATA_VALIDATION_DRIFT_REPORT_DIR,
84
  training_pipeline.DATA_VALIDATION_DRIFT_REPORT_FILE_NAME,
85
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  training_pipeline.DATA_VALIDATION_DRIFT_REPORT_DIR,
84
  training_pipeline.DATA_VALIDATION_DRIFT_REPORT_FILE_NAME,
85
  )
86
+
87
class DataTransformationConfig:
    """Output locations for the data-transformation stage.

    All paths are rooted at ``<artifact_dir>/<DATA_TRANSFORMATION_DIR_NAME>``.

    Attributes:
        data_transformation_dir: Root directory of this stage's outputs.
        transformed_train_file_path: Path of the transformed training array.
        transformed_test_file_path: Path of the transformed test array.
        transformed_object_file_path: Path of the pickled preprocessor.
    """

    def __init__(self, training_pipeline_config: TraningPipelineConfig):
        """Derive every output path from the pipeline's artifact directory.

        Args:
            training_pipeline_config: Supplies ``artifact_dir``.
        """
        self.data_transformation_dir: str = os.path.join(
            training_pipeline_config.artifact_dir,
            training_pipeline.DATA_TRANSFORMATION_DIR_NAME,
        )

        transformed_data_dir = os.path.join(
            self.data_transformation_dir,
            training_pipeline.DATA_TRANSFORMATION_TRANSFORMED_DATA_DIR,
        )

        self.transformed_train_file_path: str = os.path.join(
            transformed_data_dir,
            self._as_npy_name(training_pipeline.TRAIN_FILE_NAME),
        )

        self.transformed_test_file_path: str = os.path.join(
            transformed_data_dir,
            self._as_npy_name(training_pipeline.TEST_FILE_NAME),
        )

        self.transformed_object_file_path: str = os.path.join(
            self.data_transformation_dir,
            training_pipeline.DATA_TRANSFORMATION_TRANSFORMED_OBJECT_DIR,
            training_pipeline.PREPROCESSING_OBJECT_FILE_NAME,
        )

    @staticmethod
    def _as_npy_name(file_name: str) -> str:
        """Return ``file_name`` with its extension swapped for ``.npy``.

        Only the extension is touched; a plain ``str.replace("csv", "npy")``
        would also rewrite a "csv" occurring inside the stem.
        """
        stem, _ = os.path.splitext(file_name)
        return f"{stem}.npy"
networksecurity/utils/main_utils/utils.py CHANGED
@@ -29,5 +29,29 @@ def write_yaml_file(file_path: str, content: object, replace: bool = False) -> N
29
  os.makedirs(os.path.dirname(file_path), exist_ok=True)
30
  with open(file_path, 'w') as file:
31
  yaml.dump(content, file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  except Exception as e:
33
  raise NetworkSecurityException(e, sys)
 
29
  os.makedirs(os.path.dirname(file_path), exist_ok=True)
30
  with open(file_path, 'w') as file:
31
  yaml.dump(content, file)
32
+ except Exception as e:
33
+ raise NetworkSecurityException(e, sys)
34
+
35
def save_numpy_array_data(file_path: str, array: np.ndarray) -> None:
    """Save a numpy array to ``file_path`` in ``.npy`` format.

    Args:
        file_path: Destination file; missing parent directories are created.
        array: Array to be saved.

    Raises:
        NetworkSecurityException: Wraps any error raised while creating the
            directory or writing the file.
    """
    try:
        dir_path = os.path.dirname(file_path)
        # A bare file name has no directory part, and os.makedirs("") raises.
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)
        with open(file_path, 'wb') as file_obj:
            np.save(file_obj, array)
    except Exception as e:
        raise NetworkSecurityException(e, sys)
48
+
49
def save_object(file_path: str, obj: object) -> None:
    """Pickle ``obj`` to ``file_path``.

    Args:
        file_path: Destination file; missing parent directories are created.
        obj: Any picklable object.

    Raises:
        NetworkSecurityException: Wraps any error raised while creating the
            directory or pickling the object.
    """
    try:
        logging.info("Entered the save_object method of Main Utils")
        dir_path = os.path.dirname(file_path)
        # A bare file name has no directory part, and os.makedirs("") raises.
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)
        with open(file_path, 'wb') as file_obj:
            pickle.dump(obj, file_obj)
        logging.info("Exited the save_object method of Main Utils")
    except Exception as e:
        raise NetworkSecurityException(e, sys)
requirements.txt CHANGED
@@ -6,6 +6,5 @@ pymongo
6
  certifi
7
  pymongo[srv]==3.11
8
  scikit-learn
9
- dill
10
  pyaml
11
  #-e .
 
6
  certifi
7
  pymongo[srv]==3.11
8
  scikit-learn
 
9
  pyaml
10
  #-e .