Spaces:
Runtime error
Runtime error
Inder-26 committed on
Commit ·
eae2854
1
Parent(s): f228efb
Data Transformation done
Browse files- main.py +14 -5
- networksecurity/components/data_transformation.py +99 -0
- networksecurity/constant/training_pipeline/__init__.py +13 -0
- networksecurity/entity/artifact_entity.py +7 -1
- networksecurity/entity/config_entity.py +25 -0
- networksecurity/utils/main_utils/utils.py +24 -0
- requirements.txt +0 -1
main.py
CHANGED
|
@@ -1,8 +1,10 @@
|
|
| 1 |
from networksecurity.components.data_ingestion import DataIngestion
|
| 2 |
from networksecurity.components.data_validation import DataValidation
|
|
|
|
|
|
|
| 3 |
from networksecurity.exception.exception import NetworkSecurityException
|
| 4 |
from networksecurity.logging.logger import logging
|
| 5 |
-
from networksecurity.entity.config_entity import DataIngestionConfig,DataValidationConfig
|
| 6 |
from networksecurity.entity.config_entity import TraningPipelineConfig
|
| 7 |
import sys
|
| 8 |
|
|
@@ -10,16 +12,23 @@ if __name__ == "__main__":
|
|
| 10 |
try:
|
| 11 |
traningpipelineconfig=TraningPipelineConfig()
|
| 12 |
dataingestionconfig=DataIngestionConfig(traningpipelineconfig)
|
| 13 |
-
|
| 14 |
logging.info("Initiate the data ingestion")
|
| 15 |
-
dataingestionartifact=
|
|
|
|
| 16 |
print(dataingestionartifact)
|
| 17 |
data_validation_config=DataValidationConfig(traningpipelineconfig)
|
| 18 |
-
|
| 19 |
logging.info("Initiate the data validation")
|
| 20 |
-
data_validation_artifact=
|
| 21 |
logging.info(f"Data validation completed {data_validation_artifact}")
|
| 22 |
print(data_validation_artifact)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
except Exception as e:
|
| 25 |
raise NetworkSecurityException(e, sys)
|
|
|
|
| 1 |
from networksecurity.components.data_ingestion import DataIngestion
from networksecurity.components.data_validation import DataValidation
from networksecurity.components.data_transformation import DataTransformation

from networksecurity.exception.exception import NetworkSecurityException
from networksecurity.logging.logger import logging
from networksecurity.entity.config_entity import DataIngestionConfig, DataValidationConfig, DataTransformationConfig
from networksecurity.entity.config_entity import TraningPipelineConfig
import sys

if __name__ == "__main__":
    # Drive the pipeline stages in order: ingestion -> validation -> transformation.
    try:
        pipeline_config = TraningPipelineConfig()

        # --- Data ingestion ---
        ingestion_config = DataIngestionConfig(pipeline_config)
        ingestion = DataIngestion(ingestion_config)
        logging.info("Initiate the data ingestion")
        ingestion_artifact = ingestion.initiate_data_ingestion()
        logging.info(f"Data ingestion completed {ingestion_artifact}")
        print(ingestion_artifact)

        # --- Data validation (consumes the ingestion artifact) ---
        validation_config = DataValidationConfig(pipeline_config)
        validation = DataValidation(ingestion_artifact, validation_config)
        logging.info("Initiate the data validation")
        validation_artifact = validation.initiate_data_validation()
        logging.info(f"Data validation completed {validation_artifact}")
        print(validation_artifact)

        # --- Data transformation (consumes the validation artifact) ---
        transformation_config = DataTransformationConfig(pipeline_config)
        transformation = DataTransformation(transformation_config, validation_artifact)
        logging.info("Initiate the data transformation")
        transformation_artifact = transformation.initiate_data_transformation()
        logging.info(f"Data transformation completed {transformation_artifact}")
        print(transformation_artifact)

    except Exception as e:
        raise NetworkSecurityException(e, sys)
|
networksecurity/components/data_transformation.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os,sys
|
| 2 |
+
import numpy as np
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from sklearn.impute import KNNImputer
|
| 5 |
+
from sklearn.pipeline import Pipeline
|
| 6 |
+
from networksecurity.constant.training_pipeline import TARGET_COLUMN, DATA_TRANSFORMATION_IMPUTER_PARAMS
|
| 7 |
+
from networksecurity.entity.artifact_entity import DataTransformationArtifact, DataValidationArtifact
|
| 8 |
+
from networksecurity.entity.config_entity import DataTransformationConfig
|
| 9 |
+
from networksecurity.exception.exception import NetworkSecurityException
|
| 10 |
+
from networksecurity.logging.logger import logging
|
| 11 |
+
from networksecurity.utils.main_utils.utils import save_numpy_array_data, save_object
|
| 12 |
+
|
| 13 |
+
class DataTransformation:
    """Transform the validated train/test CSV data for model training.

    Imputes missing values with a KNNImputer pipeline fitted on the training
    features, appends the (re-mapped) target as the last column, persists the
    transformed numpy arrays and the fitted preprocessor, and returns a
    DataTransformationArtifact describing the output paths.
    """

    def __init__(self,
                 data_transformation_config: DataTransformationConfig,
                 data_validation_artifact: DataValidationArtifact):
        """Store the stage config and the validation artifact whose
        valid_train_file_path / valid_test_file_path point at the input CSVs.

        Raises:
            NetworkSecurityException: wraps any unexpected error.
        """
        try:
            logging.info(f"{'>>'*20} Data Transformation {'<<'*20}")
            self.data_transformation_config = data_transformation_config
            self.data_validation_artifact = data_validation_artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    @staticmethod
    def read_data(file_path) -> pd.DataFrame:
        """Read a CSV file into a pandas DataFrame.

        Raises:
            NetworkSecurityException: wraps any read/parse error.
        """
        try:
            return pd.read_csv(file_path)
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    # NOTE(fix): the original named this instance method's first parameter
    # `cls` although it is not a classmethod; renamed to `self`.  Callers
    # invoke it as self.get_data_transformer_object(), so behavior is
    # unchanged.
    def get_data_transformer_object(self) -> Pipeline:
        """
        It initializes the KNNImputer object with the parameters specified in the training_pipeline.py file
        and returns the pipeline object with the KNNImputer object as the first step.

        Returns:
            Pipeline: Pipeline object with the KNNImputer object as the first step.
        """
        logging.info("Entered the get_data_transformer_object method of Data_Transformation class")
        try:
            imputer = KNNImputer(**DATA_TRANSFORMATION_IMPUTER_PARAMS)
            logging.info("Created KNNImputer object with the parameters specified in the training_pipeline.py file")
            preprocessor = Pipeline(steps=[("imputer", imputer)])
            return preprocessor
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def initiate_data_transformation(self) -> DataTransformationArtifact:
        """Run the transformation stage end to end.

        Returns:
            DataTransformationArtifact: paths of the transformed train/test
            arrays and the pickled preprocessor object.

        Raises:
            NetworkSecurityException: wraps any error raised while reading,
            transforming, or persisting the data.
        """
        logging.info("Entered initiate_data_transformation method of DataTransformation class")
        try:
            logging.info("Starting data transformation")
            train_df = DataTransformation.read_data(self.data_validation_artifact.valid_train_file_path)
            test_df = DataTransformation.read_data(self.data_validation_artifact.valid_test_file_path)

            ## Training dataframe
            input_feature_train_df = train_df.drop(columns=[TARGET_COLUMN], axis=1)
            target_feature_train_df = train_df[TARGET_COLUMN]
            # Map the -1 label to 0 so targets are in {0, 1}.
            target_feature_train_df = target_feature_train_df.replace(-1, 0)

            ## Testing dataframe
            input_feature_test_df = test_df.drop(columns=[TARGET_COLUMN], axis=1)
            target_feature_test_df = test_df[TARGET_COLUMN]
            target_feature_test_df = target_feature_test_df.replace(-1, 0)

            preprocessor = self.get_data_transformer_object()

            # Fit the imputer on the training features only, then transform
            # both splits with the same fitted object (no test-set leakage).
            preprocessor_object = preprocessor.fit(input_feature_train_df)
            transformed_input_train_feature = preprocessor_object.transform(input_feature_train_df)
            transformed_input_test_feature = preprocessor_object.transform(input_feature_test_df)

            # Append the target as the last column of each array.
            train_arr = np.c_[transformed_input_train_feature, np.array(target_feature_train_df)]
            test_arr = np.c_[transformed_input_test_feature, np.array(target_feature_test_df)]

            ## Save numpy array data and preprocessor object
            save_numpy_array_data(
                file_path=self.data_transformation_config.transformed_train_file_path,
                array=train_arr
            )
            save_numpy_array_data(
                file_path=self.data_transformation_config.transformed_test_file_path,
                array=test_arr
            )
            save_object(
                file_path=self.data_transformation_config.transformed_object_file_path,
                obj=preprocessor_object
            )

            data_transformation_artifact = DataTransformationArtifact(
                transformed_object_file_path=self.data_transformation_config.transformed_object_file_path,
                transformed_train_file_path=self.data_transformation_config.transformed_train_file_path,
                transformed_test_file_path=self.data_transformation_config.transformed_test_file_path,
            )
            # Fix: this log statement was placed AFTER `return` in the
            # original and was therefore unreachable; moved before it.
            logging.info("Data transformation completed")
            return data_transformation_artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys)
|
networksecurity/constant/training_pipeline/__init__.py
CHANGED
|
@@ -36,3 +36,16 @@ DATA_VALIDATION_VALID_DIR: str ="validated"
|
|
| 36 |
DATA_VALIDATION_INVALID_DIR: str ="invalid"
|
| 37 |
DATA_VALIDATION_DRIFT_REPORT_DIR: str ="drift_report"
|
| 38 |
DATA_VALIDATION_DRIFT_REPORT_FILE_NAME: str ="report.yaml"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
DATA_VALIDATION_INVALID_DIR: str ="invalid"
|
| 37 |
DATA_VALIDATION_DRIFT_REPORT_DIR: str ="drift_report"
|
| 38 |
DATA_VALIDATION_DRIFT_REPORT_FILE_NAME: str ="report.yaml"
|
| 39 |
+
|
| 40 |
+
"""
Data Transformation related constant start with DATA_TRANSFORMATION VAR NAME
"""
# Directory layout for the data-transformation stage artifacts.
DATA_TRANSFORMATION_DIR_NAME: str = "data_transformation"
DATA_TRANSFORMATION_TRANSFORMED_DATA_DIR: str = "transformed"
DATA_TRANSFORMATION_TRANSFORMED_OBJECT_DIR: str = "transformed_object"

# Keyword arguments forwarded verbatim to sklearn.impute.KNNImputer.
DATA_TRANSFORMATION_IMPUTER_PARAMS: dict = {
    "missing_values": np.nan,   # cells to impute
    "n_neighbors": 3,
    "weights": "uniform",
}

# File name under which the fitted preprocessing pipeline is pickled.
PREPROCESSING_OBJECT_FILE_NAME: str = "preprocessing_object.pkl"
|
networksecurity/entity/artifact_entity.py
CHANGED
|
@@ -12,4 +12,10 @@ class DataValidationArtifact:
|
|
| 12 |
valid_test_file_path: str
|
| 13 |
invalid_train_file_path: str
|
| 14 |
invalid_test_file_path: str
|
| 15 |
-
drift_report_file_path: str
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
valid_test_file_path: str
|
| 13 |
invalid_train_file_path: str
|
| 14 |
invalid_test_file_path: str
|
| 15 |
+
drift_report_file_path: str
|
| 16 |
+
|
| 17 |
+
@dataclass
class DataTransformationArtifact:
    """Output of the data-transformation stage: where the transformed
    arrays and the fitted preprocessor were written."""
    transformed_train_file_path: str   # .npy array of transformed train features + target
    transformed_test_file_path: str    # .npy array of transformed test features + target
    transformed_object_file_path: str  # pickled preprocessing pipeline
|
networksecurity/entity/config_entity.py
CHANGED
|
@@ -83,3 +83,28 @@ class DataValidationConfig:
|
|
| 83 |
training_pipeline.DATA_VALIDATION_DRIFT_REPORT_DIR,
|
| 84 |
training_pipeline.DATA_VALIDATION_DRIFT_REPORT_FILE_NAME,
|
| 85 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
training_pipeline.DATA_VALIDATION_DRIFT_REPORT_DIR,
|
| 84 |
training_pipeline.DATA_VALIDATION_DRIFT_REPORT_FILE_NAME,
|
| 85 |
)
|
| 86 |
+
|
| 87 |
+
class DataTransformationConfig:
    """Filesystem layout for the data-transformation stage, derived from the
    pipeline's artifact directory and the training_pipeline constants."""

    def __init__(self, training_pipeline_config: TraningPipelineConfig):
        # Root for all transformation outputs: <artifact_dir>/<data_transformation>
        self.data_transformation_dir: str = os.path.join(
            training_pipeline_config.artifact_dir,
            training_pipeline.DATA_TRANSFORMATION_DIR_NAME
        )
        # Common parent for both transformed arrays (hoisted: was repeated).
        transformed_data_dir = os.path.join(
            self.data_transformation_dir,
            training_pipeline.DATA_TRANSFORMATION_TRANSFORMED_DATA_DIR,
        )
        # Fix: replace only the ".csv" extension.  The original used
        # .replace("csv", "npy"), which rewrites EVERY occurrence of the
        # substring "csv" in the file name; identical result for the normal
        # "train.csv"/"test.csv" names.
        self.transformed_train_file_path: str = os.path.join(
            transformed_data_dir,
            training_pipeline.TRAIN_FILE_NAME.replace(".csv", ".npy")
        )
        self.transformed_test_file_path: str = os.path.join(
            transformed_data_dir,
            training_pipeline.TEST_FILE_NAME.replace(".csv", ".npy")
        )
        # Pickled preprocessor: <dir>/<transformed_object>/<preprocessing_object.pkl>
        self.transformed_object_file_path: str = os.path.join(
            self.data_transformation_dir,
            training_pipeline.DATA_TRANSFORMATION_TRANSFORMED_OBJECT_DIR,
            training_pipeline.PREPROCESSING_OBJECT_FILE_NAME
        )
|
networksecurity/utils/main_utils/utils.py
CHANGED
|
@@ -29,5 +29,29 @@ def write_yaml_file(file_path: str, content: object, replace: bool = False) -> N
|
|
| 29 |
os.makedirs(os.path.dirname(file_path), exist_ok=True)
|
| 30 |
with open(file_path, 'w') as file:
|
| 31 |
yaml.dump(content, file)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
except Exception as e:
|
| 33 |
raise NetworkSecurityException(e, sys)
|
|
|
|
| 29 |
os.makedirs(os.path.dirname(file_path), exist_ok=True)
|
| 30 |
with open(file_path, 'w') as file:
|
| 31 |
yaml.dump(content, file)
|
| 32 |
+
except Exception as e:
|
| 33 |
+
raise NetworkSecurityException(e, sys)
|
| 34 |
+
|
| 35 |
+
def save_numpy_array_data(file_path: str, array: np.ndarray):
    """
    Save numpy array data to file
    file_path : str : file path to save the numpy array
    array : np.ndarray : numpy array data to be saved
    """
    try:
        dir_path = os.path.dirname(file_path)
        # Fix: os.makedirs("") raises FileNotFoundError when file_path has no
        # directory component; only create the directory when one is present.
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)
        with open(file_path, 'wb') as file_obj:
            np.save(file_obj, array)
    except Exception as e:
        raise NetworkSecurityException(e, sys)
|
| 48 |
+
|
| 49 |
+
def save_object(file_path: str, obj: object) -> None:
    """Pickle *obj* to *file_path*, creating parent directories as needed.

    Raises:
        NetworkSecurityException: wraps any OS or pickling error.
    """
    try:
        logging.info("Entered the save_object method of Main Utils")
        target_dir = os.path.dirname(file_path)
        os.makedirs(target_dir, exist_ok=True)
        with open(file_path, 'wb') as out_stream:
            pickle.dump(obj, out_stream)
        logging.info("Exited the save_object method of Main Utils")
    except Exception as e:
        raise NetworkSecurityException(e, sys)
|
requirements.txt
CHANGED
|
@@ -6,6 +6,5 @@ pymongo
|
|
| 6 |
certifi
|
| 7 |
pymongo[srv]==3.11
|
| 8 |
scikit-learn
|
| 9 |
-
dill
|
| 10 |
pyaml
|
| 11 |
#-e .
|
|
|
|
| 6 |
certifi
|
| 7 |
pymongo[srv]==3.11
|
| 8 |
scikit-learn
|
|
|
|
| 9 |
pyaml
|
| 10 |
#-e .
|