import os
import sys

import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline

from networksecurity.constant.training_pipeline import (
    TARGET_COLUMN,
    DATA_TRANSFORMATION_IMPUTER_PARAMS,
)
from networksecurity.entity.artifact_entity import (
    DataTransformationArtifact,
    DataValidationArtifact,
)
from networksecurity.entity.config_entity import DataTransformationConfig
from networksecurity.exception.exception import NetworkSecurityException
from networksecurity.logging.logger import logging
from networksecurity.utils.main_utils.utils import save_numpy_array_data, save_object


class DataTransformation:
    """Applies KNN imputation to validated train/test data.

    Reads the validated CSV files produced by data validation, remaps the
    target labels (-1 -> 0), fits a KNNImputer pipeline on the training
    inputs, transforms both splits, and persists the transformed numpy
    arrays and the fitted preprocessor object.
    """

    def __init__(
        self,
        data_transformation_config: DataTransformationConfig,
        data_validation_artifact: DataValidationArtifact,
    ):
        """Store the transformation config and the validation artifact.

        Args:
            data_transformation_config: output file paths for the
                transformed arrays and the preprocessor object.
            data_validation_artifact: paths of the validated train/test
                CSV files to transform.

        Raises:
            NetworkSecurityException: wraps any error raised during setup.
        """
        try:
            logging.info(f"{'>>'*20} Data Transformation {'<<'*20}")
            self.data_transformation_config = data_transformation_config
            self.data_validation_artifact = data_validation_artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    @staticmethod
    def read_data(file_path) -> pd.DataFrame:
        """Read a CSV file into a pandas DataFrame.

        Args:
            file_path: path of the CSV file to load.

        Returns:
            pd.DataFrame: the file's contents.

        Raises:
            NetworkSecurityException: wraps any pandas/IO error.
        """
        try:
            return pd.read_csv(file_path)
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    # NOTE: the original signature named the implicit instance parameter
    # `cls` without @classmethod; it is called via `self.`, so it is `self`.
    def get_data_transformer_object(self) -> Pipeline:
        """
        It initializes the KNNImputer object with the parameters specified
        in the training_pipeline.py file and returns the pipeline object
        with the KNNImputer object as the first step.

        Returns:
            Pipeline: Pipeline object with the KNNImputer object as the
                first step.

        Raises:
            NetworkSecurityException: wraps any error during construction.
        """
        logging.info("Entered the get_data_transformer_object method of Data_Transformation class")
        try:
            imputer = KNNImputer(**DATA_TRANSFORMATION_IMPUTER_PARAMS)
            logging.info("Created KNNImputer object with the parameters specified in the training_pipeline.py file")
            preprocessor = Pipeline(steps=[("imputer", imputer)])
            return preprocessor
        except Exception as e:
            raise NetworkSecurityException(e, sys)

    def initiate_data_transformation(self) -> DataTransformationArtifact:
        """Run the full data-transformation step.

        Reads the validated train/test data, splits inputs from the target,
        remaps target label -1 to 0, fits the imputer pipeline on the
        training inputs, transforms both splits, and saves the resulting
        arrays and the fitted preprocessor.

        Returns:
            DataTransformationArtifact: paths of the saved transformed
                train/test arrays and preprocessor object.

        Raises:
            NetworkSecurityException: wraps any error raised during the run.
        """
        logging.info("Entered initiate_data_transformation method of DataTransformation class")
        try:
            logging.info("Starting data transformation")
            train_df = DataTransformation.read_data(self.data_validation_artifact.valid_train_file_path)
            test_df = DataTransformation.read_data(self.data_validation_artifact.valid_test_file_path)

            # Training dataframe: split inputs/target and remap the
            # negative-class label -1 to 0 for binary classification.
            input_feature_train_df = train_df.drop(columns=[TARGET_COLUMN], axis=1)
            target_feature_train_df = train_df[TARGET_COLUMN].replace(-1, 0)

            # Testing dataframe: same split and label remapping.
            input_feature_test_df = test_df.drop(columns=[TARGET_COLUMN], axis=1)
            target_feature_test_df = test_df[TARGET_COLUMN].replace(-1, 0)

            # Fit the imputer on training inputs only (no test leakage),
            # then transform both splits with the fitted object.
            preprocessor = self.get_data_transformer_object()
            preprocessor_object = preprocessor.fit(input_feature_train_df)
            transformed_input_train_feature = preprocessor_object.transform(input_feature_train_df)
            transformed_input_test_feature = preprocessor_object.transform(input_feature_test_df)

            # Append the target as the last column of each array.
            train_arr = np.c_[transformed_input_train_feature, np.array(target_feature_train_df)]
            test_arr = np.c_[transformed_input_test_feature, np.array(target_feature_test_df)]

            ## Save numpy array data and preprocessor object
            save_numpy_array_data(
                file_path=self.data_transformation_config.transformed_train_file_path,
                array=train_arr,
            )
            save_numpy_array_data(
                file_path=self.data_transformation_config.transformed_test_file_path,
                array=test_arr,
            )
            save_object(
                file_path=self.data_transformation_config.transformed_object_file_path,
                obj=preprocessor_object,
            )

            data_transformation_artifact = DataTransformationArtifact(
                transformed_object_file_path=self.data_transformation_config.transformed_object_file_path,
                transformed_train_file_path=self.data_transformation_config.transformed_train_file_path,
                transformed_test_file_path=self.data_transformation_config.transformed_test_file_path,
            )
            # Bug fix: this log call was originally placed after the
            # return statement and was therefore unreachable.
            logging.info("Data transformation completed")
            return data_transformation_artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys)