# NSS / src/components/data_transformation.py
# Author: Kshitijk20
# Commit a21e473: Clean deployment without binary files
import sys
import os
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.pipeline import Pipeline
from src.constant.training_pipeline import TARGET_COLUMN
from src.constant.training_pipeline import DATA_TRANSFORMATION_IMPUTER_PARAMS
from src.entity.artifact_entity import (
DataTransformationArtifact,
DataValidationArtifact,
)
from src.exception.exception import NetworkSecurityException
from src.logging.logger import logging
from src.utils.main_utils.utils import save_np_array, save_object
from src.entity.config_entity import Data_transformation_config
class DataTransformation:
    """Data-transformation stage of the training pipeline.

    Reads the validated train/test CSVs, imputes missing values with a
    KNNImputer wrapped in a sklearn Pipeline, and persists the transformed
    numpy arrays and the fitted preprocessor object to the paths given by
    the transformation config.
    """

    def __init__(
        self,
        data_validation_artifact: DataValidationArtifact,
        data_transformation_config: Data_transformation_config,
    ):
        """Store the validation artifact (input paths) and transformation config (output paths).

        Args:
            data_validation_artifact: carries the validated train/test file paths.
            data_transformation_config: carries the output file paths for the
                transformed arrays and the preprocessor object.

        Raises:
            NetworkSecurityException: wraps any error raised during initialisation.
        """
        try:
            self.data_validation_artifact: DataValidationArtifact = data_validation_artifact
            self.data_transformation_config: Data_transformation_config = data_transformation_config
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    @staticmethod
    def read_data(file_path) -> pd.DataFrame:
        """Read a CSV file into a DataFrame.

        Raises:
            NetworkSecurityException: wraps any pandas / IO error.
        """
        try:
            return pd.read_csv(file_path)
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def get_data_transformer_object(self) -> Pipeline:
        """Build the (unfitted) preprocessing pipeline.

        Initialises a KNNImputer with the parameters specified in
        DATA_TRANSFORMATION_IMPUTER_PARAMS (training_pipeline constants)
        and returns a Pipeline whose single step is that imputer.

        Returns:
            Pipeline: sklearn pipeline with one "imputer" step.

        Raises:
            NetworkSecurityException: wraps any error during construction.
        """
        # Fixed typos in the log messages ("methof", "intialise").
        logging.info("Entered get_data_transformer_object method of DataTransformation class")
        try:
            knn_imputer = KNNImputer(**DATA_TRANSFORMATION_IMPUTER_PARAMS)
            logging.info(f"Initialised KNNImputer with {DATA_TRANSFORMATION_IMPUTER_PARAMS}")
            return Pipeline(steps=[("imputer", knn_imputer)])
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e

    def initiate_data_transformation(self) -> DataTransformationArtifact:
        """Run the transformation stage end to end.

        Fits the imputer pipeline on the training features, transforms both
        splits, appends the target column (with -1 labels remapped to 0),
        saves the arrays and the fitted preprocessor, and returns an artifact
        pointing at the saved files.

        Returns:
            DataTransformationArtifact: paths of the saved preprocessor and
            transformed train/test arrays.

        Raises:
            NetworkSecurityException: wraps any error during the stage.
        """
        try:
            logging.info("Started data transformation!")
            train_df = DataTransformation.read_data(self.data_validation_artifact.valid_train_file_path)
            test_df = DataTransformation.read_data(self.data_validation_artifact.valid_test_file_path)

            # Split features/target; `axis=1` was redundant with `columns=` and is dropped.
            input_feature_train_df = train_df.drop(columns=[TARGET_COLUMN])
            # Remap the -1 label to 0 so the target is a clean binary {0, 1}.
            target_feature_train_df = train_df[TARGET_COLUMN].replace(-1, 0)

            input_feature_test_df = test_df.drop(columns=[TARGET_COLUMN])
            target_feature_test_df = test_df[TARGET_COLUMN].replace(-1, 0)

            preprocessor = self.get_data_transformer_object()
            # Fit only on the training split to avoid leaking test statistics.
            preprocessor_obj = preprocessor.fit(input_feature_train_df)
            logging.info("Preprocessor object created and fitted on training data")
            transformed_input_train_feature = preprocessor_obj.transform(input_feature_train_df)
            transformed_input_test_feature = preprocessor_obj.transform(input_feature_test_df)

            # Append the target as the last column of each array.
            train_arr = np.c_[transformed_input_train_feature, target_feature_train_df.to_numpy()]
            test_arr = np.c_[transformed_input_test_feature, target_feature_test_df.to_numpy()]

            # Persist arrays and the fitted preprocessor (config paths + a
            # fixed copy used by the deployed model).
            save_np_array(self.data_transformation_config.transformed_train_file_path, array=train_arr)
            save_np_array(self.data_transformation_config.transformed_test_file_path, array=test_arr)
            save_object(self.data_transformation_config.transformed_object_file_path, preprocessor_obj)
            save_object("final_model/preprocessor.pkl", preprocessor_obj)

            # Renamed local from PascalCase `Data_transformation_artifact` (PEP 8).
            data_transformation_artifact = DataTransformationArtifact(
                transformed_object_file_path=self.data_transformation_config.transformed_object_file_path,
                transformed_train_file_path=self.data_transformation_config.transformed_train_file_path,
                transformed_test_file_path=self.data_transformation_config.transformed_test_file_path,
            )
            return data_transformation_artifact
        except Exception as e:
            raise NetworkSecurityException(e, sys) from e