| import os |
| import sys |
| from src.exception import CustomException |
| from src.logger import logging |
| import pandas as pd |
| import numpy as np |
| from dataclasses import dataclass |
|
|
|
|
| @dataclass |
| class DataCleaningConfig: |
| raw_train_data_path: str = os.path.join('notebook', 'data', 'train.csv') |
| raw_test_data_path: str = os.path.join('notebook', 'data', 'test.csv') |
|
|
| cleaned_train_data_path: str = os.path.join('notebook', 'data', 'train_eda_clean.csv') |
| cleaned_test_data_path: str = os.path.join('notebook', 'data', 'test_eda_clean.csv') |
|
|
|
|
| class DataCleaning: |
| def __init__(self): |
| self.cleaning_config = DataCleaningConfig() |
|
|
| def _drop_duplicates(self, df: pd.DataFrame) -> pd.DataFrame: |
| try: |
| initial_shape = df.shape |
| df = df.drop_duplicates() |
| final_shape = df.shape |
| logging.info(f"Dropped {initial_shape[0] - final_shape[0]} duplicate rows.") |
| return df |
| except Exception as e: |
| raise CustomException(e, sys) |
| |
| def _drop_unnecessary_columns(self, df: pd.DataFrame) -> pd.DataFrame: |
| try: |
| cols_to_drop = ['MachineID', 'IsBetaUser', 'AutoSampleSubmissionEnabled', 'IsFlightsDisabled', 'SMode', 'HasTpm', 'IsVirtualDevice', 'IsPortableOS', "DeviceFamily", 'EnableLUA', |
| "OSBuildLab", "OSBuildNumberOnly", "SKUEditionName", "OSSkuFriendlyName", "OSInstallLanguageID", "Processor", "OSVersion" |
| ] |
| |
| existing_cols_to_drop = [col for col in cols_to_drop if col in df.columns] |
| if existing_cols_to_drop: |
| df = df.drop(columns=existing_cols_to_drop) |
| logging.info(f"Dropped columns: {existing_cols_to_drop}") |
| return df |
| except Exception as e: |
| raise CustomException(e, sys) |
| |
| def _handle_missing_values(self, df: pd.DataFrame) -> pd.DataFrame: |
| try: |
| if 'target' in df.columns: |
| initial_shape = df.shape |
| df = df.dropna(subset=['target']) |
| logging.info(f"Dropped {initial_shape[0] - df.shape[0]} rows due to missing target.") |
| return df |
| except Exception as e: |
| raise CustomException(e, sys) |
| |
| def initiate_data_cleaning(self): |
| logging.info(f"Entered the data cleaning component...") |
| try: |
| train_df = pd.read_csv(self.cleaning_config.raw_train_data_path) |
| test_df = pd.read_csv(self.cleaning_config.raw_test_data_path) |
| logging.info(f"Read raw train [{train_df.shape}] and test [{test_df.shape}] dataset") |
|
|
| os.makedirs(os.path.dirname(self.cleaning_config.cleaned_train_data_path), exist_ok=True) |
|
|
| logging.info("Cleaning train data...") |
| train_df = self._drop_duplicates(train_df) |
| train_df = self._drop_unnecessary_columns(train_df) |
| train_df = self._handle_missing_values(train_df) |
|
|
| logging.info("Cleaning test data...") |
| |
| test_df = self._drop_unnecessary_columns(test_df) |
| test_df = self._handle_missing_values(test_df) |
|
|
| train_df.to_csv(self.cleaning_config.cleaned_train_data_path, index=False) |
| test_df.to_csv(self.cleaning_config.cleaned_test_data_path, index=False) |
| logging.info("Cleaned train and test data saved successfully.") |
|
|
| return ( |
| self.cleaning_config.cleaned_train_data_path, |
| self.cleaning_config.cleaned_test_data_path, |
| train_df.shape, |
| test_df.shape |
| ) |
| except Exception as e: |
| raise CustomException(e, sys) |
| |
| if __name__ == "__main__": |
| |
| cleaner = DataCleaning() |
| cleaned_train_path, cleaned_test_path, train_shape, test_shape = cleaner.initiate_data_cleaning() |
| print(f"Data Cleaning Completed!\nCleaned Train shape: {train_shape}\nCleaned Test Path: {test_shape}") |