Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| import logging | |
| # Set up logging configuration | |
| logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') | |
class OutlierDetector:
    """Drop rows of a DataFrame that hold an outlier in any numeric column.

    Outlier statistics are computed on the numeric columns only, so frames
    that also carry string/categorical columns can be processed. The rows
    returned by each method keep every original column.
    """

    def __init__(self, data: pd.DataFrame):
        """
        Initializes the OutlierDetector with data.

        Parameters:
            data : pd.DataFrame
                The data for outlier detection. It is stored by reference
                (not copied) and is never modified by this class.
        """
        self.data = data
        logging.info("OutlierDetector initialized with data of shape: %s", data.shape)

    def _numeric_data(self) -> pd.DataFrame:
        """Return only the numeric columns of the stored data."""
        return self.data.select_dtypes(include=[np.number])

    def z_score_outlier_detection(self, threshold: float = 3.0) -> pd.DataFrame:
        """Detect outliers using Z-Score method.

        Parameters:
            threshold : float
                A value is an outlier when its absolute Z-score is strictly
                greater than this number.

        Returns:
            pd.DataFrame
                The rows of the original data (all columns) in which no
                numeric column holds an outlier.
        """
        logging.info("Calculating Z-Scores for outlier detection.")
        numeric = self._numeric_data()
        # A constant column has zero standard deviation, which yields NaN
        # Z-scores; NaN > threshold is False, so such columns flag nothing.
        z_scores = ((numeric - numeric.mean()) / numeric.std()).abs()
        outliers = z_scores > threshold
        logging.info(
            "Detected %d outliers using Z-Score method.", int(outliers.sum().sum())
        )
        return self.data[~outliers.any(axis=1)]  # Return DataFrame without outliers

    def iqr_outlier_detection(self, multiplier: float = 1.5) -> pd.DataFrame:
        """Detect outliers using IQR method.

        Parameters:
            multiplier : float
                Width of the fences in units of the inter-quartile range.
                Values below Q1 - multiplier * IQR or above
                Q3 + multiplier * IQR are outliers.

        Returns:
            pd.DataFrame
                The rows of the original data (all columns) in which no
                numeric column holds an outlier.
        """
        logging.info("Calculating IQR for outlier detection.")
        numeric = self._numeric_data()
        q1 = numeric.quantile(0.25)
        q3 = numeric.quantile(0.75)
        iqr = q3 - q1
        lower_fence = q1 - multiplier * iqr
        upper_fence = q3 + multiplier * iqr
        outlier_condition = (numeric < lower_fence) | (numeric > upper_fence)
        logging.info(
            "Detected %d outliers using IQR method.",
            int(outlier_condition.sum().sum()),
        )
        return self.data[~outlier_condition.any(axis=1)]  # Return DataFrame without outliers

    def run_outlier_detection(self) -> pd.DataFrame:
        """Run all outlier detection methods and return cleaned data.

        Both methods are executed so that their outlier counts are logged,
        but only the IQR-cleaned frame is returned.

        Returns:
            pd.DataFrame
                The result of ``iqr_outlier_detection()`` with its defaults.
        """
        logging.info("Starting outlier detection steps.")

        # Only numerical columns take part in the outlier statistics.
        numerical_columns = self._numeric_data().columns.tolist()
        logging.info(
            "Selected numerical columns for outlier detection: %s", numerical_columns
        )

        # Z-Score Method (reported for comparison only)
        cleaned_data_z = self.z_score_outlier_detection()

        # IQR Method (this is the returned result)
        cleaned_data_iqr = self.iqr_outlier_detection()

        logging.info(
            "Rows retained: Z-Score=%d, IQR=%d (of %d).",
            len(cleaned_data_z),
            len(cleaned_data_iqr),
            len(self.data),
        )
        logging.info("Outlier detection completed.")
        return cleaned_data_iqr
# Usage Example
if __name__ == '__main__':
    # Running this module directly currently does nothing; the example below
    # is left disabled. run_outlier_detection() returns a single DataFrame
    # (the IQR-cleaned rows), so the result is used directly rather than
    # indexed by key.
    #
    #     df = pd.read_csv("extracted/customer_churn_dataset-training-master.csv")
    #     logging.info("Loaded dataset with shape: %s", df.shape)
    #     detector = OutlierDetector(df)
    #     cleaned_data = detector.run_outlier_detection()
    #     logging.info("Cleaned Data (IQR):")
    #     print(cleaned_data.head())
    pass