Spaces:
Sleeping
Sleeping
| """ | |
| Data processing and cleaning module | |
| """ | |
| import pandas as pd | |
| import numpy as np | |
| from typing import Union, List, Tuple | |
def clean_data(df: pd.DataFrame, remove_duplicates: bool = True,
               handle_missing: str = "drop") -> pd.DataFrame:
    """
    Clean a dataset by removing duplicate rows and handling missing values.

    The input frame is never modified; all work is done on a copy.

    Args:
        df: Input DataFrame.
        remove_duplicates: Whether to remove duplicate rows. When True, the
            number of removed rows is printed to stdout.
        handle_missing: Strategy for missing values:
            'drop'         - drop every row containing a missing value;
            'mean'         - fill numeric columns with their column mean;
            'median'       - fill numeric columns with their column median;
            'forward_fill' - propagate the last valid observation forward.
            Any other value leaves missing values untouched.

    Returns:
        Cleaned DataFrame.
    """
    df_clean = df.copy()

    if remove_duplicates:
        initial_shape = df_clean.shape[0]
        df_clean = df_clean.drop_duplicates()
        print(f"Removed {initial_shape - df_clean.shape[0]} duplicate rows")

    if handle_missing == "drop":
        df_clean = df_clean.dropna()
    elif handle_missing in ("mean", "median"):
        # Only numeric columns have a meaningful mean/median; non-numeric
        # columns keep their missing values.
        numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
        numeric = df_clean[numeric_cols]
        if handle_missing == "mean":
            fill_values = numeric.mean()
        else:
            fill_values = numeric.median()
        df_clean[numeric_cols] = numeric.fillna(fill_values)
    elif handle_missing == "forward_fill":
        # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
        df_clean = df_clean.ffill()

    return df_clean
def remove_outliers(df: pd.DataFrame, columns: List[str],
                    method: str = "iqr", threshold: float = 1.5) -> pd.DataFrame:
    """
    Remove outlier rows using the IQR or Z-score method.

    The input frame is never modified; filtering is done on a copy. In both
    methods a row whose value in a checked column is missing is removed,
    because a missing value cannot satisfy the bounds check.

    Args:
        df: Input DataFrame.
        columns: List of column names to check for outliers.
        method: 'iqr' or 'zscore'. Any other value returns an unfiltered copy.
        threshold: For 'iqr', the multiple of the interquartile range added
            beyond Q1/Q3 to form the accepted interval (bounds inclusive).
            For 'zscore', the exclusive upper limit on the absolute z-score.

    Returns:
        DataFrame without outliers.
    """
    df_clean = df.copy()

    if method == "iqr":
        # Columns are filtered one after another, so the quartiles of later
        # columns are computed on the rows that survived earlier columns.
        for col in columns:
            q1 = df_clean[col].quantile(0.25)
            q3 = df_clean[col].quantile(0.75)
            iqr = q3 - q1
            lower = q1 - threshold * iqr
            upper = q3 + threshold * iqr
            df_clean = df_clean[df_clean[col].between(lower, upper)]
    elif method == "zscore":
        # Non-numeric columns among `columns` are ignored.
        numeric = df_clean[columns].select_dtypes(include=[np.number])
        deviation = (numeric - numeric.mean()).abs()
        # Population standard deviation (ddof=0), the same convention as
        # scipy.stats.zscore. pandas skips NaN here, so one missing value no
        # longer turns the whole column's z-scores into NaN.
        spread = numeric.std(ddof=0)
        # A zero-variance column has deviation 0 everywhere; dividing by 1
        # yields z = 0 instead of 0/0 = NaN, which would discard every row.
        z_scores = deviation / spread.replace(0, 1.0)
        keep = (z_scores < threshold).all(axis=1)
        df_clean = df_clean[keep.to_numpy()]

    return df_clean
def normalize_columns(df: pd.DataFrame, columns: List[str],
                      method: str = "minmax") -> Tuple[pd.DataFrame, dict]:
    """
    Normalize specified columns.

    The input frame is never modified; scaling is done on a copy.

    Args:
        df: Input DataFrame.
        columns: List of column names to normalize.
        method: 'minmax' scales each column as (x - min) / (max - min);
            'standard' scales each column as (x - mean) / std, using the
            pandas default sample standard deviation. Any other value
            returns an unscaled copy and an empty parameter dict.

    Returns:
        Normalized DataFrame and scaling parameters. The parameters map each
        column name to {"min", "max"} for 'minmax' or {"mean", "std"} for
        'standard', holding the statistics measured on the input column.
        A column with no spread (constant values, or a single row for
        'standard') is mapped to 0.0 instead of NaN.
    """
    df_norm = df.copy()
    scaling_params = {}

    if method == "minmax":
        for col in columns:
            min_val = df_norm[col].min()
            max_val = df_norm[col].max()
            divisor = _positive_or_one(max_val - min_val)
            df_norm[col] = (df_norm[col] - min_val) / divisor
            scaling_params[col] = {"min": min_val, "max": max_val}
    elif method == "standard":
        for col in columns:
            mean_val = df_norm[col].mean()
            std_val = df_norm[col].std()
            divisor = _positive_or_one(std_val)
            df_norm[col] = (df_norm[col] - mean_val) / divisor
            scaling_params[col] = {"mean": mean_val, "std": std_val}

    return df_norm, scaling_params


def _positive_or_one(scale):
    """Return `scale` when it is a positive number, otherwise 1.0.

    Guards the normalization divisions: a zero scale (constant column) or a
    NaN scale (e.g. the sample std of a single row) would otherwise turn the
    whole column into NaN.
    """
    # `scale > 0` is False for both 0 and NaN, so one test covers both cases.
    return scale if scale > 0 else 1.0