"""Data processing and cleaning module."""

from typing import Union, List, Tuple

import numpy as np
import pandas as pd


def clean_data(df: pd.DataFrame,
               remove_duplicates: bool = True,
               handle_missing: str = "drop") -> pd.DataFrame:
    """
    Clean a dataset by removing duplicates and handling missing values.

    The input frame is never modified; all work happens on a copy.

    Args:
        df: Input DataFrame
        remove_duplicates: Whether to remove duplicate rows. When True, the
            number of removed rows is printed to stdout.
        handle_missing: Strategy for missing values:
            'drop'         - drop every row containing a missing value
            'mean'         - fill numeric columns with their column mean
            'median'       - fill numeric columns with their column median
            'forward_fill' - propagate the last valid observation forward
            Any other value leaves missing values untouched.

    Returns:
        Cleaned DataFrame
    """
    df_clean = df.copy()

    if remove_duplicates:
        rows_before = df_clean.shape[0]
        df_clean = df_clean.drop_duplicates()
        print(f"Removed {rows_before - df_clean.shape[0]} duplicate rows")

    if handle_missing == "drop":
        df_clean = df_clean.dropna()
    elif handle_missing in ("mean", "median"):
        # Only numeric columns have a meaningful mean/median; other columns
        # keep their missing values.
        numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
        numeric_part = df_clean[numeric_cols]
        if handle_missing == "mean":
            fill_values = numeric_part.mean()
        else:
            fill_values = numeric_part.median()
        df_clean[numeric_cols] = numeric_part.fillna(fill_values)
    elif handle_missing == "forward_fill":
        # fillna(method='ffill') is deprecated (pandas 2.1) and removed in
        # pandas 3.0; DataFrame.ffill() is the supported equivalent.
        df_clean = df_clean.ffill()

    return df_clean


def remove_outliers(df: pd.DataFrame,
                    columns: List[str],
                    method: str = "iqr",
                    threshold: float = 1.5) -> pd.DataFrame:
    """
    Remove outlier rows using the IQR or Z-score method.

    The input frame is never modified; all work happens on a copy.
    Rows whose checked value is missing (NaN) are dropped by both methods.

    Args:
        df: Input DataFrame
        columns: List of column names to check for outliers
        method: 'iqr' or 'zscore'. Any other value returns an unfiltered copy.
        threshold: Threshold for outlier detection. For 'iqr' it is the
            multiple of the inter-quartile range added beyond Q1/Q3; for
            'zscore' it is the maximum allowed absolute z-score (exclusive).

    Returns:
        DataFrame without outliers
    """
    df_clean = df.copy()

    if method == "iqr":
        # Columns are filtered one after another, so quartiles for later
        # columns are computed on the rows that survived earlier columns.
        for col in columns:
            q1 = df_clean[col].quantile(0.25)
            q3 = df_clean[col].quantile(0.75)
            iqr = q3 - q1
            lower = q1 - threshold * iqr
            upper = q3 + threshold * iqr
            df_clean = df_clean[df_clean[col].between(lower, upper)]
    elif method == "zscore":
        # Non-numeric columns listed in `columns` are ignored.
        numeric = df_clean[columns].select_dtypes(include=[np.number])
        centered = numeric - numeric.mean()
        # Population standard deviation (ddof=0), the same convention as
        # scipy.stats.zscore.
        spread = numeric.std(ddof=0)
        # A constant column has zero spread; dividing by it would yield NaN
        # z-scores and wrongly discard every row. Use a unit divisor so the
        # z-score of such a column is 0.
        spread = spread.where(spread != 0, 1.0)
        z_scores = (centered / spread).abs()
        # Positional boolean mask, so duplicate index labels cannot cause
        # alignment problems.
        keep = (z_scores < threshold).all(axis=1).to_numpy()
        df_clean = df_clean[keep]

    return df_clean


def _safe_divisor(value):
    """Return `value`, or 1 when it is zero or NaN and cannot be a divisor."""
    if pd.isna(value) or value == 0:
        return 1
    return value


def normalize_columns(df: pd.DataFrame,
                      columns: List[str],
                      method: str = "minmax") -> Tuple[pd.DataFrame, dict]:
    """
    Normalize the specified columns.

    The input frame is never modified; all work happens on a copy.
    A column with no spread (max == min, std == 0, or an undefined std such
    as for a single row) is mapped to 0.0 instead of NaN/inf.

    Args:
        df: Input DataFrame
        columns: List of column names to normalize
        method: 'minmax' (scale to [0, 1]) or 'standard' (zero mean, unit
            sample standard deviation). Any other value returns an
            unmodified copy and empty scaling parameters.

    Returns:
        Normalized DataFrame and scaling parameters. The parameters map each
        column name to {"min", "max"} for 'minmax' or {"mean", "std"} for
        'standard', holding the statistics measured on the input data.
    """
    df_norm = df.copy()
    scaling_params = {}

    if method == "minmax":
        for col in columns:
            min_val = df_norm[col].min()
            max_val = df_norm[col].max()
            divisor = _safe_divisor(max_val - min_val)
            df_norm[col] = (df_norm[col] - min_val) / divisor
            scaling_params[col] = {"min": min_val, "max": max_val}
    elif method == "standard":
        for col in columns:
            mean_val = df_norm[col].mean()
            std_val = df_norm[col].std()
            divisor = _safe_divisor(std_val)
            df_norm[col] = (df_norm[col] - mean_val) / divisor
            scaling_params[col] = {"mean": mean_val, "std": std_val}

    return df_norm, scaling_params