Spaces:

abrahamcbe
/

myspace-ooty-analytics

Sleeping

File size: 3,523 Bytes

04b129a

"""
Data processing and cleaning module
"""

import pandas as pd
import numpy as np
from typing import Union, List, Tuple


def clean_data(df: pd.DataFrame, remove_duplicates: bool = True, 
               handle_missing: str = "drop") -> pd.DataFrame:
    """
    Clean dataset by removing duplicates and handling missing values

    The input DataFrame is never modified; all work is done on a copy.

    Args:
        df: Input DataFrame
        remove_duplicates: Whether to remove duplicate rows. When enabled,
            the number of removed rows is printed.
        handle_missing: Strategy for missing values:
            'drop'         - remove every row that contains a missing value
            'mean'         - fill numeric columns with the column mean
            'median'       - fill numeric columns with the column median
            'forward_fill' - propagate the last valid value downwards
            Any other value leaves missing values untouched.

    Returns:
        Cleaned DataFrame
    """
    df_clean = df.copy()

    if remove_duplicates:
        initial_shape = df_clean.shape[0]
        df_clean = df_clean.drop_duplicates()
        print(f"Removed {initial_shape - df_clean.shape[0]} duplicate rows")

    if handle_missing == "drop":
        df_clean = df_clean.dropna()
    elif handle_missing in ("mean", "median"):
        # Only numeric columns have a mean/median; other columns keep
        # their missing values.
        numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:
            numeric_part = df_clean[numeric_cols]
            if handle_missing == "mean":
                fill_values = numeric_part.mean()
            else:
                fill_values = numeric_part.median()
            df_clean[numeric_cols] = numeric_part.fillna(fill_values)
    elif handle_missing == "forward_fill":
        # fillna(method='ffill') is deprecated in recent pandas releases;
        # DataFrame.ffill() is the supported equivalent.
        df_clean = df_clean.ffill()

    return df_clean


def remove_outliers(df: pd.DataFrame, columns: List[str], 
                    method: str = "iqr", threshold: float = 1.5) -> pd.DataFrame:
    """
    Remove outliers using IQR or Z-score method

    The input DataFrame is never modified; filtering is done on a copy.
    With either method, a row whose checked value is missing (NaN) is
    dropped, because a NaN never satisfies the "within bounds" comparison.

    Args:
        df: Input DataFrame
        columns: List of column names to check for outliers
        method: 'iqr' or 'zscore'. Any other value returns an unfiltered copy.
            'iqr'    - columns are filtered one after another, so the
                       quartiles of a later column are computed on the rows
                       that survived the earlier columns.
            'zscore' - only the numeric columns among `columns` are checked;
                       a row is kept when |z| < threshold in all of them.
        threshold: Threshold for outlier detection (IQR multiplier for 'iqr',
            maximum absolute z-score for 'zscore')

    Returns:
        DataFrame without outliers
    """
    df_clean = df.copy()

    if method == "iqr":
        for col in columns:
            q1 = df_clean[col].quantile(0.25)
            q3 = df_clean[col].quantile(0.75)
            iqr = q3 - q1
            lower = q1 - threshold * iqr
            upper = q3 + threshold * iqr
            df_clean = df_clean[(df_clean[col] >= lower) & (df_clean[col] <= upper)]

    elif method == "zscore":
        numeric = df_clean[columns].select_dtypes(include=[np.number])
        # Mean and std skip NaN, so one missing value no longer turns the
        # whole column's z-scores into NaN (which used to drop every row).
        deviations = numeric - numeric.mean()
        # Population standard deviation (ddof=0).
        stds = numeric.std(ddof=0)
        # A constant column has std == 0; dividing by 1 instead gives z == 0,
        # i.e. "no outliers", rather than 0/0 = NaN for every row.
        divisors = stds.where(stds > 0, 1.0)
        z_scores = (deviations / divisors).abs()
        keep = (z_scores < threshold).all(axis=1)
        # Positional boolean mask: avoids index alignment on duplicate labels.
        df_clean = df_clean[keep.to_numpy()]

    return df_clean


def normalize_columns(df: pd.DataFrame, columns: List[str], 
                      method: str = "minmax") -> Tuple[pd.DataFrame, dict]:
    """
    Normalize specified columns

    The input DataFrame is never modified; scaling is done on a copy.
    A constant column (max == min, or zero standard deviation) is mapped
    to 0.0 instead of producing NaN from a division by zero.

    Args:
        df: Input DataFrame
        columns: List of column names to normalize
        method: 'minmax' scales each column to the [0, 1] range;
            'standard' subtracts the mean and divides by the sample
            standard deviation. Any other value returns an unscaled copy
            and an empty parameter dict.

    Returns:
        Normalized DataFrame and scaling parameters. The parameters map each
        column name to {"min", "max"} for 'minmax' or {"mean", "std"} for
        'standard', holding the values computed from the original column.
    """
    df_norm = df.copy()
    scaling_params = {}

    if method == "minmax":
        for col in columns:
            min_val = df_norm[col].min()
            max_val = df_norm[col].max()
            value_range = max_val - min_val
            # Guard against 0/0 for constant columns: every value equals
            # min_val, so dividing the zero numerator by 1 yields 0.0.
            divisor = value_range if value_range > 0 else 1.0
            df_norm[col] = (df_norm[col] - min_val) / divisor
            scaling_params[col] = {"min": min_val, "max": max_val}

    elif method == "standard":
        for col in columns:
            mean_val = df_norm[col].mean()
            std_val = df_norm[col].std()
            # std is 0 for constant columns and NaN for a single value;
            # in both cases fall back to 1 so the centred values stay 0.0.
            divisor = std_val if std_val > 0 else 1.0
            df_norm[col] = (df_norm[col] - mean_val) / divisor
            scaling_params[col] = {"mean": mean_val, "std": std_val}

    return df_norm, scaling_params