File size: 3,523 Bytes
04b129a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
"""
Data processing and cleaning module
"""

import pandas as pd
import numpy as np
from typing import Union, List, Tuple


def clean_data(df: pd.DataFrame, remove_duplicates: bool = True,
               handle_missing: str = "drop") -> pd.DataFrame:
    """
    Clean a dataset by removing duplicate rows and handling missing values.

    The input frame is never modified; all work happens on a copy.

    Args:
        df: Input DataFrame
        remove_duplicates: Whether to remove duplicate rows (the number of
            removed rows is printed)
        handle_missing: Strategy for missing values:
            'drop'         - drop every row containing a missing value
            'mean'         - fill numeric columns with their column mean
            'median'       - fill numeric columns with their column median
            'forward_fill' - propagate the last valid value downwards
            Any other value leaves missing values untouched.

    Returns:
        Cleaned DataFrame
    """
    df_clean = df.copy()

    if remove_duplicates:
        initial_shape = df_clean.shape[0]
        df_clean = df_clean.drop_duplicates()
        print(f"Removed {initial_shape - df_clean.shape[0]} duplicate rows")

    if handle_missing == "drop":
        df_clean = df_clean.dropna()
    elif handle_missing in ("mean", "median"):
        # Only numeric columns have a mean/median; other columns keep their NaNs.
        numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
        numeric_part = df_clean[numeric_cols]
        if handle_missing == "mean":
            fill_values = numeric_part.mean()
        else:
            fill_values = numeric_part.median()
        df_clean[numeric_cols] = numeric_part.fillna(fill_values)
    elif handle_missing == "forward_fill":
        # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
        # Leading missing values have nothing to copy from and stay missing.
        df_clean = df_clean.ffill()

    return df_clean


def remove_outliers(df: pd.DataFrame, columns: List[str],
                    method: str = "iqr", threshold: float = 1.5) -> pd.DataFrame:
    """
    Remove outlier rows using the IQR or Z-score method.

    The input frame is never modified; filtering happens on a copy.

    Args:
        df: Input DataFrame
        columns: List of column names to check for outliers
        method: 'iqr' or 'zscore'. Any other value returns an unfiltered copy.
        threshold: Threshold for outlier detection. For 'iqr' it is the
            multiple of the IQR added beyond the quartiles; for 'zscore' it
            is the absolute z-score a row must stay strictly below.

    Returns:
        DataFrame without outliers. In both methods, rows with a missing
        value in one of the checked columns are removed as well.
    """
    df_clean = df.copy()

    if method == "iqr":
        # Columns are filtered one after another, so the quartiles of a later
        # column are computed on the rows that survived the earlier columns.
        for col in columns:
            q1 = df_clean[col].quantile(0.25)
            q3 = df_clean[col].quantile(0.75)
            iqr = q3 - q1
            lower = q1 - threshold * iqr
            upper = q3 + threshold * iqr
            df_clean = df_clean[df_clean[col].between(lower, upper)]

    elif method == "zscore":
        # Non-numeric columns among `columns` are ignored.
        numeric = df_clean[columns].select_dtypes(include=[np.number])
        keep = np.ones(len(df_clean), dtype=bool)

        for position in range(numeric.shape[1]):
            values = numeric.iloc[:, position]
            if values.nunique(dropna=True) <= 1:
                # A constant column has zero spread and therefore no outliers;
                # dividing by its std would yield NaN and drop every row.
                z_scores = values.where(values.isna(), 0.0)
            else:
                # mean()/std() skip NaN, so one missing value does not turn
                # the whole column's z-scores into NaN. ddof=0 is the
                # population standard deviation.
                z_scores = (values - values.mean()) / values.std(ddof=0)
            # NaN z-scores compare as False, so rows with missing values drop.
            keep &= (z_scores.abs() < threshold).to_numpy()

        df_clean = df_clean[keep]

    return df_clean


def normalize_columns(df: pd.DataFrame, columns: List[str],
                      method: str = "minmax") -> Tuple[pd.DataFrame, dict]:
    """
    Normalize specified columns.

    The input frame is never modified; scaling happens on a copy.

    Args:
        df: Input DataFrame
        columns: List of column names to normalize
        method: 'minmax' (rescale to [0, 1]) or 'standard' (subtract the mean,
            divide by the sample standard deviation). Any other value returns
            an unscaled copy and an empty parameter dict.

    Returns:
        Normalized DataFrame and scaling parameters. The parameters map each
        column to {"min", "max"} for 'minmax' or {"mean", "std"} for
        'standard', holding the statistics measured on the input data.
    """
    df_norm = df.copy()
    scaling_params = {}

    if method == "minmax":
        for col in columns:
            min_val = df_norm[col].min()
            max_val = df_norm[col].max()
            value_range = max_val - min_val
            # A constant column has a zero range; dividing by it would produce
            # NaN. Divide by 1 instead so the column maps to 0.0.
            divisor = 1 if value_range == 0 else value_range
            df_norm[col] = (df_norm[col] - min_val) / divisor
            scaling_params[col] = {"min": min_val, "max": max_val}

    elif method == "standard":
        for col in columns:
            mean_val = df_norm[col].mean()
            std_val = df_norm[col].std()
            # std is 0 for a constant column and NaN when fewer than two
            # values are present; either would poison the column, so the
            # centered values are left unscaled in that case.
            divisor = 1 if (pd.isna(std_val) or std_val == 0) else std_val
            df_norm[col] = (df_norm[col] - mean_val) / divisor
            scaling_params[col] = {"mean": mean_val, "std": std_val}

    return df_norm, scaling_params