myspace-ooty-analytics / src /data_processing.py
abraham9486937737
Deploy MySpace Ooty Analytics to Hugging Face - with KPI styling updates
04b129a
"""
Data processing and cleaning module
"""
import pandas as pd
import numpy as np
from typing import Union, List, Tuple
def clean_data(df: pd.DataFrame, remove_duplicates: bool = True,
               handle_missing: str = "drop") -> pd.DataFrame:
    """
    Clean a dataset by removing duplicate rows and handling missing values.

    Args:
        df: Input DataFrame. It is copied; the caller's frame is not modified.
        remove_duplicates: Whether to remove duplicate rows (the number of
            removed rows is printed).
        handle_missing: Strategy for missing values:
            'drop'         - drop every row that contains a missing value
            'mean'         - fill numeric columns with their column mean
            'median'       - fill numeric columns with their column median
            'forward_fill' - propagate the last valid value downwards
            Any other value leaves missing values untouched.

    Returns:
        Cleaned DataFrame
    """
    df_clean = df.copy()

    if remove_duplicates:
        initial_rows = df_clean.shape[0]
        df_clean = df_clean.drop_duplicates()
        print(f"Removed {initial_rows - df_clean.shape[0]} duplicate rows")

    if handle_missing == "drop":
        df_clean = df_clean.dropna()
    elif handle_missing in ("mean", "median"):
        # Only numeric columns have a mean/median; other columns keep
        # their missing values.
        numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
        numeric = df_clean[numeric_cols]
        if handle_missing == "mean":
            fill_values = numeric.mean()
        else:
            fill_values = numeric.median()
        df_clean[numeric_cols] = numeric.fillna(fill_values)
    elif handle_missing == "forward_fill":
        # DataFrame.ffill() is the supported spelling; the equivalent
        # fillna(method='ffill') is deprecated in recent pandas releases.
        df_clean = df_clean.ffill()

    return df_clean
def remove_outliers(df: pd.DataFrame, columns: List[str],
                    method: str = "iqr", threshold: float = 1.5) -> pd.DataFrame:
    """
    Remove outlier rows using the IQR or Z-score method.

    Args:
        df: Input DataFrame. It is copied; the caller's frame is not modified.
        columns: List of column names to check for outliers
        method: 'iqr' or 'zscore'. Any other value returns an unfiltered copy.
        threshold: For 'iqr', the multiple of the IQR added beyond Q1/Q3 to
            form the accepted range. For 'zscore', the absolute z-score a
            value must stay strictly below to be kept.

    Returns:
        DataFrame without outliers

    Notes:
        - Rows with a missing value in a checked column are dropped by both
          methods, because a missing value never satisfies the comparison.
        - 'iqr' filters column by column, so the bounds for later columns
          are computed on the rows that survived the earlier ones.
        - 'zscore' only considers the numeric columns among ``columns`` and
          uses the population standard deviation (ddof=0). A constant column
          has no spread and therefore never marks a row as an outlier.
    """
    df_clean = df.copy()

    if method == "iqr":
        for col in columns:
            q1 = df_clean[col].quantile(0.25)
            q3 = df_clean[col].quantile(0.75)
            iqr = q3 - q1
            lower = q1 - threshold * iqr
            upper = q3 + threshold * iqr
            df_clean = df_clean[(df_clean[col] >= lower) & (df_clean[col] <= upper)]
    elif method == "zscore":
        numeric = df_clean[columns].select_dtypes(include=[np.number])
        keep = np.ones(len(df_clean), dtype=bool)
        for _, values in numeric.items():
            present = values.notna().to_numpy()
            if values.nunique(dropna=True) <= 1:
                # Zero spread: the z-score is undefined (0/0). Treat every
                # present value as an inlier instead of dropping all rows.
                keep &= present
                continue
            z_scores = ((values - values.mean()) / values.std(ddof=0)).abs()
            # NaN z-scores compare False, so rows with missing values drop out.
            keep &= (z_scores < threshold).to_numpy()
        # Positional boolean mask avoids any index alignment surprises.
        df_clean = df_clean[keep]

    return df_clean
def normalize_columns(df: pd.DataFrame, columns: List[str],
                      method: str = "minmax") -> Tuple[pd.DataFrame, dict]:
    """
    Normalize specified columns.

    Args:
        df: Input DataFrame. It is copied; the caller's frame is not modified.
        columns: List of column names to normalize
        method: 'minmax' rescales each column with (x - min) / (max - min);
            'standard' rescales with (x - mean) / std (sample std).
            Any other value returns an unchanged copy and empty parameters.

    Returns:
        Normalized DataFrame and scaling parameters. The parameters map each
        column name to {"min", "max"} or {"mean", "std"} as computed from
        the input data.

    Notes:
        A column with no spread (max == min, std == 0, or an undefined std
        such as for a single row) would otherwise divide by zero/NaN. Such
        columns are divided by 1 instead, so their values become 0.0.
    """
    df_norm = df.copy()
    scaling_params = {}

    if method == "minmax":
        for col in columns:
            min_val = df_norm[col].min()
            max_val = df_norm[col].max()
            value_range = max_val - min_val
            # Guard against 0/0 for constant columns.
            divisor = 1.0 if value_range == 0 else value_range
            df_norm[col] = (df_norm[col] - min_val) / divisor
            scaling_params[col] = {"min": min_val, "max": max_val}
    elif method == "standard":
        for col in columns:
            mean_val = df_norm[col].mean()
            std_val = df_norm[col].std()
            # std is 0 for constant columns and NaN when fewer than two
            # values are present; neither is a usable divisor.
            divisor = 1.0 if (pd.isna(std_val) or std_val == 0) else std_val
            df_norm[col] = (df_norm[col] - mean_val) / divisor
            scaling_params[col] = {"mean": mean_val, "std": std_val}

    return df_norm, scaling_params