Spaces:

abrahamcbe
/

myspace-ooty-analytics

Sleeping

myspace-ooty-analytics / src /data_processing.py

abraham9486937737

Deploy MySpace Ooty Analytics to Hugging Face - with KPI styling updates

04b129a 20 days ago

3.52 kB

	"""
	Data processing and cleaning module
	"""

	import pandas as pd
	import numpy as np
	from typing import Union, List, Tuple


	def clean_data(df: pd.DataFrame, remove_duplicates: bool = True,
	               handle_missing: str = "drop") -> pd.DataFrame:
	    """
	    Clean a dataset by removing duplicate rows and handling missing values.

	    The input frame is never modified; all work happens on a copy.

	    Args:
	        df: Input DataFrame
	        remove_duplicates: Whether to remove fully duplicated rows. The number
	            of removed rows is printed to stdout.
	        handle_missing: Strategy for missing values:
	            'drop'         - drop every row containing a missing value
	            'mean'         - fill numeric columns with their column mean
	            'median'       - fill numeric columns with their column median
	            'forward_fill' - propagate the last valid value downwards
	            Any other value leaves missing values untouched.

	    Returns:
	        Cleaned DataFrame
	    """
	    df_clean = df.copy()

	    if remove_duplicates:
	        initial_shape = df_clean.shape[0]
	        df_clean = df_clean.drop_duplicates()
	        print(f"Removed {initial_shape - df_clean.shape[0]} duplicate rows")

	    if handle_missing == "drop":
	        df_clean = df_clean.dropna()
	    elif handle_missing in ("mean", "median"):
	        # Only numeric columns have a mean/median; other columns keep their gaps.
	        numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
	        numeric_part = df_clean[numeric_cols]
	        if handle_missing == "mean":
	            fill_values = numeric_part.mean()
	        else:
	            fill_values = numeric_part.median()
	        df_clean[numeric_cols] = numeric_part.fillna(fill_values)
	    elif handle_missing == "forward_fill":
	        # DataFrame.ffill() replaces the deprecated fillna(method='ffill').
	        df_clean = df_clean.ffill()

	    return df_clean


	def remove_outliers(df: pd.DataFrame, columns: List[str],
	                    method: str = "iqr", threshold: float = 1.5) -> pd.DataFrame:
	    """
	    Remove outlier rows using the IQR or Z-score method.

	    The input frame is never modified; filtering happens on a copy.

	    Args:
	        df: Input DataFrame
	        columns: List of column names to check for outliers
	        method: 'iqr' or 'zscore'. Any other value returns an unfiltered copy.
	        threshold: For 'iqr', the multiple of the IQR added beyond Q1/Q3 to form
	            the accepted range. For 'zscore', the absolute z-score a row must
	            stay strictly below in every checked column.

	    Returns:
	        DataFrame without outliers. In both methods a row with a missing value
	        in a checked column is dropped, because the comparison is False for NaN.
	    """
	    df_clean = df.copy()

	    if method == "iqr":
	        # Columns are filtered one after another, so the quartiles of a later
	        # column are computed on the rows that survived the earlier ones.
	        for col in columns:
	            q1 = df_clean[col].quantile(0.25)
	            q3 = df_clean[col].quantile(0.75)
	            iqr = q3 - q1
	            lower = q1 - threshold * iqr
	            upper = q3 + threshold * iqr
	            df_clean = df_clean[(df_clean[col] >= lower) & (df_clean[col] <= upper)]

	    elif method == "zscore":
	        # Only numeric columns can be z-scored; others in `columns` are ignored.
	        numeric = df_clean[columns].select_dtypes(include=[np.number])
	        # Population standard deviation (ddof=0).
	        std = numeric.std(ddof=0)
	        # A constant column has std 0; dividing by it would turn every z-score
	        # into NaN and drop all rows. Use 1 so such a column scores 0 instead.
	        safe_std = std.where(std != 0, 1.0)
	        z_scores = ((numeric - numeric.mean()) / safe_std).abs()
	        df_clean = df_clean[(z_scores < threshold).all(axis=1)]

	    return df_clean


	def normalize_columns(df: pd.DataFrame, columns: List[str],
	                      method: str = "minmax") -> Tuple[pd.DataFrame, dict]:
	    """
	    Normalize the specified columns.

	    The input frame is never modified; scaling happens on a copy.

	    Args:
	        df: Input DataFrame
	        columns: List of column names to normalize
	        method: 'minmax' (rescale to the [0, 1] range) or 'standard'
	            (subtract the mean, divide by the sample standard deviation).
	            Any other value returns an unscaled copy and an empty dict.

	    Returns:
	        Normalized DataFrame and scaling parameters. The parameters map each
	        column name to {"min", "max"} for 'minmax' or {"mean", "std"} for
	        'standard', holding the values measured before scaling. A column with
	        zero spread is mapped to 0.0 rather than NaN.
	    """
	    df_norm = df.copy()
	    scaling_params = {}

	    if method == "minmax":
	        for col in columns:
	            min_val = df_norm[col].min()
	            max_val = df_norm[col].max()
	            span = max_val - min_val
	            # Constant column: avoid 0/0 -> NaN by dividing by 1 instead.
	            if span == 0:
	                span = 1.0
	            df_norm[col] = (df_norm[col] - min_val) / span
	            scaling_params[col] = {"min": min_val, "max": max_val}

	    elif method == "standard":
	        for col in columns:
	            mean_val = df_norm[col].mean()
	            std_val = df_norm[col].std()
	            # std is 0 for a constant column and NaN for a single value;
	            # dividing by either would produce NaN, so divide by 1 instead.
	            divisor = 1.0 if (pd.isna(std_val) or std_val == 0) else std_val
	            df_norm[col] = (df_norm[col] - mean_val) / divisor
	            scaling_params[col] = {"mean": mean_val, "std": std_val}

	    return df_norm, scaling_params