# env/utils/cleaners.py
# Hugging Face Hub page residue (not part of the module):
#   "sairaj2's picture" / "Upload folder using huggingface_hub" / "8d6e02d verified"
import pandas as pd
import numpy as np
import re
from typing import Optional, Union
class DataCleaners:
    """Stateless collection of pandas DataFrame cleaning helpers.

    No method reads instance state, and every method returns a new
    DataFrame rather than modifying the one passed in.
    """

    # Compiled once at class creation instead of once per cell.
    _WHITESPACE_RE = re.compile(r'\s+')
    _DISALLOWED_RE = re.compile(r'[^\w\s\-.,@]')

    @staticmethod
    def _auto_strategy(series: pd.Series) -> str:
        """Pick the fill strategy that 'auto' maps to for one column."""
        if pd.api.types.is_numeric_dtype(series):
            return 'median'
        if pd.api.types.is_datetime64_any_dtype(series):
            return 'ffill'
        return 'mode'

    def fill_missing_values(
        self,
        df: pd.DataFrame,
        column: Optional[str] = None,
        strategy: str = 'auto',
    ) -> pd.DataFrame:
        """Fill missing values using the specified strategy.

        Args:
            df: Input frame; it is copied, not modified.
            column: Column to fill. ``None`` processes every column.
            strategy: One of ``'auto'``, ``'mean'``, ``'median'``, ``'mode'``,
                ``'ffill'``, ``'zero'`` or ``'empty'``. ``'auto'`` chooses
                per column: median for numeric columns, forward-fill for
                datetime columns, mode for everything else. Any other value
                leaves the data unchanged.

        Returns:
            A copy of ``df`` with the selected columns filled.
        """
        df = df.copy()
        columns = df.columns if column is None else [column]

        for col in columns:
            series = df[col]
            if not series.isna().any():
                continue

            # Resolve 'auto' into a per-column local. Overwriting `strategy`
            # itself would make the first column's choice stick for every
            # later column (e.g. median applied to a text column).
            if strategy == 'auto':
                col_strategy = self._auto_strategy(series)
            else:
                col_strategy = strategy

            if col_strategy == 'mean':
                df[col] = series.fillna(series.mean())
            elif col_strategy == 'median':
                df[col] = series.fillna(series.median())
            elif col_strategy == 'mode':
                modes = series.mode()
                # mode() is empty when the whole column is missing.
                fill_value = modes.iloc[0] if not modes.empty else ''
                df[col] = series.fillna(fill_value)
            elif col_strategy == 'ffill':
                # Forward-fill, then back-fill so leading gaps are covered too.
                # fillna(method=...) is deprecated in recent pandas releases.
                df[col] = series.ffill().bfill()
            elif col_strategy == 'zero':
                df[col] = series.fillna(0)
            elif col_strategy == 'empty':
                df[col] = series.fillna('')
        return df

    def remove_duplicates(
        self, df: pd.DataFrame, subset: Optional[list] = None
    ) -> pd.DataFrame:
        """Remove duplicate rows, keeping the first occurrence.

        Args:
            df: Input frame.
            subset: Column labels used to detect duplicates. ``None``
                compares all columns.

        Returns:
            A new frame without duplicates and with a fresh 0..n-1 index.
        """
        deduplicated = df.drop_duplicates(subset=subset, keep='first')
        return deduplicated.reset_index(drop=True)

    def remove_outliers(
        self,
        df: pd.DataFrame,
        column: str,
        method: str = 'IQR',
        threshold: float = 1.5,
    ) -> pd.DataFrame:
        """Remove rows whose value in a numeric column is an outlier.

        Rows where ``column`` is missing are dropped by both methods.

        Args:
            df: Input frame.
            column: Numeric column to inspect.
            method: ``'IQR'`` keeps values within
                ``[Q1 - threshold * IQR, Q3 + threshold * IQR]``;
                ``'zscore'`` keeps values whose absolute z-score (population
                standard deviation, ddof=0) is below ``threshold``. Any other
                value filters nothing.
            threshold: IQR multiplier or z-score cut-off. The same default
                (1.5) is used for both methods.

        Returns:
            A new, filtered frame with a fresh 0..n-1 index.
        """
        values = df[column]

        if method == 'IQR':
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            margin = threshold * (q3 - q1)
            # between() is inclusive on both ends and False for NaN.
            df = df[values.between(q1 - margin, q3 + margin)]
        elif method == 'zscore':
            # Compute on the full column so the mask stays aligned with df's
            # index; a mask built from dropna() no longer matches df whenever
            # the column contains NaN.
            std = values.std(ddof=0)
            if pd.isna(std) or std == 0:
                # No spread means no outliers; only missing rows are dropped.
                keep = values.notna()
            else:
                keep = ((values - values.mean()) / std).abs() < threshold
            df = df[keep]
        return df.reset_index(drop=True)

    def drop_column(self, df: pd.DataFrame, column: str) -> pd.DataFrame:
        """Return ``df`` without ``column``; a missing column is ignored."""
        return df.drop(columns=[column], errors='ignore')

    def clean_text_column(self, df: pd.DataFrame, column: str) -> pd.DataFrame:
        """Clean text values: remove special chars, collapse whitespace, trim.

        Characters other than word characters, whitespace and ``- . , @`` are
        removed. Letter case is left untouched. Missing values pass through
        unchanged; other non-string values are converted with ``str()``.

        Args:
            df: Input frame; it is copied, not modified.
            column: Column whose values are cleaned.

        Returns:
            A copy of ``df`` with ``column`` cleaned.
        """
        df = df.copy()
        disallowed = self._DISALLOWED_RE
        whitespace = self._WHITESPACE_RE

        def clean_text(text):
            if pd.isna(text):
                return text
            # Remove characters first: deleting them can leave doubled or
            # edge spaces, which the collapse and strip steps then tidy up.
            cleaned = disallowed.sub('', str(text))
            cleaned = whitespace.sub(' ', cleaned)
            return cleaned.strip()

        df[column] = df[column].apply(clean_text)
        return df