Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| import re | |
| from typing import Optional, Union | |
class DataCleaners:
    """Collection of DataFrame cleaning helpers.

    Every method returns a new DataFrame; the frame passed in is never
    modified in place.
    """

    def fill_missing_values(
        self,
        df: pd.DataFrame,
        column: Optional[str] = None,
        strategy: str = 'auto',
    ) -> pd.DataFrame:
        """Fill missing values in one column or in every column.

        Args:
            df: Frame to clean.
            column: Column to fill. ``None`` processes every column.
            strategy: One of ``'auto'``, ``'mean'``, ``'median'``, ``'mode'``,
                ``'ffill'``, ``'zero'`` or ``'empty'``. ``'auto'`` picks
                ``'median'`` for numeric columns, ``'ffill'`` for datetime
                columns and ``'mode'`` for everything else. Any other value
                leaves the data unchanged.

        Returns:
            A copy of ``df`` with the missing values filled.

        Raises:
            KeyError: If ``column`` is given but not present in ``df``.
        """
        df = df.copy()
        columns = list(df.columns) if column is None else [column]

        for col in columns:
            series = df[col]
            if not series.isna().any():
                continue

            # Resolve 'auto' into a per-column local. Overwriting `strategy`
            # itself would leak the first column's choice into every later
            # column (e.g. 'median' applied to a text column).
            col_strategy = strategy
            if col_strategy == 'auto':
                if pd.api.types.is_numeric_dtype(series):
                    col_strategy = 'median'
                elif pd.api.types.is_datetime64_any_dtype(series):
                    col_strategy = 'ffill'
                else:
                    col_strategy = 'mode'

            if col_strategy == 'mean':
                df[col] = series.fillna(series.mean())
            elif col_strategy == 'median':
                df[col] = series.fillna(series.median())
            elif col_strategy == 'mode':
                modes = series.mode()
                # mode() is empty only when the column has no non-null value.
                fill_value = modes.iloc[0] if not modes.empty else ''
                df[col] = series.fillna(fill_value)
            elif col_strategy == 'ffill':
                # Forward fill, then backward fill to cover leading gaps.
                # fillna(method=...) is deprecated; use the dedicated methods.
                df[col] = series.ffill().bfill()
            elif col_strategy == 'zero':
                df[col] = series.fillna(0)
            elif col_strategy == 'empty':
                df[col] = series.fillna('')

        return df

    def remove_duplicates(
        self,
        df: pd.DataFrame,
        subset: Optional[list] = None,
    ) -> pd.DataFrame:
        """Drop duplicate rows, keeping the first occurrence.

        Args:
            df: Frame to clean.
            subset: Column labels used to detect duplicates. ``None``
                compares all columns.

        Returns:
            A new frame without duplicates and with a fresh 0..n-1 index.
        """
        return df.drop_duplicates(subset=subset, keep='first').reset_index(drop=True)

    def remove_outliers(
        self,
        df: pd.DataFrame,
        column: str,
        method: str = 'IQR',
        threshold: float = 1.5,
    ) -> pd.DataFrame:
        """Remove rows whose value in ``column`` is an outlier.

        Args:
            df: Frame to clean.
            column: Numeric column to inspect.
            method: ``'IQR'`` keeps values inside
                ``[q1 - threshold * iqr, q3 + threshold * iqr]``;
                ``'zscore'`` keeps values whose absolute z-score
                (population standard deviation) is below ``threshold``.
                Matching is case-insensitive. Any other value removes nothing.
            threshold: IQR multiplier or z-score cut-off, depending on
                ``method``.

        Returns:
            A filtered copy of ``df`` with a fresh 0..n-1 index. With either
            method, rows where ``column`` is missing are dropped as well.
        """
        df = df.copy()
        values = df[column]
        method_key = method.lower()

        if method_key == 'iqr':
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - threshold * iqr
            upper_bound = q3 + threshold * iqr
            df = df[(values >= lower_bound) & (values <= upper_bound)]
        elif method_key == 'zscore':
            # Compute z-scores on the full, index-aligned series. Scoring a
            # NaN-dropped copy yields a mask shorter than the frame, which
            # cannot be used to index it.
            spread = values.std(ddof=0)
            if pd.isna(spread) or spread == 0:
                # No spread means no outliers; only drop missing values.
                keep = values.notna()
            else:
                z_scores = ((values - values.mean()) / spread).abs()
                keep = z_scores < threshold  # NaN compares False -> dropped
            df = df[keep]

        return df.reset_index(drop=True)

    def drop_column(self, df: pd.DataFrame, column: str) -> pd.DataFrame:
        """Return ``df`` without ``column``; a missing column is ignored."""
        return df.drop(columns=[column], errors='ignore')

    def clean_text_column(self, df: pd.DataFrame, column: str) -> pd.DataFrame:
        """Normalize the text values of ``column``.

        Each non-null value is converted to ``str``, stripped of every
        character other than word characters, whitespace and ``- . , @``,
        then has runs of whitespace collapsed to one space and surrounding
        whitespace trimmed. Missing values are left untouched. Letter case is
        not changed.

        Args:
            df: Frame to clean.
            column: Column holding the text.

        Returns:
            A copy of ``df`` with the cleaned column.
        """
        df = df.copy()
        special_chars = re.compile(r'[^\w\s\-.,@]')
        whitespace_runs = re.compile(r'\s+')

        def clean_text(text):
            if pd.isna(text):
                return text
            # Strip special characters first: removing them afterwards can
            # leave doubled or trailing spaces behind ("a # b" -> "a  b").
            text = special_chars.sub('', str(text))
            return whitespace_runs.sub(' ', text).strip()

        df[column] = df[column].apply(clean_text)
        return df