Spaces:

sairaj2
/

openenv-datacleaner

Sleeping

App Files Files Community

openenv-datacleaner / utils /cleaners.py

sairaj2

Upload folder using huggingface_hub

188937b verified 12 days ago

raw

history blame contribute delete

3.15 kB

	import pandas as pd
	import numpy as np
	import re
	from typing import Optional, Union


	class DataCleaners:
	    """Collection of DataFrame cleaning operations.

	    Every method returns a new DataFrame and leaves the one passed in
	    unmodified.
	    """

	    def fill_missing_values(self, df: pd.DataFrame, column: Optional[str] = None, strategy: str = 'auto') -> pd.DataFrame:
	        """Fill missing values using the specified strategy.

	        Args:
	            df: Frame to fill; it is copied, not modified.
	            column: Single column to fill, or None to process every column.
	            strategy: One of 'auto', 'mean', 'median', 'mode', 'ffill',
	                'zero' or 'empty'. 'auto' picks 'median' for numeric
	                columns, 'ffill' for datetime columns and 'mode' otherwise.
	                Any other value leaves the columns unchanged.

	        Returns:
	            A copy of ``df`` with the selected columns filled.
	        """
	        df = df.copy()
	        columns = df.columns if column is None else [column]

	        for col in columns:
	            series = df[col]
	            if not series.isna().any():
	                continue

	            # Resolve 'auto' into a per-column local: overwriting ``strategy``
	            # itself would make the first column's choice leak into every
	            # later column (e.g. 'median' applied to a text column).
	            col_strategy = strategy
	            if col_strategy == 'auto':
	                if pd.api.types.is_numeric_dtype(series):
	                    col_strategy = 'median'
	                elif pd.api.types.is_datetime64_any_dtype(series):
	                    col_strategy = 'ffill'
	                else:
	                    col_strategy = 'mode'

	            if col_strategy == 'mean':
	                df[col] = series.fillna(series.mean())
	            elif col_strategy == 'median':
	                df[col] = series.fillna(series.median())
	            elif col_strategy == 'mode':
	                modes = series.mode()
	                # mode() is empty when the column holds no values at all.
	                df[col] = series.fillna(modes.iloc[0] if not modes.empty else '')
	            elif col_strategy == 'ffill':
	                # Forward fill first, then backward fill to cover leading gaps.
	                # (fillna(method=...) is deprecated in recent pandas.)
	                df[col] = series.ffill().bfill()
	            elif col_strategy == 'zero':
	                df[col] = series.fillna(0)
	            elif col_strategy == 'empty':
	                df[col] = series.fillna('')

	        return df

	    def remove_duplicates(self, df: pd.DataFrame, subset: Optional[list] = None) -> pd.DataFrame:
	        """Remove duplicate rows, keeping the first occurrence.

	        Args:
	            df: Frame to de-duplicate.
	            subset: Column labels to compare; None compares all columns.

	        Returns:
	            A new frame without the duplicates and with a fresh 0..n-1 index.
	        """
	        return df.drop_duplicates(subset=subset, keep='first').reset_index(drop=True)

	    def remove_outliers(self, df: pd.DataFrame, column: str, method: str = 'IQR', threshold: float = 1.5) -> pd.DataFrame:
	        """Remove rows whose value in a numeric column is an outlier.

	        Rows where ``column`` is missing are removed as well, for both
	        methods, because a missing value cannot pass the range test.

	        Args:
	            df: Frame to filter; it is copied, not modified.
	            column: Numeric column to test.
	            method: 'IQR' keeps values within
	                ``[q1 - threshold * iqr, q3 + threshold * iqr]``;
	                'zscore' keeps values whose absolute z-score (population
	                standard deviation) is below ``threshold``. Any other value
	                only resets the index.
	            threshold: IQR multiplier or z-score cut-off, depending on
	                ``method``.

	        Returns:
	            The filtered frame with a fresh 0..n-1 index.
	        """
	        df = df.copy()
	        values = df[column]

	        if method == 'IQR':
	            q1 = values.quantile(0.25)
	            q3 = values.quantile(0.75)
	            iqr = q3 - q1
	            lower_bound = q1 - threshold * iqr
	            upper_bound = q3 + threshold * iqr
	            df = df[(values >= lower_bound) & (values <= upper_bound)]

	        elif method == 'zscore':
	            # Compute z-scores on the full column (pandas skips NaN in the
	            # mean/std) so the mask stays aligned with the frame's rows.
	            mean = values.mean()
	            std = values.std(ddof=0)
	            if pd.isna(std) or std == 0:
	                # No spread means no outliers; only missing values are dropped.
	                keep = values.notna()
	            else:
	                keep = ((values - mean) / std).abs() < threshold
	            df = df[keep]

	        return df.reset_index(drop=True)

	    def drop_column(self, df: pd.DataFrame, column: str) -> pd.DataFrame:
	        """Drop the specified column; a column that does not exist is ignored."""
	        return df.drop(columns=[column], errors='ignore')

	    def clean_text_column(self, df: pd.DataFrame, column: str) -> pd.DataFrame:
	        """Clean text values: remove special characters and tidy whitespace.

	        Each non-missing value is converted to ``str``; every character that
	        is not a word character, whitespace or one of ``- . , @`` is removed;
	        runs of whitespace are collapsed to a single space and the result is
	        trimmed. Missing values are left as they are. Letter case is not
	        changed.

	        Args:
	            df: Frame to clean; it is copied, not modified.
	            column: Column whose values are cleaned.

	        Returns:
	            A copy of ``df`` with ``column`` cleaned.
	        """
	        df = df.copy()

	        # Compiled once per call rather than looked up for every cell.
	        disallowed = re.compile(r'[^\w\s\-.,@]')
	        whitespace = re.compile(r'\s+')

	        def clean_text(text):
	            if pd.isna(text):
	                return text
	            # Strip characters first so the gaps they leave behind are
	            # collapsed and trimmed by the whitespace pass.
	            text = disallowed.sub('', str(text))
	            return whitespace.sub(' ', text).strip()

	        df[column] = df[column].apply(clean_text)
	        return df