Spaces:

Yashwanth34567
/

data-cleaning-env

Sleeping

App Files Files Community

data-cleaning-env / utils.py

Yashwanth34567

Upload folder using huggingface_hub

f1b06d6 verified about 2 months ago

raw

history blame contribute delete

5.22 kB

	# utils.py
	# ─────────────────────────────────────────────
	# Helper functions used across the project
	# ─────────────────────────────────────────────

	import pandas as pd
	import numpy as np
	from typing import Any, Dict, List, Tuple


	# ── DataFrame Helpers ─────────────────────────

	def df_to_records(df: pd.DataFrame) -> List[Dict[str, Any]]:
	    """Convert a DataFrame to a list of row dicts (for JSON serialization).

	    Every missing value (NaN / NaT / None) is emitted as ``None`` so the
	    result serializes to JSON ``null`` rather than the invalid token ``NaN``.

	    Args:
	        df: The DataFrame to convert.

	    Returns:
	        One dict per row, keyed by column label.
	    """
	    # Cast to object first: on a float column, ``where(..., None)`` coerces
	    # the replacement back to NaN, so the nulls would survive untouched.
	    as_objects = df.astype(object)
	    return as_objects.where(df.notna(), None).to_dict(orient="records")


	def df_summary(df: pd.DataFrame) -> Dict[str, Any]:
	    """Return a quick, JSON-friendly summary of a DataFrame.

	    Args:
	        df: The DataFrame to summarize.

	    Returns:
	        A dict with the keys ``shape`` ([rows, cols]), ``columns`` (list of
	        labels), ``dtypes`` (label -> dtype name), ``null_counts``
	        (label -> number of nulls) and ``duplicate_rows`` (number of rows
	        that repeat an earlier row).
	    """
	    # Coerce counts to builtin int so the summary never carries numpy scalars.
	    null_counts = {col: int(count) for col, count in df.isnull().sum().items()}
	    return {
	        "shape": list(df.shape),
	        "columns": list(df.columns),
	        "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
	        "null_counts": null_counts,
	        "duplicate_rows": int(df.duplicated().sum()),
	    }


	def get_null_counts(df: pd.DataFrame) -> Dict[str, int]:
	    """Return the number of null values in each column.

	    Args:
	        df: The DataFrame to inspect.

	    Returns:
	        A dict mapping each column label to its null count (builtin ``int``).
	    """
	    per_column = df.isnull().sum()
	    return {col: int(count) for col, count in per_column.items()}


	def get_duplicate_count(df: pd.DataFrame) -> int:
	    """Return the number of duplicate rows.

	    A row counts as a duplicate when it repeats an earlier row, so the first
	    occurrence of each distinct row is not counted.

	    Args:
	        df: The DataFrame to inspect.

	    Returns:
	        The duplicate-row count as a builtin ``int``.
	    """
	    duplicate_mask = df.duplicated()
	    return int(duplicate_mask.sum())


	# ── Outlier Helpers ───────────────────────────

	def detect_outliers_iqr(df: pd.DataFrame, column: str) -> pd.Series:
	    """Return a boolean mask of outliers using the IQR method.

	    A value is an outlier when it lies more than 1.5 * IQR below the first
	    quartile or above the third quartile.

	    Args:
	        df: The DataFrame holding the data.
	        column: Label of the numeric column to test.

	    Returns:
	        A boolean Series aligned with ``df.index``; True marks an outlier.
	    """
	    values = df[column]
	    q1 = values.quantile(0.25)
	    q3 = values.quantile(0.75)
	    iqr = q3 - q1
	    lower = q1 - 1.5 * iqr
	    upper = q3 + 1.5 * iqr
	    # Element-wise OR; parentheses are required because ``|`` binds tighter
	    # than the comparisons.
	    return (values < lower) | (values > upper)


	def detect_outliers_zscore(df: pd.DataFrame, column: str,
	                           threshold: float = 3.0) -> pd.Series:
	    """Return a boolean mask of outliers using the Z-score method.

	    Args:
	        df: The DataFrame holding the data.
	        column: Label of the numeric column to test.
	        threshold: Absolute Z-score above which a value is an outlier.

	    Returns:
	        A boolean Series aligned with ``df.index``; True marks an outlier.
	        When the standard deviation is zero or undefined (constant column,
	        fewer than two values), no value is flagged.
	    """
	    values = df[column]
	    std = values.std()
	    if pd.isna(std) or std == 0:
	        # Build the all-False mask on df.index: a default RangeIndex would
	        # not align with frames that were filtered or re-indexed.
	        return pd.Series(False, index=df.index, name=column, dtype=bool)
	    z_scores = (values - values.mean()) / std
	    return z_scores.abs() > threshold


	# ── Column Name Helpers ───────────────────────

	def clean_column_name(name: str) -> str:
	    """Normalize a column name to lowercase, underscore-separated form.

	    The name is stripped and lowercased, every run of characters outside
	    ``a-z0-9`` becomes a single underscore, and leading/trailing underscores
	    are removed. Word boundaries inside camelCase names are not split.

	    Args:
	        name: The column label. Non-string labels (e.g. integers) are
	            converted with ``str()`` first.

	    Returns:
	        The normalized name. It may be empty when the input contains no
	        ASCII letters or digits.
	    """
	    import re

	    lowered = str(name).strip().lower()
	    # One pass: a run of separators or underscores collapses to one "_".
	    collapsed = re.sub(r"[^a-z0-9]+", "_", lowered)
	    return collapsed.strip("_")


	def has_bad_column_names(df: pd.DataFrame) -> List[str]:
	    """Return the columns whose names are not already normalized.

	    A name is "bad" when it differs from ``clean_column_name(name)``.
	    Non-string labels (e.g. integer column names) are always reported as bad
	    rather than being passed on for string normalization.

	    Args:
	        df: The DataFrame whose column labels are checked.

	    Returns:
	        The offending labels, in column order.
	    """
	    return [
	        col for col in df.columns
	        if not isinstance(col, str) or col != clean_column_name(col)
	    ]


	# ── Value Standardization ─────────────────────

	def standardize_column(df: pd.DataFrame, column: str,
	                       mapping: Dict[str, str]) -> pd.DataFrame:
	    """Replace values in a column using a mapping dict.

	    Args:
	        df: The source DataFrame; it is not modified.
	        column: Label of the column to standardize.
	        mapping: Maps each value to replace to its replacement. Values not
	            present in the mapping are left unchanged.

	    Returns:
	        A copy of ``df`` with the replacements applied to ``column``.
	    """
	    result = df.copy()
	    result[column] = result[column].replace(mapping)
	    return result


	def get_value_counts(df: pd.DataFrame, column: str) -> Dict[str, int]:
	    """Return how often each distinct value occurs in a column.

	    Null values are not counted (the pandas ``value_counts`` default).

	    Args:
	        df: The DataFrame holding the data.
	        column: Label of the column to count.

	    Returns:
	        A dict mapping each distinct value to its count (builtin ``int``),
	        most frequent first. Keys keep the column's own value type.
	    """
	    counts = df[column].value_counts()
	    return {value: int(count) for value, count in counts.items()}


	# ── Scoring Helpers ───────────────────────────

	def compute_null_score(original_df: pd.DataFrame,
	                       current_df: pd.DataFrame) -> float:
	    """Score how many of the original nulls have been fixed (0.0 to 1.0).

	    Args:
	        original_df: The DataFrame before cleaning.
	        current_df: The DataFrame after cleaning.

	    Returns:
	        The fraction of original nulls no longer present, rounded to four
	        decimals. Returns 1.0 when the original had no nulls, and 0.0 when
	        the current frame has as many or more nulls than the original.
	    """
	    # int() keeps the arithmetic in builtin numbers, so the score is a plain
	    # float rather than a numpy scalar.
	    original_nulls = int(original_df.isnull().sum().sum())
	    if original_nulls == 0:
	        return 1.0
	    current_nulls = int(current_df.isnull().sum().sum())
	    fixed = original_nulls - current_nulls
	    return round(max(0.0, fixed / original_nulls), 4)


	def compute_duplicate_score(original_df: pd.DataFrame,
	                            current_df: pd.DataFrame) -> float:
	    """Score how many of the original duplicate rows were removed (0.0 to 1.0).

	    Args:
	        original_df: The DataFrame before cleaning.
	        current_df: The DataFrame after cleaning.

	    Returns:
	        The fraction of original duplicate rows no longer present, rounded
	        to four decimals. Returns 1.0 when the original had no duplicates,
	        and 0.0 when the current frame has as many or more duplicates.
	    """
	    # int() keeps the arithmetic in builtin numbers, so the score is a plain
	    # float rather than a numpy scalar.
	    original_dups = int(original_df.duplicated().sum())
	    if original_dups == 0:
	        return 1.0
	    current_dups = int(current_df.duplicated().sum())
	    fixed = original_dups - current_dups
	    return round(max(0.0, fixed / original_dups), 4)


	def compute_dtype_score(df: pd.DataFrame,
	                        expected: Dict[str, str]) -> float:
	    """Score how many columns have the expected dtype (0.0 to 1.0).

	    Args:
	        df: The DataFrame to check.
	        expected: Maps column label to the expected dtype name, compared
	            against ``str(df[col].dtype)`` (e.g. ``"int64"``).

	    Returns:
	        The fraction of expected columns that exist in ``df`` with exactly
	        the expected dtype, rounded to four decimals. A missing column
	        counts as incorrect. Returns 1.0 when ``expected`` is empty.
	    """
	    if not expected:
	        return 1.0
	    correct = sum(
	        col in df.columns and str(df[col].dtype) == dtype
	        for col, dtype in expected.items()
	    )
	    return round(correct / len(expected), 4)


	# ── Misc ──────────────────────────────────────

	def clamp(value: float, min_val: float = 0.0,
	          max_val: float = 1.0) -> float:
	    """Clamp a number to the closed range [min_val, max_val].

	    Args:
	        value: The number to clamp.
	        min_val: Lower bound of the range.
	        max_val: Upper bound of the range.

	    Returns:
	        ``min_val`` if ``value`` is below it, ``max_val`` if ``value`` is
	        above it, otherwise ``value`` itself.
	    """
	    capped = min(max_val, value)
	    return max(min_val, capped)


	def format_score(score: float) -> str:
	    """Format a score for display with exactly three decimal places.

	    Args:
	        score: The numeric score to format.

	    Returns:
	        The score as fixed-point text, e.g. ``0.5`` becomes ``"0.500"``.
	    """
	    return f"{score:.3f}"