Spaces:
Sleeping
Sleeping
File size: 5,222 Bytes
f1b06d6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 | # utils.py
# ─────────────────────────────────────────────
# Helper functions used across the project
# ─────────────────────────────────────────────
import pandas as pd
import numpy as np
from typing import Any, Dict, List, Tuple
# ── DataFrame Helpers ─────────────────────────
def df_to_records(df: pd.DataFrame) -> List[Dict[str, Any]]:
    """Convert a DataFrame to a list of row dicts (for JSON serialization).

    Missing values (as reported by ``pd.notnull``) are emitted as ``None``
    so they serialize as JSON ``null``.

    Args:
        df: The frame to convert.

    Returns:
        One dict per row, keyed by column label.
    """
    # Cast to object before masking: on recent pandas versions, calling
    # ``where(..., None)`` on a float column coerces the None back to NaN,
    # which would leak a non-JSON value into the records.
    as_objects = df.astype(object)
    return as_objects.where(pd.notnull(df), None).to_dict(orient="records")
def df_summary(df: pd.DataFrame) -> Dict[str, Any]:
    """Build a small overview dict describing a DataFrame.

    The result has the keys ``shape``, ``columns``, ``dtypes``,
    ``null_counts`` and ``duplicate_rows``.
    """
    # Map each column label to the string form of its dtype.
    dtype_names = {}
    for label, dtype in df.dtypes.items():
        dtype_names[label] = str(dtype)

    nulls_per_column = df.isnull().sum().to_dict()
    repeated_rows = int(df.duplicated().sum())

    summary: Dict[str, Any] = {}
    summary["shape"] = list(df.shape)
    summary["columns"] = list(df.columns)
    summary["dtypes"] = dtype_names
    summary["null_counts"] = nulls_per_column
    summary["duplicate_rows"] = repeated_rows
    return summary
def get_null_counts(df: pd.DataFrame) -> Dict[str, int]:
    """Count the missing values in each column, as plain Python ints."""
    per_column = df.isnull().sum()
    counts: Dict[str, int] = {}
    for label, n_missing in per_column.items():
        counts[label] = int(n_missing)
    return counts
def get_duplicate_count(df: pd.DataFrame) -> int:
    """Count the rows flagged by ``DataFrame.duplicated()``."""
    repeated_mask = df.duplicated()
    total = repeated_mask.sum()
    return int(total)
# ── Outlier Helpers ───────────────────────────
def detect_outliers_iqr(df: pd.DataFrame, column: str) -> pd.Series:
    """Flag values of ``column`` lying outside the 1.5 * IQR fences.

    Returns a boolean Series that is True where a value is strictly below
    ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR``.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    low_fence = first_quartile - 1.5 * spread
    high_fence = third_quartile + 1.5 * spread

    below = values.lt(low_fence)
    above = values.gt(high_fence)
    return below | above
def detect_outliers_zscore(df: pd.DataFrame, column: str,
                           threshold: float = 3.0) -> pd.Series:
    """Return a boolean mask of outliers using the Z-score method.

    Args:
        df: Frame containing the column to inspect.
        column: Label of the column to score.
        threshold: A value is flagged when the absolute value of its
            Z-score is strictly greater than this.

    Returns:
        Boolean Series indexed like ``df``; True marks an outlier.
    """
    values = df[column]
    mean = values.mean()
    std = values.std()
    if std == 0:
        # Constant column: nothing can be an outlier. Build the mask on
        # df.index (not a fresh 0..n-1 index) so it aligns with the frame
        # and can be used directly for boolean indexing.
        return pd.Series(False, index=df.index, name=column, dtype=bool)
    return ((values - mean) / std).abs() > threshold
# ── Column Name Helpers ───────────────────────
def clean_column_name(name: str) -> str:
    """Normalize a column name to lowercase words joined by underscores.

    The name is trimmed and lowercased, every run of characters other than
    ``a-z`` and ``0-9`` becomes a single underscore, and leading/trailing
    underscores are removed.
    """
    import re

    lowered = name.strip().lower()
    # A single pass: a run of non-alphanumerics (underscores included)
    # collapses to exactly one underscore.
    collapsed = re.sub(r"[^a-z0-9]+", "_", lowered)
    return collapsed.strip("_")
def has_bad_column_names(df: pd.DataFrame) -> List[str]:
    """List the columns whose name differs from its cleaned form.

    A column is reported when ``clean_column_name`` would change its name;
    the result keeps the frame's column order.
    """
    return [col for col in df.columns if col != clean_column_name(col)]
# ── Value Standardization ─────────────────────
def standardize_column(df: pd.DataFrame, column: str,
                       mapping: Dict[str, str]) -> pd.DataFrame:
    """Return a copy of ``df`` with ``column`` values replaced via ``mapping``.

    The input frame is left unmodified; values not present in ``mapping``
    are kept as they are.
    """
    replaced = df[column].replace(mapping)
    result = df.copy()
    result[column] = replaced
    return result
def get_value_counts(df: pd.DataFrame, column: str) -> Dict[str, int]:
    """Map each distinct value in ``column`` to its number of occurrences."""
    frequencies = df[column].value_counts()
    return frequencies.to_dict()
# ── Scoring Helpers ───────────────────────────
def compute_null_score(original_df: pd.DataFrame,
                       current_df: pd.DataFrame) -> float:
    """Fraction of the original nulls that are no longer present.

    Returns 1.0 when the original frame had no nulls. Otherwise the result
    is floored at 0.0 (when the current frame has more nulls than the
    original) and rounded to 4 decimal places.
    """
    baseline = original_df.isna().sum().sum()
    if baseline == 0:
        return 1.0

    remaining = current_df.isna().sum().sum()
    ratio = (baseline - remaining) / baseline
    return round(max(0.0, ratio), 4)
def compute_duplicate_score(original_df: pd.DataFrame,
                            current_df: pd.DataFrame) -> float:
    """Fraction of the original duplicate rows that have been removed.

    Returns 1.0 when the original frame had no duplicates. Otherwise the
    result is floored at 0.0 (when the current frame has more duplicates
    than the original) and rounded to 4 decimal places.
    """
    baseline = original_df.duplicated().sum()
    if baseline == 0:
        return 1.0

    remaining = current_df.duplicated().sum()
    ratio = (baseline - remaining) / baseline
    return round(max(0.0, ratio), 4)
def compute_dtype_score(df: pd.DataFrame,
                        expected: Dict[str, str]) -> float:
    """Fraction of expected columns that exist with the expected dtype.

    ``expected`` maps column name to dtype string; a column counts as
    correct when it is present in ``df`` and ``str(dtype)`` matches.
    Returns 1.0 when ``expected`` is empty; otherwise the ratio is rounded
    to 4 decimal places.
    """
    if not expected:
        return 1.0

    matches = 0
    for col, wanted in expected.items():
        if col not in df.columns:
            continue
        if str(df[col].dtype) == wanted:
            matches += 1
    return round(matches / len(expected), 4)
# ── Misc ──────────────────────────────────────
def clamp(value: float, min_val: float = 0.0,
          max_val: float = 1.0) -> float:
    """Restrict ``value`` to the range ``[min_val, max_val]``."""
    # Apply the upper bound: keep value only when it is strictly below it.
    capped = value if value < max_val else max_val
    # Apply the lower bound: keep the capped value only when strictly above it.
    return capped if capped > min_val else min_val
def format_score(score: float) -> str:
    """Render a score as fixed-point text with three decimal places."""
    return format(score, ".3f")