# utils.py
# ─────────────────────────────────────────────
# Helper functions used across the project
# ─────────────────────────────────────────────

import re
from typing import Any, Dict, List, Tuple

import numpy as np
import pandas as pd

# Precompiled once: anything that is not a lowercase letter, digit or "_".
_NON_SNAKE_CHARS = re.compile(r"[^a-z0-9_]")
# Runs of consecutive underscores, collapsed to a single one.
_UNDERSCORE_RUNS = re.compile(r"_+")


# ── DataFrame Helpers ─────────────────────────

def df_to_records(df: pd.DataFrame) -> List[Dict[str, Any]]:
    """Convert a DataFrame to a list of row dicts (for JSON serialization).

    Missing values (NaN / None / NaT) are emitted as ``None`` so they
    serialize as JSON ``null``.

    Args:
        df: Frame to convert.

    Returns:
        One ``{column: value}`` dict per row, in row order.
    """
    # Cast to object first: a float column cannot hold None (pandas coerces
    # it straight back to NaN), so without the cast NaN would leak through.
    return df.astype(object).where(df.notna(), None).to_dict(orient="records")


def df_summary(df: pd.DataFrame) -> Dict[str, Any]:
    """Return a quick summary of a DataFrame.

    Args:
        df: Frame to summarize.

    Returns:
        Dict with keys ``shape`` ([rows, cols]), ``columns``, ``dtypes``
        (column -> dtype name), ``null_counts`` (column -> int) and
        ``duplicate_rows`` (int).
    """
    return {
        "shape": list(df.shape),
        "columns": list(df.columns),
        "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
        "null_counts": get_null_counts(df),
        "duplicate_rows": get_duplicate_count(df),
    }


def get_null_counts(df: pd.DataFrame) -> Dict[str, int]:
    """Return the number of null values per column as plain ints."""
    return {col: int(count) for col, count in df.isnull().sum().items()}


def get_duplicate_count(df: pd.DataFrame) -> int:
    """Return the number of rows that duplicate an earlier row."""
    return int(df.duplicated().sum())


# ── Outlier Helpers ───────────────────────────

def detect_outliers_iqr(
    df: pd.DataFrame, column: str, multiplier: float = 1.5
) -> pd.Series:
    """Return a boolean mask of outliers using the IQR method.

    A value is an outlier when it lies more than ``multiplier * IQR`` below
    the first quartile or above the third quartile.

    Args:
        df: Source frame.
        column: Name of the numeric column to inspect.
        multiplier: Fence width in units of IQR (1.5 by default).

    Returns:
        Boolean Series aligned with ``df.index``; True marks an outlier.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = multiplier * (q3 - q1)
    return (values < q1 - spread) | (values > q3 + spread)


def detect_outliers_zscore(
    df: pd.DataFrame, column: str, threshold: float = 3.0
) -> pd.Series:
    """Return a boolean mask of outliers using the Z-score method.

    Args:
        df: Source frame.
        column: Name of the numeric column to inspect.
        threshold: Absolute Z-score above which a value is an outlier.

    Returns:
        Boolean Series aligned with ``df.index``; True marks an outlier.
        All False when the column has zero or undefined standard deviation.
    """
    values = df[column]
    std = values.std()
    if pd.isna(std) or std == 0:
        # No spread (or too few values): nothing can be an outlier. Keep the
        # caller's index so the mask can be used to index ``df`` directly.
        return pd.Series(False, index=df.index)
    return ((values - values.mean()) / std).abs() > threshold


# ── Column Name Helpers ───────────────────────

def clean_column_name(name: Any) -> str:
    """Normalize a column name to lowercase, underscore-separated form.

    Surrounding whitespace is stripped, every character outside
    ``[a-z0-9_]`` becomes ``_``, runs of ``_`` are collapsed, and leading
    or trailing ``_`` are removed. Non-string labels are converted with
    ``str()`` first.
    """
    lowered = str(name).strip().lower()
    replaced = _NON_SNAKE_CHARS.sub("_", lowered)
    return _UNDERSCORE_RUNS.sub("_", replaced).strip("_")


def has_bad_column_names(df: pd.DataFrame) -> List[Any]:
    """Return the column labels that differ from their cleaned form.

    Non-string labels are always reported, since the cleaned form is a str.
    """
    return [col for col in df.columns if col != clean_column_name(col)]


# ── Value Standardization ─────────────────────

def standardize_column(
    df: pd.DataFrame, column: str, mapping: Dict[str, str]
) -> pd.DataFrame:
    """Replace values in a column using a mapping dict.

    Args:
        df: Source frame; it is not modified.
        column: Column whose values are replaced.
        mapping: ``{old_value: new_value}`` pairs.

    Returns:
        A copy of ``df`` with the replacements applied to ``column``.
    """
    result = df.copy()
    result[column] = result[column].replace(mapping)
    return result


def get_value_counts(df: pd.DataFrame, column: str) -> Dict[Any, int]:
    """Return ``{value: occurrences}`` for a column (nulls are not counted)."""
    return {value: int(n) for value, n in df[column].value_counts().items()}


# ── Scoring Helpers ───────────────────────────

def _improvement_ratio(before: int, after: int) -> float:
    """Return the fraction of ``before`` issues that are gone, floored at 0."""
    if before == 0:
        # Nothing to fix counts as a perfect score.
        return 1.0
    return round(max(0.0, (before - after) / before), 4)


def compute_null_score(original_df: pd.DataFrame, current_df: pd.DataFrame) -> float:
    """Score how many nulls have been fixed (0.0 to 1.0).

    Returns 1.0 when the original had no nulls, and 0.0 when the current
    frame has as many or more nulls than the original.
    """
    return _improvement_ratio(
        int(original_df.isnull().sum().sum()),
        int(current_df.isnull().sum().sum()),
    )


def compute_duplicate_score(
    original_df: pd.DataFrame, current_df: pd.DataFrame
) -> float:
    """Score how many duplicate rows have been removed (0.0 to 1.0).

    Returns 1.0 when the original had no duplicates, and 0.0 when the
    current frame has as many or more duplicates than the original.
    """
    return _improvement_ratio(
        int(original_df.duplicated().sum()),
        int(current_df.duplicated().sum()),
    )


def compute_dtype_score(df: pd.DataFrame, expected: Dict[str, str]) -> float:
    """Score the fraction of expected columns that have the expected dtype.

    Args:
        df: Frame to check.
        expected: ``{column: dtype name}`` as produced by ``str(dtype)``.

    Returns:
        Value in [0.0, 1.0]; 1.0 when ``expected`` is empty. Columns
        missing from ``df`` count as incorrect.
    """
    if not expected:
        return 1.0
    correct = sum(
        1
        for col, dtype in expected.items()
        if col in df.columns and str(df[col].dtype) == dtype
    )
    return round(correct / len(expected), 4)


# ── Misc ──────────────────────────────────────

def clamp(value: float, min_val: float = 0.0, max_val: float = 1.0) -> float:
    """Clamp a float between ``min_val`` and ``max_val`` (inclusive)."""
    return max(min_val, min(max_val, value))


def format_score(score: float) -> str:
    """Format a score for display with three decimal places."""
    return f"{score:.3f}"