File size: 5,222 Bytes
f1b06d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# utils.py
# ─────────────────────────────────────────────
# Helper functions used across the project
# ─────────────────────────────────────────────

import pandas as pd
import numpy as np
from typing import Any, Dict, List, Tuple


# ── DataFrame Helpers ─────────────────────────

def df_to_records(df: pd.DataFrame) -> List[Dict[str, Any]]:
    """Convert a DataFrame to a list of row dicts (for JSON serialization).

    Missing values are emitted as ``None`` so that they serialize as JSON
    ``null`` rather than the non-standard ``NaN`` token.

    Args:
        df: Frame to convert; it is not modified.

    Returns:
        One dict per row, keyed by column label.
    """
    # Cast to object first: on a numeric column, ``where(..., None)`` may
    # coerce the replacement back to NaN, leaving the nulls in place.
    as_objects = df.astype(object)
    return as_objects.where(pd.notnull(df), None).to_dict(orient="records")


def df_summary(df: pd.DataFrame) -> Dict[str, Any]:
    """Return a quick summary of a DataFrame.

    Args:
        df: Frame to summarize; it is not modified.

    Returns:
        A dict with the keys:
            ``shape``: ``[n_rows, n_columns]``.
            ``columns``: column labels in order.
            ``dtypes``: column label -> dtype rendered with ``str()``.
            ``null_counts``: column label -> number of null cells.
            ``duplicate_rows``: number of rows flagged by ``df.duplicated()``.
    """
    return {
        "shape":          list(df.shape),
        "columns":        list(df.columns),
        "dtypes":         {col: str(dtype) for col, dtype in df.dtypes.items()},
        # Convert explicitly to builtin int so the summary never carries
        # NumPy scalars into a JSON encoder.
        "null_counts":    {col: int(n) for col, n in df.isnull().sum().items()},
        "duplicate_rows": int(df.duplicated().sum()),
    }


def get_null_counts(df: pd.DataFrame) -> Dict[str, int]:
    """Count the null cells in each column.

    Returns:
        Mapping of column label -> null count as a builtin ``int``.
    """
    per_column = df.isnull().sum()
    counts = {}
    for label, total in per_column.items():
        counts[label] = int(total)
    return counts


def get_duplicate_count(df: pd.DataFrame) -> int:
    """Count the rows that ``df.duplicated()`` flags as duplicates.

    Returns:
        The number of flagged rows as a builtin ``int``.
    """
    repeated_mask = df.duplicated()
    total = repeated_mask.sum()
    return int(total)


# ── Outlier Helpers ───────────────────────────

def detect_outliers_iqr(df: pd.DataFrame, column: str) -> pd.Series:
    """Flag outliers in ``df[column]`` with the 1.5 * IQR rule.

    A value is flagged when it lies strictly below ``Q1 - 1.5 * IQR`` or
    strictly above ``Q3 + 1.5 * IQR``.

    Args:
        df: Frame holding the data.
        column: Label of the column to inspect.

    Returns:
        Boolean Series, True where the value is an outlier.
    """
    values = df[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    too_low = values < first_quartile - 1.5 * spread
    too_high = values > third_quartile + 1.5 * spread
    return too_low | too_high


def detect_outliers_zscore(df: pd.DataFrame, column: str,
                           threshold: float = 3.0) -> pd.Series:
    """Flag outliers in ``df[column]`` using the Z-score method.

    A value is flagged when ``abs((value - mean) / std)`` is strictly greater
    than ``threshold``.

    Args:
        df: Frame holding the data.
        column: Label of the column to inspect.
        threshold: Z-score magnitude above which a value counts as an outlier.

    Returns:
        Boolean Series indexed like ``df``, True where the value is an outlier.
    """
    values = df[column]
    mean = values.mean()
    std = values.std()
    # With zero (or undefined) spread nothing can be an outlier. The all-False
    # mask must carry df's own index so it can be used to filter df.
    if std == 0 or pd.isna(std):
        return pd.Series(False, index=df.index, dtype=bool)
    return ((values - mean) / std).abs() > threshold


# ── Column Name Helpers ───────────────────────

def clean_column_name(name: str) -> str:
    """Normalize a column name to a lowercase, underscore-separated form.

    The name is trimmed and lowercased, every character outside
    ``[a-z0-9_]`` becomes an underscore, runs of underscores collapse to
    one, and leading/trailing underscores are removed. Word boundaries in
    camelCase input are not split.

    Args:
        name: Column label. Non-string labels (e.g. integer column names)
            are converted with ``str()`` first.

    Returns:
        The normalized name; may be empty if nothing survives cleaning.
    """
    import re

    # Coerce so integer/other labels don't raise AttributeError.
    text = str(name).strip().lower()
    text = re.sub(r"[^a-z0-9_]", "_", text)
    text = re.sub(r"_+", "_", text)
    return text.strip("_")


def has_bad_column_names(df: pd.DataFrame) -> List[str]:
    """Return the columns whose names are not already normalized.

    A column is reported when its label differs from
    ``clean_column_name(label)``. Non-string labels are always reported,
    since they cannot equal a normalized string name.

    Args:
        df: Frame whose column labels are checked.

    Returns:
        The offending labels, in column order.
    """
    return [
        col for col in df.columns
        # Check the type first so non-string labels never reach the cleaner.
        if not isinstance(col, str) or col != clean_column_name(col)
    ]


# ── Value Standardization ─────────────────────

def standardize_column(df: pd.DataFrame, column: str,
                       mapping: Dict[str, str]) -> pd.DataFrame:
    """Return a copy of ``df`` with values in ``column`` replaced via ``mapping``.

    Args:
        df: Source frame; it is left untouched.
        column: Label of the column to rewrite.
        mapping: Old value -> new value pairs passed to ``Series.replace``.

    Returns:
        A new DataFrame with the replacements applied to ``column``.
    """
    updated = df.copy()
    replaced = updated[column].replace(mapping)
    updated[column] = replaced
    return updated


def get_value_counts(df: pd.DataFrame, column: str) -> Dict[str, int]:
    """Tally how often each value occurs in ``df[column]``.

    Returns:
        Mapping of value -> occurrence count, as produced by
        ``Series.value_counts()``.
    """
    selected = df[column]
    frequencies = selected.value_counts()
    return frequencies.to_dict()


# ── Scoring Helpers ───────────────────────────

def compute_null_score(original_df: pd.DataFrame,
                       current_df: pd.DataFrame) -> float:
    """Score the share of the original null cells that have been fixed.

    Args:
        original_df: Frame as it was before cleaning.
        current_df: Frame after cleaning.

    Returns:
        ``(original nulls - current nulls) / original nulls``, floored at 0.0
        and rounded to 4 decimals; 1.0 when the original had no nulls.
    """
    baseline = original_df.isnull().sum().sum()
    if baseline == 0:
        # Nothing needed fixing, so the score is perfect by definition.
        return 1.0

    remaining = current_df.isnull().sum().sum()
    ratio = (baseline - remaining) / baseline
    # More nulls than at the start must not yield a negative score.
    floored = ratio if ratio > 0.0 else 0.0
    return round(floored, 4)


def compute_duplicate_score(original_df: pd.DataFrame,
                            current_df: pd.DataFrame) -> float:
    """Score the share of the original duplicate rows that have been removed.

    Args:
        original_df: Frame as it was before cleaning.
        current_df: Frame after cleaning.

    Returns:
        ``(original duplicates - current duplicates) / original duplicates``,
        floored at 0.0 and rounded to 4 decimals; 1.0 when the original had
        no duplicate rows.
    """
    dups_before = original_df.duplicated().sum()
    if dups_before == 0:
        return 1.0

    dups_after = current_df.duplicated().sum()
    removed_share = (dups_before - dups_after) / dups_before
    if removed_share > 0.0:
        return round(removed_share, 4)
    # No progress, or more duplicates than before: floor at zero.
    return round(0.0, 4)


def compute_dtype_score(df: pd.DataFrame,
                        expected: Dict[str, str]) -> float:
    """Score the fraction of expected columns that have the expected dtype.

    Args:
        df: Frame to check.
        expected: Column label -> dtype name, compared against
            ``str(df[col].dtype)``.

    Returns:
        Matching columns divided by ``len(expected)``, rounded to 4 decimals.
        A column missing from ``df`` counts as a mismatch. Returns 1.0 when
        ``expected`` is empty.
    """
    if not expected:
        return 1.0

    matched = 0
    for col, wanted in expected.items():
        if col not in df.columns:
            continue
        if str(df[col].dtype) == wanted:
            matched += 1
    return round(matched / len(expected), 4)


# ── Misc ──────────────────────────────────────

def clamp(value: float, min_val: float = 0.0,
          max_val: float = 1.0) -> float:
    """Restrict ``value`` to the range ``[min_val, max_val]``.

    Args:
        value: Number to restrict.
        min_val: Lower bound of the range.
        max_val: Upper bound of the range.

    Returns:
        ``value`` if it lies strictly inside the bounds, otherwise the bound
        it reached or crossed.
    """
    # Apply the upper bound first, then the lower bound.
    capped = value if value < max_val else max_val
    return capped if capped > min_val else min_val


def format_score(score: float) -> str:
    """Render a score as text with exactly three decimal places."""
    return format(score, ".3f")