Spaces:
Sleeping
Sleeping
| """Profile the dataset: shape, target balance, missingness, degenerate columns, | |
| and single-feature leakage suspects. Writes reports/data_profile.md.""" | |
| from __future__ import annotations | |
| import numpy as np | |
| import pandas as pd | |
| from sklearn.metrics import roc_auc_score | |
| from src import config | |
| from src.data.load import load_raw, split_xy | |
def _leakage_suspects(X: pd.DataFrame, y: pd.Series, top: int = 15) -> pd.DataFrame:
    """Score every numeric feature by how well it alone separates the target.

    Each numeric column of ``X`` is median-imputed and passed to
    ``roc_auc_score`` as the score against ``y``. The reported value is
    ``max(auc, 1 - auc)``, so a feature that ranks the classes in either
    direction scores high. Columns with fewer than 50 non-null values or
    fewer than 2 distinct non-null values are skipped, as are columns for
    which ``roc_auc_score`` raises ``ValueError``.

    Args:
        X: Feature frame; only columns selected by
            ``select_dtypes(include="number")`` are scored.
        y: Target values, used positionally (via ``y.values``).
        top: Maximum number of rows to return.

    Returns:
        A frame with columns ``feature`` and ``single_feature_auc``, sorted
        by ``single_feature_auc`` descending and truncated to ``top`` rows.
    """
    labels = y.values
    scored = []
    for name in X.select_dtypes(include="number").columns:
        column = X[name]
        # Too few observations or no variation: an AUC would be meaningless.
        has_enough_data = column.notna().sum() >= 50
        if not (has_enough_data and column.nunique(dropna=True) >= 2):
            continue
        imputed = column.fillna(column.median()).values
        try:
            raw_auc = roc_auc_score(labels, imputed)
        except ValueError:
            # Best-effort ranking: skip columns the metric cannot score.
            continue
        # Fold around 0.5 so inversely-related features rank high as well.
        scored.append((name, max(raw_auc, 1 - raw_auc)))
    ranking = pd.DataFrame(scored, columns=["feature", "single_feature_auc"])
    ranking = ranking.sort_values("single_feature_auc", ascending=False)
    return ranking.head(top)
def _count_duplicate_columns(X: pd.DataFrame, sample_rows: int = 2000) -> int:
    """Count columns whose leading rows exactly repeat an earlier column.

    Only the first ``sample_rows`` rows are compared (for speed), so the
    result is an estimate: columns that differ only beyond the sample are
    still counted as duplicates.

    Args:
        X: Feature frame to scan.
        sample_rows: Number of leading rows to compare.

    Returns:
        Number of columns whose sampled content matches a previously seen
        column.
    """
    import hashlib  # local import: only this helper needs it

    # Fill NaNs with a sentinel so missing cells compare equal across columns.
    sample = X.head(sample_rows).fillna(-999_999)
    seen = set()
    duplicates = 0
    for col in sample.columns:
        row_hashes = pd.util.hash_pandas_object(sample[col], index=False)
        # Digest the per-row hashes *in order*. Summing them (as was done
        # before) is order-insensitive, so distinct columns holding the same
        # multiset of values (e.g. two binary flags with equally many ones)
        # collided and were miscounted as duplicates.
        digest = hashlib.blake2b(
            row_hashes.to_numpy().tobytes(), digest_size=16
        ).digest()
        if digest in seen:
            duplicates += 1
        else:
            seen.add(digest)
    return duplicates


def main() -> None:
    """Profile the raw dataset and write ``data_profile.md`` to the reports dir.

    Loads the data via ``load_raw`` / ``split_xy``, computes shape, class
    balance, per-feature missingness, constant / two-valued / duplicate
    column counts, single-feature AUC leakage suspects, and missingness of
    the ``config.KNOWN_IMPORTANT`` features present in the data. The report
    is written as UTF-8 Markdown under ``config.REPORTS_DIR`` and a one-line
    summary is printed.
    """
    config.ensure_dirs()
    df = load_raw()
    X, y = split_xy(df)

    # --- shape & target balance ---
    n_rows, n_feats = X.shape
    pos = int(y.sum())
    neg = int((y == 0).sum())
    # Guard the empty-dataset case instead of raising ZeroDivisionError.
    pos_rate = pos / len(y) if len(y) else 0.0

    # --- missingness ---
    miss = X.isna().mean().sort_values(ascending=False)
    high_miss_90 = int((miss > 0.90).sum())
    high_miss_50 = int((miss > 0.50).sum())

    # --- degenerate columns ---
    nunique = X.nunique(dropna=True)
    constant = int((nunique <= 1).sum())
    # Constants are excluded here, so this counts columns with exactly two
    # distinct non-null values.
    near_constant = int(((nunique > 1) & (nunique <= 2)).sum())
    dup = _count_duplicate_columns(X)

    # --- leakage suspects & bank-flagged features ---
    leak = _leakage_suspects(X, y)
    known_present = [f for f in config.KNOWN_IMPORTANT if f in X.columns]
    known_miss = X[known_present].isna().mean().round(3)

    # --- render the Markdown report ---
    most_missing = ", ".join(f"{c} ({m:.0%})" for c, m in miss.head(5).items())
    lines = [
        "# Data Profile — MuleGuard\n",
        "## Shape & Target\n",
        f"- **Rows (accounts):** {n_rows:,}",
        f"- **Features:** {n_feats:,} (target `{config.TARGET}` separated)",
        f"- **Class balance:** {neg:,} legit (0) vs **{pos} mule (1)**",
        f"- **Positive rate:** {pos_rate:.4%} → extreme imbalance",
        "- **Implication:** optimize **PR-AUC / recall@precision / F2**, never accuracy.\n",
        "## Missingness\n",
        f"- Mean missing fraction across features: **{miss.mean():.2%}**",
        f"- Features >90% missing: **{high_miss_90}**",
        f"- Features >50% missing: **{high_miss_50}**",
        f"- Most-missing features: {most_missing}\n",
        "## Degenerate Columns\n",
        f"- Constant (≤1 unique value): **{constant}**",
        f"- Near-constant (≤2 unique values): **{near_constant}**",
        f"- Duplicate columns (sampled): **~{dup}**\n",
        "## Leakage Suspects (single-feature AUC)\n",
        "High single-feature AUC on a rare target warrants review before trusting it.\n",
        "| Feature | Single-feature AUC |",
        "|---|---|",
    ]
    lines.extend(
        f"| {row.feature} | {row.single_feature_auc:.3f} |"
        for row in leak.itertuples(index=False)
    )
    lines.append("")
    lines.append("## Bank-flagged High-signal Features\n")
    lines.append(
        f"- Present in data: **{len(known_present)}/{len(config.KNOWN_IMPORTANT)}**"
    )
    lines.append("| Feature | Missing % |")
    lines.append("|---|---|")
    lines.extend(f"| {f} | {m:.1%} |" for f, m in known_miss.items())
    lines.append("")
    lines.append("## Recommendations\n")
    lines.append("1. Drop constant/duplicate/>90%-missing columns before selection.")
    lines.append("2. Median-impute + add missingness flags (missing-not-at-random may be predictive).")
    lines.append("3. Retain bank-flagged features as domain priors unless flagged leaky above.")
    lines.append("4. Use repeated stratified CV; report mean ± std.")
    lines.append("5. Review the top leakage suspects manually before including them.\n")

    out = config.REPORTS_DIR / "data_profile.md"
    # The report contains non-ASCII characters (—, →, ≤, ±); pin the encoding
    # so the write does not depend on the platform's locale default.
    out.write_text("\n".join(lines), encoding="utf-8")

    print(f"Wrote {out}")
    print(f"Rows={n_rows} Feats={n_feats} Pos={pos} ({pos_rate:.4%}) "
          f"Constant={constant} Dup~{dup} HighMiss90={high_miss_90}")
# Run the profiler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()