"""Profile the dataset: shape, target balance, missingness, degenerate columns, and single-feature leakage suspects. Writes reports/data_profile.md.""" from __future__ import annotations import numpy as np import pandas as pd from sklearn.metrics import roc_auc_score from src import config from src.data.load import load_raw, split_xy def _leakage_suspects(X: pd.DataFrame, y: pd.Series, top: int = 15) -> pd.DataFrame: """Rank features by single-feature AUC. Very high values are leakage suspects.""" rows = [] yv = y.values numeric_cols = X.select_dtypes(include="number").columns for col in numeric_cols: s = X[col] if s.notna().sum() < 50 or s.nunique(dropna=True) < 2: continue filled = s.fillna(s.median()) try: auc = roc_auc_score(yv, filled.values) except ValueError: continue rows.append((col, max(auc, 1 - auc))) out = pd.DataFrame(rows, columns=["feature", "single_feature_auc"]) return out.sort_values("single_feature_auc", ascending=False).head(top) def main() -> None: config.ensure_dirs() df = load_raw() X, y = split_xy(df) n_rows, n_feats = X.shape pos = int(y.sum()) neg = int((y == 0).sum()) pos_rate = pos / len(y) miss = X.isna().mean().sort_values(ascending=False) high_miss_90 = int((miss > 0.90).sum()) high_miss_50 = int((miss > 0.50).sum()) nunique = X.nunique(dropna=True) constant = int((nunique <= 1).sum()) near_constant = int(((nunique > 1) & (nunique <= 2)).sum()) # duplicate columns (by content hash over a sample of rows for speed) sample = X.head(2000).fillna(-999_999) hashes = {} dup = 0 for col in sample.columns: h = pd.util.hash_pandas_object(sample[col], index=False).sum() if h in hashes: dup += 1 else: hashes[h] = col leak = _leakage_suspects(X, y) known_present = [f for f in config.KNOWN_IMPORTANT if f in X.columns] known_miss = X[known_present].isna().mean().round(3) lines = [] lines.append("# Data Profile — MuleGuard\n") lines.append("## Shape & Target\n") lines.append(f"- **Rows (accounts):** {n_rows:,}") lines.append(f"- **Features:** {n_feats:,} (target `{config.TARGET}` separated)") lines.append(f"- **Class balance:** {neg:,} legit (0) vs **{pos} mule (1)**") lines.append(f"- **Positive rate:** {pos_rate:.4%} → extreme imbalance") lines.append(f"- **Implication:** optimize **PR-AUC / recall@precision / F2**, never accuracy.\n") lines.append("## Missingness\n") lines.append(f"- Mean missing fraction across features: **{miss.mean():.2%}**") lines.append(f"- Features >90% missing: **{high_miss_90}**") lines.append(f"- Features >50% missing: **{high_miss_50}**") lines.append(f"- Most-missing features: {', '.join(f'{c} ({m:.0%})' for c, m in miss.head(5).items())}\n") lines.append("## Degenerate Columns\n") lines.append(f"- Constant (≤1 unique value): **{constant}**") lines.append(f"- Near-constant (≤2 unique values): **{near_constant}**") lines.append(f"- Duplicate columns (sampled): **~{dup}**\n") lines.append("## Leakage Suspects (single-feature AUC)\n") lines.append("High single-feature AUC on a rare target warrants review before trusting it.\n") lines.append("| Feature | Single-feature AUC |") lines.append("|---|---|") for _, r in leak.iterrows(): lines.append(f"| {r.feature} | {r.single_feature_auc:.3f} |") lines.append("") lines.append("## Bank-flagged High-signal Features\n") lines.append(f"- Present in data: **{len(known_present)}/{len(config.KNOWN_IMPORTANT)}**") lines.append("| Feature | Missing % |") lines.append("|---|---|") for f, m in known_miss.items(): lines.append(f"| {f} | {m:.1%} |") lines.append("") lines.append("## Recommendations\n") lines.append("1. Drop constant/duplicate/>90%-missing columns before selection.") lines.append("2. Median-impute + add missingness flags (missing-not-at-random may be predictive).") lines.append("3. Retain bank-flagged features as domain priors unless flagged leaky above.") lines.append("4. Use repeated stratified CV; report mean ± std.") lines.append("5. Review the top leakage suspects manually before including them.\n") out = config.REPORTS_DIR / "data_profile.md" out.write_text("\n".join(lines)) print(f"Wrote {out}") print(f"Rows={n_rows} Feats={n_feats} Pos={pos} ({pos_rate:.4%}) " f"Constant={constant} Dup~{dup} HighMiss90={high_miss_90}") if __name__ == "__main__": main()