MuleGuard / src /data /profile.py
MuleGuard
MuleGuard: end-to-end mule-account detection + HF Space deploy
af879c2
Raw
History Blame Contribute Delete
4.69 kB
"""Profile the dataset: shape, target balance, missingness, degenerate columns,
and single-feature leakage suspects. Writes reports/data_profile.md."""
from __future__ import annotations
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from src import config
from src.data.load import load_raw, split_xy
def _leakage_suspects(X: pd.DataFrame, y: pd.Series, top: int = 15) -> pd.DataFrame:
    """Score every numeric feature by the ROC AUC it achieves on its own.

    Columns with fewer than 50 non-null values or fewer than 2 distinct
    values are skipped. Remaining gaps are filled with the column median
    before scoring. The score is folded to ``max(auc, 1 - auc)`` so that a
    feature ranks the same whichever direction it separates the classes.

    Args:
        X: Feature frame; only columns selected by
            ``select_dtypes(include="number")`` are scored.
        y: Target labels, passed to ``roc_auc_score`` as ``y_true``.
        top: Maximum number of rows to return.

    Returns:
        A frame with columns ``feature`` and ``single_feature_auc``, sorted
        by score in descending order and truncated to ``top`` rows.
    """
    target = y.values
    scored = []
    for name in X.select_dtypes(include="number").columns:
        column = X[name]
        # Too few observed values to produce a meaningful score.
        if column.notna().sum() < 50:
            continue
        # A column with a single distinct value cannot rank anything.
        if column.nunique(dropna=True) < 2:
            continue
        imputed = column.fillna(column.median())
        try:
            score = roc_auc_score(target, imputed.values)
        except ValueError:
            # The metric rejected this input; leave the column out.
            continue
        scored.append((name, max(score, 1 - score)))
    ranked = pd.DataFrame(scored, columns=["feature", "single_feature_auc"])
    ranked = ranked.sort_values("single_feature_auc", ascending=False)
    return ranked.head(top)
def _count_duplicate_columns(X: pd.DataFrame, sample_rows: int = 2000) -> int:
    """Count columns whose first ``sample_rows`` values repeat an earlier column.

    Only a head sample is compared (for speed), so the result is an estimate:
    columns that agree on the sample but differ later are still counted.
    """
    sample = X.head(sample_rows).fillna(-999_999)
    seen = set()
    dup = 0
    for col in sample.columns:
        row_hashes = pd.util.hash_pandas_object(sample[col], index=False)
        # Fingerprint the ordered per-row hashes. Summing them would ignore
        # row order, so two columns holding the same values in different rows
        # (e.g. sparse flags with equal counts) would be reported as duplicates.
        fingerprint = hash(row_hashes.to_numpy().tobytes())
        if fingerprint in seen:
            dup += 1
        else:
            seen.add(fingerprint)
    return dup


def main() -> None:
    """Profile the raw dataset and write ``data_profile.md`` to the reports dir.

    The report covers shape and class balance, per-feature missingness,
    constant / near-constant / duplicate columns, the top single-feature AUC
    leakage suspects, the missingness of ``config.KNOWN_IMPORTANT`` features
    present in the data, and a fixed list of recommendations. A one-line
    summary is also printed to stdout.
    """
    config.ensure_dirs()
    df = load_raw()
    X, y = split_xy(df)

    n_rows, n_feats = X.shape
    pos = int(y.sum())
    neg = int((y == 0).sum())
    # Guard the division so an empty dataset yields a report instead of a crash.
    pos_rate = pos / len(y) if len(y) else 0.0

    miss = X.isna().mean().sort_values(ascending=False)
    high_miss_90 = int((miss > 0.90).sum())
    high_miss_50 = int((miss > 0.50).sum())

    nunique = X.nunique(dropna=True)
    constant = int((nunique <= 1).sum())
    near_constant = int(((nunique > 1) & (nunique <= 2)).sum())

    # duplicate columns (by content hash over a sample of rows for speed)
    dup = _count_duplicate_columns(X)

    leak = _leakage_suspects(X, y)

    known_present = [f for f in config.KNOWN_IMPORTANT if f in X.columns]
    known_miss = X[known_present].isna().mean().round(3)

    lines = []
    lines.append("# Data Profile — MuleGuard\n")

    lines.append("## Shape & Target\n")
    lines.append(f"- **Rows (accounts):** {n_rows:,}")
    lines.append(f"- **Features:** {n_feats:,} (target `{config.TARGET}` separated)")
    lines.append(f"- **Class balance:** {neg:,} legit (0) vs **{pos} mule (1)**")
    lines.append(f"- **Positive rate:** {pos_rate:.4%} → extreme imbalance")
    lines.append("- **Implication:** optimize **PR-AUC / recall@precision / F2**, never accuracy.\n")

    lines.append("## Missingness\n")
    lines.append(f"- Mean missing fraction across features: **{miss.mean():.2%}**")
    lines.append(f"- Features >90% missing: **{high_miss_90}**")
    lines.append(f"- Features >50% missing: **{high_miss_50}**")
    lines.append(f"- Most-missing features: {', '.join(f'{c} ({m:.0%})' for c, m in miss.head(5).items())}\n")

    lines.append("## Degenerate Columns\n")
    lines.append(f"- Constant (≤1 unique value): **{constant}**")
    lines.append(f"- Near-constant (≤2 unique values): **{near_constant}**")
    lines.append(f"- Duplicate columns (sampled): **~{dup}**\n")

    lines.append("## Leakage Suspects (single-feature AUC)\n")
    lines.append("High single-feature AUC on a rare target warrants review before trusting it.\n")
    lines.append("| Feature | Single-feature AUC |")
    lines.append("|---|---|")
    for row in leak.itertuples(index=False):
        lines.append(f"| {row.feature} | {row.single_feature_auc:.3f} |")
    lines.append("")

    lines.append("## Bank-flagged High-signal Features\n")
    lines.append(f"- Present in data: **{len(known_present)}/{len(config.KNOWN_IMPORTANT)}**")
    lines.append("| Feature | Missing % |")
    lines.append("|---|---|")
    for f, m in known_miss.items():
        lines.append(f"| {f} | {m:.1%} |")
    lines.append("")

    lines.append("## Recommendations\n")
    lines.append("1. Drop constant/duplicate/>90%-missing columns before selection.")
    lines.append("2. Median-impute + add missingness flags (missing-not-at-random may be predictive).")
    lines.append("3. Retain bank-flagged features as domain priors unless flagged leaky above.")
    lines.append("4. Use repeated stratified CV; report mean ± std.")
    lines.append("5. Review the top leakage suspects manually before including them.\n")

    out = config.REPORTS_DIR / "data_profile.md"
    # The report contains non-ASCII characters (—, →, ≤, ±); without an
    # explicit encoding the locale default is used, which can fail to encode
    # them on non-UTF-8 systems.
    out.write_text("\n".join(lines), encoding="utf-8")

    print(f"Wrote {out}")
    print(f"Rows={n_rows} Feats={n_feats} Pos={pos} ({pos_rate:.4%}) "
          f"Constant={constant} Dup~{dup} HighMiss90={high_miss_90}")
# Call main() only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()