ravitejas1596's picture
Deploy Space
ae7c16d verified
from __future__ import annotations
from pathlib import Path
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from scripts.train_xgb import TARGET, _read_joined, _select_columns, _freq_encode
def _group_metrics(y: np.ndarray, p: np.ndarray, group: pd.Series) -> pd.DataFrame:
rows = []
group = group.fillna("missing").astype(str)
group = group.reset_index(drop=True)
for g in group.unique().tolist():
mask = (group == g).to_numpy()
n = int(mask.sum())
if n < 500:
continue
yy = y[mask]
pp = p[mask]
auc = float("nan")
if len(np.unique(yy)) == 2:
auc = float(roc_auc_score(yy, pp))
rows.append(
{
"group": str(g),
"n": n,
"fraud_rate": float(np.mean(yy)),
"mean_score": float(np.mean(pp)),
"roc_auc": auc,
}
)
return pd.DataFrame(rows).sort_values("n", ascending=False)
def main() -> None:
repo_root = Path(__file__).resolve().parents[1]
raw_dir = repo_root / "data" / "raw"
artifacts_path = repo_root / "artifacts" / "model.joblib"
out_dir = repo_root / "reports"
out_dir.mkdir(parents=True, exist_ok=True)
df = _read_joined(raw_dir, max_rows=200000)
df, numeric, categorical = _select_columns(df)
df = df.dropna(subset=[TARGET]).copy()
y = df[TARGET].astype(int).to_numpy()
X = df.drop(columns=[TARGET])
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.25, random_state=42, stratify=y
)
X_test = X_test.reset_index(drop=True)
payload = joblib.load(artifacts_path)
model = payload["model"]
imputer = payload["imputer"]
cat_cols = list(payload.get("categorical_cols") or [])
X_train_enc, X_test_enc, _ = _freq_encode(X_train, X_test, cat_cols)
Xte = imputer.transform(X_test_enc)
p = model.predict_proba(Xte)[:, 1]
product = X_test.get("ProductCD", pd.Series([None] * len(X_test)))
device = X_test.get("DeviceType", pd.Series([None] * len(X_test)))
product_table = _group_metrics(y_test, p, product)
device_table = _group_metrics(y_test, p, device)
md = []
md.append("## Bias audit (limited slices)\n")
md.append(
"This audit checks basic performance skews across a couple of available categorical slices. It is not a substitute for a full fairness review.\n"
)
md.append("### Slice: ProductCD\n")
md.append(product_table.to_markdown(index=False) if not product_table.empty else "_Not available._")
md.append("\n\n### Slice: DeviceType\n")
md.append(device_table.to_markdown(index=False) if not device_table.empty else "_Not available._")
md.append("\n")
out_path = out_dir / "bias_audit.md"
out_path.write_text("\n".join(md))
print(f"Wrote {out_path}")
if __name__ == "__main__":
main()