"""Train fraud-detection models (Isolation Forest + XGBoost) on synthetic data.

Run as a script: generates a synthetic transaction dataset, engineers
features, trains both models, saves the artifacts to disk, and prints a
classification report for the supervised model.
"""
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import shap  # NOTE(review): unused in this file; kept in case other code relies on it
import joblib


def generate_fraud_dataset(num_samples=10000, random_state=None):
    """Build a synthetic transaction DataFrame with a binary fraud target.

    Fraud probability is the sum of independent risk bumps (large amount,
    high-risk country, risky merchant category, late-night hour), capped
    at 0.95; the binary target is then sampled from that probability.

    Parameters
    ----------
    num_samples : int
        Number of transaction rows to generate.
    random_state : int or None
        Seed for reproducible sampling. ``None`` (default) keeps the
        global NumPy RNG state, matching the original behavior.

    Returns
    -------
    pd.DataFrame
        Columns: ``amount``, ``hour``, ``country``, ``merchant_category``,
        ``is_weekend``, ``target`` (0/1 fraud label).
    """
    if random_state is not None:
        np.random.seed(random_state)

    # Base transaction attributes.
    amount = np.random.lognormal(3, 0.5, num_samples)
    hour = np.random.randint(0, 24, num_samples)
    country = np.random.choice(
        ["US", "Nigeria", "Russia", "China", "UK"], num_samples
    )
    merchant_category = np.random.choice(
        ["Retail", "Health", "Crypto", "Gambling", "Travel"], num_samples
    )
    is_weekend = np.random.choice([0, 1], num_samples)

    # Each risk factor adds a fixed bump to the fraud probability.
    fraud_risk = np.zeros(num_samples)
    fraud_risk += (amount > 1000).astype(float) * 0.3
    high_risk_countries = ["Nigeria", "Russia", "China"]
    fraud_risk += np.isin(country, high_risk_countries).astype(float) * 0.3
    risky_merchants = ["Crypto", "Gambling"]
    fraud_risk += np.isin(merchant_category, risky_merchants).astype(float) * 0.3
    fraud_risk += ((hour >= 2) & (hour <= 5)).astype(float) * 0.1

    # Cap at 0.95 so no row is a guaranteed fraud.
    fraud_risk = np.clip(fraud_risk, 0, 0.95)

    # Higher fraud_risk -> higher chance the sampled target is 1.
    target = (np.random.rand(num_samples) < fraud_risk).astype(int)

    return pd.DataFrame({
        "amount": amount,
        "hour": hour,
        "country": country,
        "merchant_category": merchant_category,
        "is_weekend": is_weekend,
        "target": target,
    })


def main():
    """Generate data, train both models, persist artifacts, print metrics."""
    df = generate_fraud_dataset()

    # Feature engineering: log-scale the heavy-tailed amount, one-hot the
    # categoricals, and force the resulting bool dummy columns to 0/1 ints
    # (pandas >= 2.0 emits bool dummies by default).
    df["amount_log"] = np.log1p(df["amount"])
    df = pd.get_dummies(df, columns=["country", "merchant_category"])
    bool_cols = df.select_dtypes(include="bool").columns
    df[bool_cols] = df[bool_cols].astype(int)

    X = df.drop("target", axis=1)
    y = df["target"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Unsupervised anomaly detector; `contamination` is the assumed outlier
    # fraction, not necessarily the true fraud rate.
    iso_forest = IsolationForest(contamination=0.05, random_state=42)
    iso_forest.fit(X_train)
    joblib.dump(iso_forest, "iso_forest.pkl")

    # Supervised classifier. `use_label_encoder` was deprecated in
    # XGBoost 1.6 and removed in 2.0, so it is no longer passed.
    xgb = XGBClassifier(eval_metric="logloss", random_state=42)
    xgb.fit(X_train, y_train)
    xgb.get_booster().save_model("xgb_fraud.json")

    # Persist the training column order so inference code can align features.
    joblib.dump(X_train.columns.tolist(), "train_columns.pkl")

    # Evaluate the supervised model on the held-out split.
    preds = xgb.predict(X_test)
    print(classification_report(y_test, preds))
    print(" Models saved: iso_forest.pkl, xgb_fraud.json, train_columns.pkl")


if __name__ == "__main__":
    main()