# Fraud-detection model training script (exported from a hosted notebook/Space).
import joblib
import numpy as np
import pandas as pd
import shap
from sklearn.ensemble import IsolationForest
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
def generate_fraud_dataset(num_samples=10000, seed=None):
    """Generate a synthetic transaction dataset with a binary fraud target.

    A per-row fraud-risk score is built from additive heuristics (large
    amount, high-risk country, risky merchant category, late-night hour),
    capped at 0.95, and then used as the probability when sampling the
    binary ``target`` label.

    Parameters
    ----------
    num_samples : int, default 10000
        Number of transactions (rows) to generate.
    seed : int or None, default None
        Optional seed for NumPy's global RNG so output is reproducible.
        ``None`` (the default) preserves the original unseeded behavior.

    Returns
    -------
    pandas.DataFrame
        Columns: ``amount``, ``hour``, ``country``, ``merchant_category``,
        ``is_weekend``, ``target`` (1 = fraud, 0 = legitimate).
    """
    if seed is not None:
        np.random.seed(seed)

    # Base transaction features.
    amount = np.random.lognormal(3, 0.5, num_samples)  # heavy-tailed amounts
    hour = np.random.randint(0, 24, num_samples)
    country = np.random.choice(["US", "Nigeria", "Russia", "China", "UK"], num_samples)
    merchant_category = np.random.choice(
        ["Retail", "Health", "Crypto", "Gambling", "Travel"], num_samples
    )
    is_weekend = np.random.choice([0, 1], num_samples)

    # Additive risk heuristics; individual weights sum to at most 1.0.
    fraud_risk = np.zeros(num_samples)
    fraud_risk += (amount > 1000).astype(float) * 0.3  # very large amounts
    high_risk_countries = ["Nigeria", "Russia", "China"]
    fraud_risk += np.isin(country, high_risk_countries).astype(float) * 0.3
    risky_merchants = ["Crypto", "Gambling"]
    fraud_risk += np.isin(merchant_category, risky_merchants).astype(float) * 0.3
    fraud_risk += ((hour >= 2) & (hour <= 5)).astype(float) * 0.1  # late-night window

    # Cap at 0.95 so no transaction is a guaranteed fraud.
    fraud_risk = np.clip(fraud_risk, 0, 0.95)

    # Sample the label: higher fraud_risk -> higher chance of fraud.
    target = (np.random.rand(num_samples) < fraud_risk).astype(int)

    return pd.DataFrame({
        "amount": amount,
        "hour": hour,
        "country": country,
        "merchant_category": merchant_category,
        "is_weekend": is_weekend,
        "target": target,
    })
# ---------------------------------------------------------------------------
# Training script: engineer features, fit an Isolation Forest (unsupervised
# anomaly detector) and an XGBoost classifier, persist the artifacts, and
# print a held-out evaluation report.
# ---------------------------------------------------------------------------

# Seed NumPy's global RNG so the synthetic dataset is reproducible; the
# train/test split and both models below already pin random_state=42.
np.random.seed(42)
df = generate_fraud_dataset()

# Feature engineering: log-transform the heavy-tailed amount and one-hot
# encode the categorical columns.
df["amount_log"] = np.log1p(df["amount"])
df = pd.get_dummies(df, columns=["country", "merchant_category"])

# Recent pandas emits bool dummy columns; cast them to int in one shot so
# downstream estimators see a uniform numeric matrix.
bool_cols = df.select_dtypes(include="bool").columns
df[bool_cols] = df[bool_cols].astype(int)

X = df.drop("target", axis=1)
y = df["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Isolation Forest: unsupervised anomaly scorer, fit on features only.
iso_forest = IsolationForest(contamination=0.05, random_state=42)
iso_forest.fit(X_train)
joblib.dump(iso_forest, "iso_forest.pkl")

# XGBoost classifier. NOTE: `use_label_encoder` was deprecated and later
# removed from xgboost, so it is no longer passed.
xgb = XGBClassifier(eval_metric="logloss", random_state=42)
xgb.fit(X_train, y_train)
xgb.get_booster().save_model("xgb_fraud.json")

# Persist the training column order so inference code can align features.
joblib.dump(X_train.columns.tolist(), "train_columns.pkl")

# Evaluate the supervised model on the held-out split.
preds = xgb.predict(X_test)
print(classification_report(y_test, preds))
print(" Models saved: iso_forest.pkl, xgb_fraud.json, train_columns.pkl")