File size: 2,498 Bytes
1bb2414
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83

import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import shap
import joblib

def generate_fraud_dataset(num_samples=10000, random_state=None):
    """Generate a synthetic transaction dataset with a binary fraud target.

    Each row is assigned a latent fraud probability built from additive
    risk factors — large amount, high-risk country, risky merchant
    category, late-night hour — capped at 0.95, and the ``target`` label
    is then sampled from that probability.

    Parameters
    ----------
    num_samples : int, default 10000
        Number of transactions to generate.
    random_state : int or None, default None
        Seed for reproducible generation; ``None`` keeps the previous
        nondeterministic behavior.

    Returns
    -------
    pandas.DataFrame
        Columns: ``amount``, ``hour``, ``country``, ``merchant_category``,
        ``is_weekend``, ``target`` (0/1 int).
    """
    rng = np.random.default_rng(random_state)

    # Base features
    amount = rng.lognormal(3, 0.5, num_samples)
    hour = rng.integers(0, 24, num_samples)
    country = rng.choice(["US", "Nigeria", "Russia", "China", "UK"], num_samples)
    merchant_category = rng.choice(["Retail", "Health", "Crypto", "Gambling", "Travel"], num_samples)
    is_weekend = rng.choice([0, 1], num_samples)

    fraud_risk = np.zeros(num_samples)

    # Large transactions raise risk
    fraud_risk += (amount > 1000).astype(float) * 0.3

    # Transactions originating from high-risk countries
    high_risk_countries = ["Nigeria", "Russia", "China"]
    fraud_risk += np.isin(country, high_risk_countries).astype(float) * 0.3

    # Risky merchant categories
    risky_merchants = ["Crypto", "Gambling"]
    fraud_risk += np.isin(merchant_category, risky_merchants).astype(float) * 0.3

    # Late-night transactions (02:00–05:00) add a small bump
    fraud_risk += ((hour >= 2) & (hour <= 5)).astype(float) * 0.1

    # Cap the combined probability at 0.95
    fraud_risk = np.clip(fraud_risk, 0, 0.95)

    # Sample the label: higher fraud_risk -> higher chance of fraud
    target = (rng.random(num_samples) < fraud_risk).astype(int)

    return pd.DataFrame({
        "amount": amount,
        "hour": hour,
        "country": country,
        "merchant_category": merchant_category,
        "is_weekend": is_weekend,
        "target": target
    })


# Train models
df = generate_fraud_dataset()

# Feature engineering: log-transform the heavily right-skewed amount,
# one-hot encode the categoricals, and coerce pandas' bool dummy columns
# to 0/1 ints so downstream libraries see numeric features.
df['amount_log'] = np.log1p(df['amount'])
df = pd.get_dummies(df, columns=["country", "merchant_category"])
bool_cols = df.select_dtypes(include='bool').columns
df[bool_cols] = df[bool_cols].astype(int)

X = df.drop("target", axis=1)
y = df["target"]
# Stratify on the target so the (imbalanced) fraud rate is preserved
# in both the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train Isolation Forest — unsupervised anomaly detector; `contamination`
# is the assumed fraction of anomalies in the data.
iso_forest = IsolationForest(contamination=0.05, random_state=42)
iso_forest.fit(X_train)
joblib.dump(iso_forest, "iso_forest.pkl")

# Train XGBoost. NOTE: `use_label_encoder` was deprecated and later removed
# from xgboost's sklearn API, so it is no longer passed.
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)

# Persist in XGBoost's portable JSON format, plus the exact training
# column order so inference code can align incoming features.
xgb.get_booster().save_model("xgb_fraud.json")

joblib.dump(X_train.columns.tolist(), "train_columns.pkl")

# Evaluate the supervised model on the held-out split
preds = xgb.predict(X_test)
print(classification_report(y_test, preds))
print(" Models saved: iso_forest.pkl, xgb_fraud.json, train_columns.pkl")