# FraudGuard_AI / fraud_model.py
# Generates a synthetic transaction dataset and trains two fraud models
# (IsolationForest anomaly detector + XGBoost classifier), saving them to disk.
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import shap
import joblib
def generate_fraud_dataset(num_samples=10000, random_state=None):
    """Generate a synthetic transaction dataset with a binary fraud label.

    Each transaction gets an additive fraud-risk score built from simple
    heuristics (large amount, high-risk country, risky merchant category,
    late-night hour); the binary ``target`` is then sampled with that
    probability.

    Parameters
    ----------
    num_samples : int, default 10000
        Number of transactions to generate.
    random_state : int or None, default None
        Seed for reproducible output; ``None`` keeps output non-deterministic
        (matching the original behavior).

    Returns
    -------
    pandas.DataFrame
        Columns: ``amount``, ``hour``, ``country``, ``merchant_category``,
        ``is_weekend``, ``target``.
    """
    rng = np.random.default_rng(random_state)

    # Base transaction features
    amount = rng.lognormal(3, 0.5, num_samples)
    hour = rng.integers(0, 24, num_samples)
    country = rng.choice(["US", "Nigeria", "Russia", "China", "UK"], num_samples)
    merchant_category = rng.choice(
        ["Retail", "Health", "Crypto", "Gambling", "Travel"], num_samples
    )
    is_weekend = rng.choice([0, 1], num_samples)

    # Additive risk score: each heuristic contributes a fixed weight.
    fraud_risk = np.zeros(num_samples)
    fraud_risk += (amount > 1000).astype(float) * 0.3  # large amounts
    high_risk_countries = ["Nigeria", "Russia", "China"]
    fraud_risk += np.isin(country, high_risk_countries).astype(float) * 0.3
    risky_merchants = ["Crypto", "Gambling"]
    fraud_risk += np.isin(merchant_category, risky_merchants).astype(float) * 0.3
    fraud_risk += ((hour >= 2) & (hour <= 5)).astype(float) * 0.1  # late-night window

    # Cap at 0.95 so no transaction is a guaranteed fraud.
    fraud_risk = np.clip(fraud_risk, 0, 0.95)
    # Generate target: higher fraud_risk -> higher chance of fraud
    target = (rng.random(num_samples) < fraud_risk).astype(int)

    return pd.DataFrame({
        "amount": amount,
        "hour": hour,
        "country": country,
        "merchant_category": merchant_category,
        "is_weekend": is_weekend,
        "target": target,
    })
# ---- Train models on a freshly generated synthetic dataset ----
df = generate_fraud_dataset()

# Feature engineering: log-scale the heavy-tailed amount, one-hot encode the
# categorical columns.
df['amount_log'] = np.log1p(df['amount'])
df = pd.get_dummies(df, columns=["country", "merchant_category"])
# get_dummies emits pandas bool columns; cast to int so the saved models and
# downstream consumers see a uniform numeric matrix.
for col in df.columns:
    if df[col].dtype == 'bool':
        df[col] = df[col].astype(int)

X = df.drop("target", axis=1)
y = df["target"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Unsupervised anomaly detector; contamination=0.05 assumes ~5% anomalies.
iso_forest = IsolationForest(contamination=0.05, random_state=42)
iso_forest.fit(X_train)
joblib.dump(iso_forest, "iso_forest.pkl")

# Supervised classifier. NOTE: `use_label_encoder` was deprecated in
# xgboost 1.6 and removed in 2.0 — passing it warns or errors on modern
# versions, so it is omitted here.
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)
xgb.get_booster().save_model("xgb_fraud.json")
# Persist the training-time column order so inference code can align its
# feature matrix before predicting.
joblib.dump(X_train.columns.tolist(), "train_columns.pkl")

# Evaluate on the held-out split.
preds = xgb.predict(X_test)
print(classification_report(y_test, preds))
print(" Models saved: iso_forest.pkl, xgb_fraud.json, train_columns.pkl")