Spaces:
Running
Running
Upload 9 files
Browse files- app.py +93 -0
- fraud_model.py +82 -0
- iso_forest.pkl +3 -0
- parser.py +31 -0
- requirements.txt +8 -0
- train_columns.pkl +3 -0
- xgb_explainer.pkl +3 -0
- xgb_fraud.json +0 -0
- xgb_fraud.pkl +3 -0
app.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import gradio as gr
|
| 3 |
+
import joblib
|
| 4 |
+
import pandas as pd
|
| 5 |
+
import shap
|
| 6 |
+
import xgboost as xgb
|
| 7 |
+
from xgboost import XGBClassifier
|
| 8 |
+
import numpy as np
|
| 9 |
+
import matplotlib
|
| 10 |
+
matplotlib.use('Agg')
|
| 11 |
+
import matplotlib.pyplot as plt
|
| 12 |
+
import os
|
| 13 |
+
|
| 14 |
+
# Load the artifacts produced by fraud_model.py; fail fast with an actionable
# message if training has not been run yet.
try:
    iso_forest = joblib.load("iso_forest.pkl")
    # Load XGBoost from JSON (portable across xgboost versions, unlike pickle)
    xgb = XGBClassifier()  # NOTE(review): rebinds `xgb`, shadowing the `import xgboost as xgb` module alias above — confirm the module alias is never needed after this point
    xgb.load_model("xgb_fraud.json")

    # Load training columns (one-hot schema used to reindex inference inputs)
    train_cols = joblib.load("train_columns.pkl")

except FileNotFoundError as e:
    raise FileNotFoundError(f"File missing: {e}. Did you run fraud_model.py?")


# SHAP explainer seeded with a zero-filled single-row frame: only the column
# layout matters for establishing the feature schema, not the values.
explainer = shap.Explainer(xgb, pd.DataFrame(np.zeros((1, len(train_cols))), columns=train_cols))
| 29 |
+
def predict_fraud(amount, hour, country, merchant_category, is_weekend):
    """Score one transaction and explain the decision.

    Args:
        amount: transaction amount; coerced to float.
        hour: hour of day (0-23); coerced to int.
        country: country name matching one of the training categories.
        merchant_category: merchant category matching a training category.
        is_weekend: truthy flag; coerced to int.

    Returns:
        (status_message, matplotlib_figure) — the figure is a SHAP waterfall
        plot, or None when the inputs cannot be parsed.
    """
    try:
        amount = float(amount)
        hour = int(hour)
        is_weekend = int(is_weekend)
    except (TypeError, ValueError):
        # TypeError covers a cleared gr.Number field (float(None));
        # ValueError covers non-numeric text. Both mean "bad input".
        return " Invalid input: Please enter valid numbers.", None

    input_data = pd.DataFrame({
        "amount": [amount],
        "hour": [hour],
        "is_weekend": [is_weekend],
        "country": [country],
        "merchant_category": [merchant_category]
    })

    # Mirror the training-time feature engineering, then align the one-hot
    # columns with the training schema (missing categories filled with 0).
    input_data['amount_log'] = np.log1p(input_data['amount'])
    input_data = pd.get_dummies(input_data, columns=["country", "merchant_category"])
    input_data = input_data.reindex(columns=train_cols, fill_value=0)

    # Unsupervised anomaly score (lower = more anomalous) and supervised label.
    risk_score = iso_forest.score_samples(input_data)[0]
    prediction = xgb.predict(input_data)[0]

    # Build the SHAP waterfall explanation for the single row.
    shap_values = explainer(input_data)
    fig, ax = plt.subplots(figsize=(8, 5))
    shap.plots.waterfall(shap_values[0], max_display=6, show=False)
    plt.tight_layout()
    # Close this specific figure in pyplot's registry; the object itself
    # survives and Gradio renders it from the returned reference.
    plt.close(fig)

    if prediction == 1:
        return f" FRAUD DETECTED! Anomaly Score: {risk_score:.3f}", fig
    else:
        return f" No Fraud. Anomaly Score: {risk_score:.3f}", fig
| 65 |
+
# Gradio Interface — component creation order inside the Blocks context
# defines the on-screen layout, so these statements must stay in order.
with gr.Blocks(title="FraudGuard", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# FraudGuard Real-Time Transaction Fraud Detector
Enter transaction details below. FraudGuard uses AI to detect and **explain** fraud risk.
""")

    # Input widgets, laid out on a single row; defaults form a valid sample.
    with gr.Row():
        amount = gr.Number(label="Transaction Amount ($)", value=100.0)
        hour = gr.Slider(0, 23, step=1, label="Hour of Day", value=14)
        country = gr.Dropdown(["US", "Nigeria", "Russia", "China", "UK"], label="Country", value="US")
        merchant_category = gr.Dropdown(["Retail", "Health", "Crypto", "Gambling", "Travel"],
                                        label="Merchant Category", value="Retail")
        is_weekend = gr.Checkbox(label="Is Weekend?")

    # Outputs: textual verdict plus the SHAP waterfall figure.
    output = gr.Textbox(label="Risk Status")
    explanation = gr.Plot(label="Why This Decision? (SHAP Explanation)")

    # Wire the button to predict_fraud: five inputs in, (text, plot) out.
    submit_btn = gr.Button(" Analyze Transaction")
    submit_btn.click(
        fn=predict_fraud,
        inputs=[amount, hour, country, merchant_category, is_weekend],
        outputs=[output, explanation]
    )

if __name__ == "__main__":
    # share=True requests a public tunnel — appropriate for a demo Space.
    demo.launch(share=True)
fraud_model.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
from sklearn.ensemble import IsolationForest
|
| 5 |
+
from xgboost import XGBClassifier
|
| 6 |
+
from sklearn.model_selection import train_test_split
|
| 7 |
+
from sklearn.metrics import classification_report
|
| 8 |
+
import shap
|
| 9 |
+
import joblib
|
| 10 |
+
|
| 11 |
+
def generate_fraud_dataset(num_samples=10000, random_state=None):
    """Generate a synthetic transaction dataset with a probabilistic fraud label.

    Risk is accumulated additively from four heuristics (large amount,
    high-risk country, risky merchant, small-hours timing), capped at 0.95,
    and the binary target is sampled from that probability.

    Args:
        num_samples: number of rows to generate.
        random_state: optional seed for reproducibility. Default None keeps
            the previous nondeterministic behavior, so existing callers are
            unaffected.

    Returns:
        DataFrame with columns: amount, hour, country, merchant_category,
        is_weekend, target (0/1).
    """
    rng = np.random.default_rng(random_state)

    # Base data
    amount = rng.lognormal(3, 0.5, num_samples)
    hour = rng.integers(0, 24, num_samples)
    country = rng.choice(["US", "Nigeria", "Russia", "China", "UK"], num_samples)
    merchant_category = rng.choice(["Retail", "Health", "Crypto", "Gambling", "Travel"], num_samples)
    is_weekend = rng.choice([0, 1], num_samples)

    fraud_risk = np.zeros(num_samples)

    # Large transactions are riskier.
    fraud_risk += (amount > 1000).astype(float) * 0.3

    # Synthetic high-risk origin countries.
    high_risk_countries = ["Nigeria", "Russia", "China"]
    fraud_risk += np.isin(country, high_risk_countries).astype(float) * 0.3

    # Merchant categories associated with fraud.
    risky_merchants = ["Crypto", "Gambling"]
    fraud_risk += np.isin(merchant_category, risky_merchants).astype(float) * 0.3

    # Small-hours transactions (02:00-05:00) get a small bump.
    fraud_risk += ((hour >= 2) & (hour <= 5)).astype(float) * 0.1

    # Combine and cap at 0.95
    fraud_risk = np.clip(fraud_risk, 0, 0.95)

    # Generate target: higher fraud_risk → higher chance of fraud
    target = (rng.random(num_samples) < fraud_risk).astype(int)

    return pd.DataFrame({
        "amount": amount,
        "hour": hour,
        "country": country,
        "merchant_category": merchant_category,
        "is_weekend": is_weekend,
        "target": target
    })
| 51 |
+
# Train models — runs the full pipeline at import time and writes the
# artifacts that app.py loads.
df = generate_fraud_dataset()
df['amount_log'] = np.log1p(df['amount'])
df = pd.get_dummies(df, columns=["country", "merchant_category"])
# XGBoost/SHAP want numeric inputs; recent pandas emits bool dummy columns.
bool_cols = df.select_dtypes(bool).columns
df[bool_cols] = df[bool_cols].astype(int)

X = df.drop("target", axis=1)
y = df["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Isolation Forest — unsupervised anomaly detector, ~5% contamination.
iso_forest = IsolationForest(contamination=0.05, random_state=42)
iso_forest.fit(X_train)
joblib.dump(iso_forest, "iso_forest.pkl")

# Train XGBoost. `use_label_encoder` was deprecated in xgboost 1.6 and later
# removed, so it must not be passed on current releases.
xgb = XGBClassifier(eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)

# Save as JSON — portable across xgboost versions, unlike pickling the object.
xgb.get_booster().save_model("xgb_fraud.json")

# Persist the training column order so inference can reindex one-hot inputs.
joblib.dump(X_train.columns.tolist(), "train_columns.pkl")

# Evaluate
preds = xgb.predict(X_test)
print(classification_report(y_test, preds))
print(" Models saved: iso_forest.pkl, xgb_fraud.json, train_columns.pkl")
|
iso_forest.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:98cd90d8667eaefb4df9f0559b162c6f085d8016e92cd656cf1984153ca984a2
|
| 3 |
+
size 1911161
|
parser.py
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
+
from sklearn.preprocessing import StandardScaler
|
| 5 |
+
|
| 6 |
+
def prase_transaction_data(file_path):
    """Parse and clean raw transaction data.

    (The function name keeps its historical 'prase' typo so existing callers
    keep working.)

    Args:
        file_path: path or file-like object readable by pandas.read_csv;
            the CSV must contain 'timestamp', 'amount' and 'country' columns.

    Returns:
        Cleaned DataFrame (NaN rows dropped) with added feature columns:
        'hour', 'amount_log', and 'is_high_risk_country' (0/1).
    """
    df = pd.read_csv(file_path)

    df = df.dropna()
    df['hour'] = pd.to_datetime(df['timestamp']).dt.hour
    df['amount_log'] = np.log1p(df['amount'])
    # Vectorized membership test instead of a row-wise apply(lambda).
    df['is_high_risk_country'] = df['country'].isin(["Nigeria", "Russia", "China"]).astype(int)

    return df
| 17 |
+
def preprocess_for_model(df):
    """Turn a cleaned transaction frame into a model-ready feature matrix.

    Returns (X_scaled, y) where X_scaled is a standardized numpy array and
    y is the 'fraud_label' column, or None when that column is absent.
    NOTE(review): the scaler is re-fit on every call — confirm callers only
    use this for training-time data, not single-row inference.
    """
    feature_cols = ['amount_log', 'hour', 'is_high_risk_country', 'merchant_category']
    matrix = df[feature_cols]
    labels = df.get('fraud_label', None)

    # One-hot encode the merchant category, dropping the first level.
    matrix = pd.get_dummies(matrix, columns=['merchant_category'], drop_first=True)

    # Standardize every feature column to zero mean / unit variance.
    return StandardScaler().fit_transform(matrix), labels
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pandas
|
| 2 |
+
numpy
|
| 3 |
+
scikit-learn
|
| 4 |
+
xgboost
|
| 5 |
+
shap
|
| 6 |
+
gradio
|
| 7 |
+
joblib
|
| 8 |
+
matplotlib
|
train_columns.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:81b0dd6b6e09b1fa1e3c073f711c98fadbfd1b0ac866d3c06755025958da8bb2
|
| 3 |
+
size 272
|
xgb_explainer.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:41e0d684af68af6429f119b65cbb79cc30bbe438a996084917a87434ef44c748
|
| 3 |
+
size 1150093
|
xgb_fraud.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
xgb_fraud.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2e1e8aae48dff34de465ecc84cd199306980ae617a4ace286b92febe6a828d4e
|
| 3 |
+
size 350795
|