Sahithi27 committed on
Commit
3cdcdd2
·
verified ·
1 Parent(s): 5ccde51

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +119 -0
app.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ import joblib
4
+
5
+ from xgboost import XGBClassifier
6
+ from sklearn.model_selection import train_test_split
7
+ from sklearn.metrics import roc_auc_score, classification_report
8
+
9
+ import onnxmltools
10
+ from onnxmltools.convert.common.data_types import FloatTensorType
11
+
12
+
13
def _load_dataset(data_path):
    """Read the transaction CSV and add the time and frequency feature columns.

    Args:
        data_path: Path of the CSV file. It must provide the ``timestamp``,
            ``user_id``, ``driver_id`` and ``transaction_id`` columns read here.

    Returns:
        The loaded DataFrame with ``hour``, ``day_of_week``,
        ``user_txn_count``, ``driver_txn_count`` and
        ``user_driver_pair_count`` columns added.
    """
    df = pd.read_csv(data_path)

    # errors="coerce" turns unparseable timestamps into NaT instead of
    # raising, so `hour` / `day_of_week` are NaN for those rows.
    parsed = pd.to_datetime(df["timestamp"], errors="coerce")
    df["timestamp"] = parsed
    df["hour"] = parsed.dt.hour
    df["day_of_week"] = parsed.dt.dayofweek

    # NOTE(review): these counts are taken over the whole file, i.e. before
    # the train/test split, so held-out rows contribute to the features of
    # training rows. Confirm this matches how counts are built at inference.
    count_features = (
        ("user_txn_count", "user_id"),
        ("driver_txn_count", "driver_id"),
        ("user_driver_pair_count", ["user_id", "driver_id"]),
    )
    for column, keys in count_features:
        df[column] = df.groupby(keys)["transaction_id"].transform("count")

    return df


def _class_balance_weight(labels):
    """Return the negative-to-positive label ratio used as ``scale_pos_weight``.

    Args:
        labels: Series of binary labels (0 = negative, 1 = positive).

    Returns:
        Number of negative labels divided by number of positive labels.

    Raises:
        ValueError: If either class is absent, since the ratio would be
            undefined (or zero) and the model could not be trained sensibly.
    """
    negatives = int((labels == 0).sum())
    positives = int((labels == 1).sum())
    if negatives == 0 or positives == 0:
        raise ValueError(
            "Training labels must contain both classes "
            f"(negatives={negatives}, positives={positives})"
        )
    return negatives / positives


def _export_onnx(model, n_features, onnx_path):
    """Convert the fitted XGBoost model to ONNX and write it to ``onnx_path``.

    Args:
        model: Fitted ``XGBClassifier``.
        n_features: Number of input columns the model was trained on.
        onnx_path: Destination file for the serialized ONNX model.
    """
    booster = model.get_booster()

    # Rename features to f0, f1, f2... (required by onnxmltools).
    # NOTE(review): presumably this mutates the booster owned by `model`;
    # keep this call after the joblib dump so the saved model keeps its
    # original feature names.
    booster.feature_names = [f"f{i}" for i in range(n_features)]

    initial_type = [("float_input", FloatTensorType([None, n_features]))]
    onnx_model = onnxmltools.convert_xgboost(
        booster,
        initial_types=initial_type,
        target_opset=12,
    )

    with open(onnx_path, "wb") as f:
        f.write(onnx_model.SerializeToString())


def main(data_path="synthetic_collusion_1M.csv", decision_threshold=0.7):
    """Train the collusion-fraud XGBoost model, evaluate it and export it.

    Steps: load the CSV and build features, make a stratified 80/20 split,
    fit an ``XGBClassifier``, print ROC-AUC and a classification report,
    save the model and feature order with joblib, then export to ONNX.

    Args:
        data_path: CSV file with the transactions and the
            ``is_collusion_fraud`` label column.
        decision_threshold: Probability above which a test row is reported
            as fraud in the classification report.

    Raises:
        ValueError: If the training labels do not contain both classes.
    """
    feature_columns = [
        "amount",
        "user_txn_count",
        "driver_txn_count",
        "user_driver_pair_count",
        "hour",
        "day_of_week",
    ]

    # =============================
    # 1-2. LOAD DATA + FEATURE ENGINEERING
    # =============================
    df = _load_dataset(data_path)

    # NOTE(review): missing values (including hour/day_of_week of rows with
    # unparseable timestamps) are replaced by 0, which is indistinguishable
    # from a genuine hour 0 / day 0.
    X = df[feature_columns].fillna(0)
    y = df["is_collusion_fraud"]

    # =============================
    # 3. TRAIN / TEST SPLIT
    # =============================
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        stratify=y,
        random_state=42,
    )

    # =============================
    # 4. TRAIN XGBOOST
    # =============================
    xgb_model = XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        # Re-weight the positive class by the negative/positive ratio.
        scale_pos_weight=_class_balance_weight(y_train),
        base_score=0.5,
        objective="binary:logistic",
        eval_metric="auc",
        random_state=42,
        n_jobs=-1,
    )
    xgb_model.fit(X_train, y_train)

    # =============================
    # 5. EVALUATION
    # =============================
    y_prob = xgb_model.predict_proba(X_test)[:, 1]
    y_pred = (y_prob > decision_threshold).astype(int)

    print("\n=== MODEL EVALUATION ===")
    print("ROC-AUC:", roc_auc_score(y_test, y_prob))
    print(classification_report(y_test, y_pred))

    # =============================
    # 6. SAVE MODEL ARTIFACTS
    # =============================
    joblib.dump(xgb_model, "collusion_xgb_model.joblib")
    joblib.dump(feature_columns, "feature_order.joblib")

    print("✅ Model and feature order saved")

    # =============================
    # 7. CONVERT TO ONNX
    # =============================
    _export_onnx(xgb_model, len(feature_columns), "collusion_xgb_model.onnx")

    print("✅ ONNX model exported successfully")


if __name__ == "__main__":
    main()
119
+