Spaces:

Sahithi27
/

Collision_Fraud_Detection

Sleeping

App Files Files Community

Sahithi27 committed on Jan 7

Commit

3cdcdd2

verified ·

1 Parent(s): 5ccde51

Create app.py

Browse files

Files changed (1) hide show

app.py +119 -0

app.py ADDED Viewed

	@@ -0,0 +1,119 @@

+import pandas as pd
+import numpy as np
+import joblib
+from xgboost import XGBClassifier
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import roc_auc_score, classification_report
+import onnxmltools
+from onnxmltools.convert.common.data_types import FloatTensorType
+def main():
+    # =============================
+    # 1. LOAD DATA
+    # =============================
+    df = pd.read_csv("synthetic_collusion_1M.csv")
+    # Robust timestamp parsing
+    df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
+    df["hour"] = df["timestamp"].dt.hour
+    df["day_of_week"] = df["timestamp"].dt.dayofweek
+    # =============================
+    # 2. FEATURE ENGINEERING
+    # =============================
+    df["user_txn_count"] = df.groupby("user_id")["transaction_id"].transform("count")
+    df["driver_txn_count"] = df.groupby("driver_id")["transaction_id"].transform("count")
+    df["user_driver_pair_count"] = (
+        df.groupby(["user_id", "driver_id"])["transaction_id"]
+          .transform("count")
+    )
+    FEATURES = [
+        "amount",
+        "user_txn_count",
+        "driver_txn_count",
+        "user_driver_pair_count",
+        "hour",
+        "day_of_week"
+    ]
+    X = df[FEATURES].fillna(0)
+    y = df["is_collusion_fraud"]
+    # =============================
+    # 3. TRAIN / TEST SPLIT
+    # =============================
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y,
+        test_size=0.2,
+        stratify=y,
+        random_state=42
+    )
+    # =============================
+    # 4. TRAIN XGBOOST
+    # =============================
+    xgb_model = XGBClassifier(
+        n_estimators=300,
+        max_depth=6,
+        learning_rate=0.05,
+        subsample=0.8,
+        colsample_bytree=0.8,
+        scale_pos_weight=y_train.value_counts()[0] / y_train.value_counts()[1],
+        base_score=0.5,
+        objective="binary:logistic",
+        eval_metric="auc",
+        random_state=42,
+        n_jobs=-1
+    )
+    xgb_model.fit(X_train, y_train)
+    # =============================
+    # 5. EVALUATION
+    # =============================
+    y_prob = xgb_model.predict_proba(X_test)[:, 1]
+    print("\n=== MODEL EVALUATION ===")
+    print("ROC-AUC:", roc_auc_score(y_test, y_prob))
+    print(classification_report(y_test, (y_prob > 0.7).astype(int)))
+    # =============================
+    # 6. SAVE MODEL ARTIFACTS
+    # =============================
+    joblib.dump(xgb_model, "collusion_xgb_model.joblib")
+    joblib.dump(FEATURES, "feature_order.joblib")
+    print("✅ Model and feature order saved")
+    # =============================
+    # 7. CONVERT TO ONNX
+    # =============================
+    booster = xgb_model.get_booster()
+    # Rename features to f0, f1, f2... (required by onnxmltools)
+    booster.feature_names = [f"f{i}" for i in range(len(FEATURES))]
+    initial_type = [
+        ("float_input", FloatTensorType([None, len(FEATURES)]))
+    ]
+    onnx_model = onnxmltools.convert_xgboost(
+        booster,
+        initial_types=initial_type,
+        target_opset=12
+    )
+    with open("collusion_xgb_model.onnx", "wb") as f:
+        f.write(onnx_model.SerializeToString())
+    print("✅ ONNX model exported successfully")
+# Run the pipeline only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()