File size: 2,523 Bytes
94c31e0
 
 
 
 
f9e312c
94c31e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9e312c
94c31e0
f9e312c
94c31e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f9e312c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94c31e0
 
 
 
 
f9e312c
94c31e0
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score, f1_score
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Load the transactions table; the 'Class' column is the prediction target.
df = pd.read_csv('data/creditcard.csv')

# Features are every column except the label and the 'Time'/'Amount' columns.
x = df.drop(['Class', 'Time', 'Amount'], axis=1)
y = df['Class']

# Stratify on the label so both splits keep the class proportions of the full
# dataset. Without it, a rare positive class can be over- or under-represented
# in the 20% hold-out purely by chance, which distorts the reported metrics.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y,
)

# Scaling lives inside the pipeline so the scaler statistics are learned from
# the training split only and are re-applied automatically at predict time.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(
        max_iter=1000,
        solver='lbfgs',
        # Reweight classes inversely to their frequency in y_train.
        class_weight='balanced',
        random_state=42,
    ))
])
pipeline.fit(x_train, y_train)

# Find best threshold
# The threshold is tuned on out-of-fold predictions from the TRAINING data.
# Tuning it on the test set would leak test labels into the model selection
# and make every metric reported on that same test set optimistically biased.
# cross_val_predict fits clones of the pipeline, so the already-fitted
# `pipeline` object is left untouched.
cv_proba = cross_val_predict(
    pipeline, x_train, y_train, cv=5, method='predict_proba',
)[:, 1]

# 90 evenly spaced candidates from 0.05 to 0.94. linspace states the number of
# points explicitly; arange with a float step can gain or lose the last point
# depending on rounding.
thresholds = np.linspace(0.05, 0.94, 90)
best_thresh = 0.5  # fallback if no candidate achieves a positive F1
best_f1 = 0

for t in thresholds:
    preds = (cv_proba >= t).astype(int)
    f1 = f1_score(y_train, preds, zero_division=0)
    # Strict '>' keeps the lowest threshold among ties.
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t

# Positive-class probabilities on the held-out test set; these are used only
# for the final evaluation, never for choosing the threshold.
proba = pipeline.predict_proba(x_test)[:, 1]

print(f"Best threshold: {best_thresh:.2f} with F1: {best_f1:.4f}")

# Persist the fitted pipeline and the chosen decision threshold side by side
# so that inference code can load both from the same directory.
os.makedirs('artifacts', exist_ok=True)

for artifact_path, artifact in (
    ('artifacts/model.pkl', pipeline),
    # Stored as a plain Python float rather than a NumPy scalar.
    ('artifacts/threshold.pkl', float(best_thresh)),
):
    with open(artifact_path, 'wb') as out_file:
        pickle.dump(artifact, out_file)

# ── Export a small labelled sample drawn from the held-out test set ──
# Up to 10 rows with Class == 1 and up to 10 rows with Class == 0, taken in
# the order they appear in y_test.
fraud_idx = y_test[y_test == 1].index[:10]
legit_idx = y_test[y_test == 0].index[:10]
sample_idx = fraud_idx.append(legit_idx).tolist()

# Feature rows plus their true label appended as a trailing 'Class' column.
sample_labels = y_test.loc[sample_idx].values
test_sample = x_test.loc[sample_idx].assign(Class=sample_labels)

import random  # NOTE(review): not referenced anywhere in the visible script

# Shuffle deterministically so the two classes are interleaved in the file.
shuffled = test_sample.sample(frac=1, random_state=42)
test_sample = shuffled.reset_index(drop=True)
test_sample.to_csv('artifacts/test_transactions.csv', index=False)
print("Test CSV saved to artifacts/test_transactions.csv")
print(test_sample['Class'].value_counts().to_string())

# Metrics
# Hard labels come from applying the tuned threshold to the test-set
# probabilities; the ranking metrics (ROC-AUC, average precision) are
# computed from the raw probabilities and do not depend on the threshold.
y_pred = np.where(proba >= best_thresh, 1, 0)
cm = confusion_matrix(y_test, y_pred)
roc_auc = roc_auc_score(y_test, proba)
avg_precision = average_precision_score(y_test, proba)

print(f"\nROC-AUC: {roc_auc:.4f}")
print(f"Average Precision: {avg_precision:.4f}")
report = classification_report(y_test, y_pred, zero_division=0)
print(f"\nClassification Report:\n{report}")
print(f"Confusion Matrix:\n{cm}")
print("\nModel saved!")