# File size: 2,523 Bytes
# (web-viewer residue removed: revision hashes 94c31e0 / f9e312c and a 1-77 line-number gutter)
import pickle
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
# Load the transactions table from the CSV under data/.
df = pd.read_csv('data/creditcard.csv')

# Features: every column except the label ('Class') and the 'Time' and
# 'Amount' columns, which this script excludes from the model inputs.
x = df.drop(['Class', 'Time', 'Amount'], axis=1)
y = df['Class']

# Hold out 20% of the rows for evaluation. stratify=y keeps the proportion of
# each class the same in both partitions; without it the number of positive
# rows landing in the test split is left to chance, which makes the reported
# metrics unstable when one class is rare (the model below is trained with
# class_weight='balanced', so imbalance is presumably expected here).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
# Two-step model: standardise the features, then fit a logistic regression
# whose class weights are balanced against the label frequencies.
feature_scaler = StandardScaler()
classifier = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    random_state=42,
    class_weight='balanced',
)
pipeline = Pipeline(steps=[
    ('scaler', feature_scaler),
    ('model', classifier),
])

# Train on the training partition only.
pipeline.fit(x_train, y_train)
# Find best threshold
#
# The threshold is chosen on out-of-fold predictions for the TRAINING data,
# not on the test set. Selecting it on y_test and then reporting metrics on
# the same y_test would make those metrics optimistically biased.
from sklearn.model_selection import cross_val_predict

# cross_val_predict fits clones of `pipeline` on each fold, so the pipeline
# fitted above is left untouched. Column 1 is the positive-class probability.
oof_proba = cross_val_predict(
    pipeline,
    x_train,
    y_train,
    cv=5,
    method='predict_proba',
)[:, 1]

# Candidate cut-offs from 0.05 up to (but not including) 0.95 in steps of 0.01.
thresholds = np.arange(0.05, 0.95, 0.01)
best_thresh = 0.5  # fallback used when no candidate reaches an F1 above 0
best_f1 = 0
for t in thresholds:
    preds = (oof_proba >= t).astype(int)
    f1 = f1_score(y_train, preds, zero_division=0)
    # Strict '>' keeps the lowest threshold among ties.
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = t
print(f"Best threshold: {best_thresh:.2f} with F1: {best_f1:.4f}")

# Positive-class probabilities on the held-out test set; used further down
# for the evaluation metrics.
proba = pipeline.predict_proba(x_test)[:, 1]
# Persist the fitted pipeline and the selected threshold as pickle files
# under artifacts/ (created if it does not exist yet).
os.makedirs('artifacts', exist_ok=True)
for artifact_path, artifact in (
    ('artifacts/model.pkl', pipeline),
    # Stored as a plain Python float rather than a NumPy scalar.
    ('artifacts/threshold.pkl', float(best_thresh)),
):
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)
# ── Save test CSV from actual test set ──────────────────────────────
# Build a small labelled sample from the held-out split: up to 10 rows with
# Class == 1 (fraud) followed by up to 10 rows with Class == 0 (legit), taken
# in the order they appear in y_test.
fraud_mask = (y_test == 1).to_numpy()
legit_mask = (y_test == 0).to_numpy()
fraud_idx = y_test.index[fraud_mask][:10]
legit_idx = y_test.index[legit_mask][:10]
sample_idx = list(fraud_idx) + list(legit_idx)

# Select the feature rows by index label and append the true label as the
# last column.
test_sample = x_test.loc[sample_idx].assign(
    Class=y_test.loc[sample_idx].to_numpy(),
)

# NOTE(review): `random` is not referenced by any visible line; the import is
# kept so the module namespace stays as it was.
import random

# Shuffle reproducibly so the two classes are interleaved, then renumber rows.
test_sample = test_sample.sample(frac=1, random_state=42).reset_index(drop=True)
test_sample.to_csv('artifacts/test_transactions.csv', index=False)
print("Test CSV saved to artifacts/test_transactions.csv")
print(test_sample['Class'].value_counts().to_string())
# Metrics
#
# Evaluate on the held-out test set. ROC-AUC and average precision are
# computed from the raw probabilities (threshold-independent); the
# classification report and confusion matrix use the hard predictions
# obtained by cutting `proba` at `best_thresh`.
y_pred = (proba >= best_thresh).astype(int)
cm = confusion_matrix(y_test, y_pred)
roc_auc = roc_auc_score(y_test, proba)
avg_precision = average_precision_score(y_test, proba)

print(f"\nROC-AUC: {roc_auc:.4f}")
print(f"Average Precision: {avg_precision:.4f}")
# zero_division=0 reports 0 instead of warning when a class gets no predictions.
print(f"\nClassification Report:\n{classification_report(y_test, y_pred, zero_division=0)}")
print(f"Confusion Matrix:\n{cm}")
# The stray trailing '|' that followed this call was a syntax error and has
# been removed.
print("\nModel saved!")