"""Train a logistic-regression fraud classifier and save its artifacts.

Steps, in order:

1. Load ``data/creditcard.csv``; ``Class`` is the target and every column
   except ``Class``, ``Time`` and ``Amount`` is used as a feature.
2. Make a stratified 80/20 train/test split.
3. Pick the decision threshold that maximises F1 on *out-of-fold*
   training-set probabilities, so the test set is never used for tuning.
4. Fit the pipeline on the full training split and pickle it together with
   the chosen threshold under ``artifacts/``.
5. Write a small shuffled sample of real test rows (up to 10 per class) to
   ``artifacts/test_transactions.csv``.
6. Print held-out metrics at the chosen threshold.
"""
import os
import pickle
import random  # unused in this script; moved up from mid-file and retained

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    average_precision_score,
    classification_report,
    confusion_matrix,
    f1_score,
    roc_auc_score,
)
from sklearn.model_selection import (
    StratifiedKFold,
    cross_val_predict,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


def _best_f1_threshold(y_true, scores, candidates):
    """Return ``(threshold, f1)`` for the candidate with the highest F1.

    Args:
        y_true: Binary ground-truth labels.
        scores: Positive-class probabilities, positionally aligned with
            ``y_true``.
        candidates: Iterable of thresholds to try, in order.

    Returns:
        The winning threshold and its F1 score. Ties keep the earliest
        candidate; if no candidate reaches an F1 above zero the result is
        ``(0.5, 0.0)``.
    """
    best_t, best_score = 0.5, 0.0
    for t in candidates:
        score = f1_score(y_true, (scores >= t).astype(int), zero_division=0)
        if score > best_score:
            best_score, best_t = score, t
    return best_t, best_score


df = pd.read_csv('data/creditcard.csv')

x = df.drop(['Class', 'Time', 'Amount'], axis=1)
y = df['Class']

# Stratify so both splits keep the same positive rate as the full data.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(
        max_iter=1000,
        solver='lbfgs',
        class_weight='balanced',
        random_state=42,
    )),
])

# Find best threshold.
# Tuning on the test set and then reporting metrics on that same set would
# bias them upwards, so the search uses out-of-fold probabilities from the
# training split instead. cross_val_predict fits clones of the pipeline, so
# `pipeline` itself is still unfitted after this call.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_proba = cross_val_predict(
    pipeline, x_train, y_train, cv=cv, method='predict_proba'
)[:, 1]

thresholds = np.arange(0.05, 0.95, 0.01)
best_thresh, best_f1 = _best_f1_threshold(y_train, oof_proba, thresholds)

print(f"Best threshold: {best_thresh:.2f} with F1: {best_f1:.4f}")

# Final model: fitted on the whole training split.
pipeline.fit(x_train, y_train)
proba = pipeline.predict_proba(x_test)[:, 1]

os.makedirs('artifacts', exist_ok=True)

with open('artifacts/model.pkl', 'wb') as f:
    pickle.dump(pipeline, f)

with open('artifacts/threshold.pkl', 'wb') as f:
    # Store a plain Python float rather than a NumPy scalar.
    pickle.dump(float(best_thresh), f)

# ── Save test CSV from actual test set ──────────────────────────────
# Up to 10 real fraud rows + up to 10 real legit rows from x_test.
fraud_idx = y_test[y_test == 1].index[:10]
legit_idx = y_test[y_test == 0].index[:10]
sample_idx = fraud_idx.tolist() + legit_idx.tolist()

test_sample = x_test.loc[sample_idx].copy()
test_sample['Class'] = y_test.loc[sample_idx].values

# Shuffle so the two classes are interleaved in the written file.
test_sample = test_sample.sample(frac=1, random_state=42).reset_index(drop=True)
test_sample.to_csv('artifacts/test_transactions.csv', index=False)

print("Test CSV saved to artifacts/test_transactions.csv")
print(test_sample['Class'].value_counts().to_string())

# Metrics on the held-out test set, at the cross-validated threshold.
y_pred = (proba >= best_thresh).astype(int)
cm = confusion_matrix(y_test, y_pred)
roc_auc = roc_auc_score(y_test, proba)
avg_precision = average_precision_score(y_test, proba)

print(f"\nROC-AUC: {roc_auc:.4f}")
print(f"Average Precision: {avg_precision:.4f}")
print(f"\nClassification Report:\n{classification_report(y_test, y_pred, zero_division=0)}")
print(f"Confusion Matrix:\n{cm}")
print("\nModel saved!")