Spaces:

Moncey10
/

Creditcard_fraud_detection

Running

App Files Files Community

Creditcard_fraud_detection / preprocessing.py

Moncey10

Update preprocessing.py

f9e312c verified 14 days ago

raw

history blame contribute delete

2.52 kB

	import pickle
	import os
	import pandas as pd
	import numpy as np
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, average_precision_score, f1_score
	from sklearn.pipeline import Pipeline
	from sklearn.preprocessing import StandardScaler
	from sklearn.linear_model import LogisticRegression

	# ── Load data ───────────────────────────────────────────────────────
	df = pd.read_csv('data/creditcard.csv')

	# 'Class' is the label; 'Time' and 'Amount' are deliberately excluded from
	# the feature matrix, so the model sees only the remaining columns.
	x = df.drop(['Class', 'Time', 'Amount'], axis=1)
	y = df['Class']

	# ── Split ───────────────────────────────────────────────────────────
	# Stratify so train and test keep the same positive rate. The use of
	# class_weight='balanced' below suggests the positive class is rare
	# (NOTE(review): confirm against the data), in which case an unstratified
	# split can leave the test set with very few positives.
	x_train, x_test, y_train, y_test = train_test_split(
	    x, y, test_size=0.2, random_state=42, stratify=y
	)

	# Hold out part of the training data purely for threshold selection.
	# Tuning the threshold on the test set and then scoring on that same test
	# set would make the reported metrics optimistically biased.
	x_fit, x_val, y_fit, y_val = train_test_split(
	    x_train, y_train, test_size=0.25, random_state=42, stratify=y_train
	)

	# ── Model ───────────────────────────────────────────────────────────
	pipeline = Pipeline([
	    ('scaler', StandardScaler()),
	    ('model', LogisticRegression(
	        max_iter=1000,
	        solver='lbfgs',
	        class_weight='balanced',
	        random_state=42,
	    )),
	])

	# ── Find best threshold (on validation data only) ───────────────────
	pipeline.fit(x_fit, y_fit)
	val_proba = pipeline.predict_proba(x_val)[:, 1]

	# 0.05, 0.06, ..., 0.94. linspace avoids the float-step drift and the
	# length ambiguity that np.arange has with non-integer steps.
	thresholds = np.linspace(0.05, 0.94, 90)
	best_thresh = 0.5
	best_f1 = 0

	for t in thresholds:
	    preds = (val_proba >= t).astype(int)
	    f1 = f1_score(y_val, preds, zero_division=0)
	    # Strict '>' keeps the lowest threshold among ties.
	    if f1 > best_f1:
	        best_f1 = f1
	        best_thresh = t

	print(f"Best threshold: {best_thresh:.2f} with F1: {best_f1:.4f}")

	# ── Final fit ───────────────────────────────────────────────────────
	# Refit on the whole training split so the saved model uses all training
	# rows; the test set has still not influenced the model or the threshold.
	pipeline.fit(x_train, y_train)

	# Test-set fraud probabilities, consumed by the evaluation further down.
	proba = pipeline.predict_proba(x_test)[:, 1]

	# ── Persist artifacts ───────────────────────────────────────────────
	os.makedirs('artifacts', exist_ok=True)

	with open('artifacts/model.pkl', 'wb') as f:
	    pickle.dump(pipeline, f)

	# Stored as a plain Python float rather than a NumPy scalar.
	with open('artifacts/threshold.pkl', 'wb') as f:
	    pickle.dump(float(best_thresh), f)

	# ── Export a small labelled sample of the held-out rows ─────────────
	# Takes the first 10 rows labelled 1 and the first 10 labelled 0 (in
	# y_test order), shuffles them reproducibly and writes them to CSV with
	# the true label in a trailing 'Class' column.
	import random  # not referenced in the visible code; kept as-is

	labels = y_test.to_numpy()
	fraud_rows = y_test.index[labels == 1][:10].tolist()
	legit_rows = y_test.index[labels == 0][:10].tolist()
	chosen_rows = fraud_rows + legit_rows

	# .loc with a list yields a new frame; assign() appends the label column.
	test_sample = x_test.loc[chosen_rows].assign(
	    Class=y_test.loc[chosen_rows].to_numpy()
	)

	# frac=1 with a fixed seed is a deterministic full shuffle.
	test_sample = test_sample.sample(frac=1, random_state=42)
	test_sample = test_sample.reset_index(drop=True)

	test_sample.to_csv('artifacts/test_transactions.csv', index=False)
	print("Test CSV saved to artifacts/test_transactions.csv")
	print(test_sample['Class'].value_counts().to_string())

	# ── Evaluation on the held-out test set ─────────────────────────────
	# ROC-AUC and average precision are computed from the raw probabilities;
	# the confusion matrix and classification report use hard labels obtained
	# by cutting those probabilities at the selected threshold.
	roc_auc = roc_auc_score(y_test, proba)
	avg_precision = average_precision_score(y_test, proba)

	y_pred = (proba >= best_thresh).astype(int)
	cm = confusion_matrix(y_test, y_pred)
	report = classification_report(y_test, y_pred, zero_division=0)

	print(f"\nROC-AUC: {roc_auc:.4f}")
	print(f"Average Precision: {avg_precision:.4f}")
	print(f"\nClassification Report:\n{report}")
	print(f"Confusion Matrix:\n{cm}")
	print("\nModel saved!")