Spaces:

marianeft
/

Fraud_Detection

Sleeping

App Files Files Community

Fraud_Detection / app.py

marianeft

Update app.py

d99ae65 verified 10 months ago

raw

history blame contribute delete

5.96 kB

	import streamlit as st
	import numpy as np
	import pandas as pd
	import matplotlib.pyplot as plt
	import seaborn as sns
	from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, classification_report
	from sklearn.model_selection import train_test_split
	from sklearn.preprocessing import StandardScaler
	from sklearn.linear_model import LogisticRegression
	from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
	from sklearn.datasets import make_classification
	import joblib

	# Generate sample data
	def load_data(n_samples: int = 1000, n_features: int = 20, random_state: int = 42):
	    """Generate a synthetic binary-classification dataset.

	    The defaults reproduce the original hard-coded dataset (1000 samples,
	    20 features, seed 42), so existing ``load_data()`` callers are unaffected.

	    Args:
	        n_samples: Number of rows to generate.
	        n_features: Number of feature columns. Must be large enough for
	            ``make_classification``'s default informative/redundant feature
	            counts, otherwise that function raises ``ValueError``.
	        random_state: Seed forwarded to ``make_classification`` so the data
	            is identical on every call.

	    Returns:
	        The ``(X, y)`` pair produced by ``make_classification``: the feature
	        matrix and the class labels.
	    """
	    return make_classification(
	        n_samples=n_samples,
	        n_features=n_features,
	        random_state=random_state,
	    )

	# Train models
	def train_models(X_train, y_train, random_state=42):
	    """Fit the three candidate classifiers on the training data.

	    Args:
	        X_train: Training feature matrix.
	        y_train: Training labels.
	        random_state: Seed passed to every estimator. The ensemble models
	            are randomized, so without a fixed seed each call fits different
	            models and the reported metrics change from run to run. Pass
	            ``None`` to restore unseeded behaviour.

	    Returns:
	        Dict mapping the display name of each model to its fitted estimator,
	        in the order: Logistic Regression, Random Forest, Gradient Boosting.
	    """
	    models = {
	        'Logistic Regression': LogisticRegression(random_state=random_state),
	        'Random Forest': RandomForestClassifier(random_state=random_state),
	        'Gradient Boosting': GradientBoostingClassifier(random_state=random_state),
	    }

	    # fit() trains each estimator in place, so the same dict can be returned
	    for model in models.values():
	        model.fit(X_train, y_train)
	    return models

	# Predict and evaluate
	def evaluate_models(models, X_test, y_test):
	    """Score every fitted classifier in ``models`` on the test data.

	    Args:
	        models: Dict mapping a display name to a fitted classifier that
	            provides ``predict``, ``predict_proba`` and ``score``.
	        X_test: Test feature matrix.
	        y_test: True labels for ``X_test``.

	    Returns:
	        Dict keyed by model name. Each value is a dict with the keys
	        ``'Accuracy'``, ``'ROC AUC'``, ``'Confusion Matrix'`` and
	        ``'Classification Report'``.
	    """
	    def _summarize(clf):
	        # Hard labels feed the confusion matrix and the report; the
	        # second predict_proba column is used as the score for ROC AUC.
	        predicted_labels = clf.predict(X_test)
	        positive_scores = clf.predict_proba(X_test)[:, 1]
	        return {
	            'Accuracy': clf.score(X_test, y_test),
	            'ROC AUC': roc_auc_score(y_test, positive_scores),
	            'Confusion Matrix': confusion_matrix(y_test, predicted_labels),
	            'Classification Report': classification_report(y_test, predicted_labels),
	        }

	    return {label: _summarize(clf) for label, clf in models.items()}

	# Streamlit app: private rendering helpers followed by the main() entry point
	def _show_metrics(metrics):
	    """Write the accuracy, ROC AUC, confusion matrix and report of one model."""
	    st.subheader("Metrics")
	    st.write(f"Accuracy: {metrics['Accuracy']:.4f}")
	    st.write(f"ROC AUC: {metrics['ROC AUC']:.4f}")

	    st.write("Confusion Matrix:")
	    st.write(metrics['Confusion Matrix'])

	    st.write("Classification Report:")
	    # The report is a pre-formatted multi-line string, so show it as plain text
	    st.text(metrics['Classification Report'])


	def _show_roc_curve(model, model_name, X_test, y_test, roc_auc):
	    """Plot the ROC curve of ``model`` on the test data."""
	    st.subheader("ROC Curve")
	    y_prob = model.predict_proba(X_test)[:, 1]
	    fpr, tpr, _ = roc_curve(y_test, y_prob)

	    # Draw on an explicit Figure/Axes instead of the global pyplot state
	    fig, ax = plt.subplots(figsize=(10, 7))
	    ax.plot(fpr, tpr, label=f'{model_name} (AUC = {roc_auc:.2f})')
	    ax.plot([0, 1], [0, 1], 'k--')  # Diagonal line for random guessing
	    ax.set_xlabel('False Positive Rate')
	    ax.set_ylabel('True Positive Rate')
	    ax.set_title('Receiver Operating Characteristic (ROC) Curve')
	    ax.legend(loc='lower right')
	    st.pyplot(fig)
	    # pyplot keeps every figure it creates until it is closed; close it so
	    # figures do not pile up each time the app is rendered
	    plt.close(fig)


	def _show_feature_importance(model, model_name):
	    """Plot feature importances when the model provides them."""
	    st.subheader("Feature Importance")
	    importances = getattr(model, 'feature_importances_', None)
	    if importances is None:
	        # e.g. Logistic Regression: say so instead of leaving the section empty
	        st.info(f"Feature importance is not available for {model_name}.")
	        return

	    importance_df = pd.DataFrame({
	        'Feature': [f'Feature {i}' for i in range(len(importances))],
	        'Importance': importances,
	    }).sort_values(by='Importance', ascending=False)

	    fig, ax = plt.subplots(figsize=(10, 7))
	    sns.barplot(x='Importance', y='Feature', data=importance_df, ax=ax)
	    ax.set_title(f'Feature Importance - {model_name}')
	    st.pyplot(fig)
	    plt.close(fig)  # same cleanup as for the ROC figure


	def _show_prediction_form(model, scaler, n_features):
	    """Read one comma-separated feature vector and display the prediction."""
	    st.subheader("Make Predictions")
	    input_data = st.text_input("Enter features separated by commas (e.g., 0.1, 0.2, ..., 0.5)")
	    if not input_data:
	        return

	    # Only the parsing step is expected to fail on malformed text
	    try:
	        values = [float(token) for token in input_data.split(',')]
	    except ValueError:
	        st.error("Please enter valid numerical values separated by commas.")
	        return

	    if len(values) != n_features:
	        st.error(f"Number of features should be {n_features}.")
	        return

	    input_features = np.array(values).reshape(1, -1)
	    # float() accepts "nan" and "inf"; reject them here rather than feeding
	    # non-finite values to the scaler and the model
	    if not np.isfinite(input_features).all():
	        st.error("Please enter valid numerical values separated by commas.")
	        return

	    try:
	        # Apply the scaler that was fitted on the training data
	        input_features_scaled = scaler.transform(input_features)
	        prediction = model.predict(input_features_scaled)
	        prediction_proba = model.predict_proba(input_features_scaled)[:, 1]
	    except Exception as e:
	        # UI boundary: report the failure in the page instead of crashing
	        st.error(f"An error occurred: {e}")
	        return

	    st.write(f"Prediction: {'Positive' if prediction[0] == 1 else 'Negative'}")
	    st.write(f"Probability of Positive: {prediction_proba[0]:.4f}")


	def main():
	    """Render the Streamlit page: model selection, metrics, plots, prediction.

	    Each call regenerates the data, splits it 80/20, standardizes the
	    features with a scaler fitted on the training split, and refits all
	    models before rendering the page for the model chosen in the sidebar.
	    """
	    st.title("Model Performance and Predictions")

	    # Load and split data into training and test sets
	    X, y = load_data()
	    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
	    scaler = StandardScaler()
	    X_train_scaled = scaler.fit_transform(X_train)  # Fit on training data only
	    X_test_scaled = scaler.transform(X_test)  # Reuse the training statistics

	    # Train models using scaled training data
	    models = train_models(X_train_scaled, y_train)

	    # Sidebar for model selection
	    st.sidebar.header("Model Selection")
	    selected_model_name = st.sidebar.selectbox("Select Model", list(models))
	    selected_model = models[selected_model_name]

	    # Only the selected model is displayed, so only that one is evaluated
	    results = evaluate_models({selected_model_name: selected_model}, X_test_scaled, y_test)
	    metrics = results[selected_model_name]

	    st.header(f"Model: {selected_model_name}")
	    _show_metrics(metrics)
	    _show_roc_curve(selected_model, selected_model_name, X_test_scaled, y_test, metrics['ROC AUC'])
	    _show_feature_importance(selected_model, selected_model_name)
	    _show_prediction_form(selected_model, scaler, X_train_scaled.shape[1])

	# Script entry point: build the page only when this file is run directly
	if __name__ == "__main__":
	    main()