""" Financial Fraud Detection System - TechMatrix Solvers Team Members: - Abhay Gupta - Jay Kumar - Kripanshu Gupta - Bhumika Patel A comprehensive fraud detection system using machine learning algorithms. """ import streamlit as st import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns import plotly.express as px import plotly.graph_objects as go import os import pickle import time import warnings from sklearn.preprocessing import StandardScaler from sklearn.model_selection import train_test_split from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier from xgboost import XGBClassifier from sklearn.metrics import ( accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report, roc_curve ) from imblearn.over_sampling import SMOTE # Suppress warnings warnings.filterwarnings('ignore') # Set page configuration st.set_page_config( page_title="TechMatrix Fraud Detection System", page_icon="🔒", layout="wide", initial_sidebar_state="collapsed" ) # Custom CSS for better styling st.markdown(""" """, unsafe_allow_html=True) # Create necessary directories os.makedirs("data", exist_ok=True) os.makedirs("models", exist_ok=True) # Initialize session state if 'current_page' not in st.session_state: st.session_state['current_page'] = 'home' if 'data' not in st.session_state: st.session_state['data'] = None if 'preprocessed_data' not in st.session_state: st.session_state['preprocessed_data'] = None if 'engineered_data' not in st.session_state: st.session_state['engineered_data'] = None if 'target_col' not in st.session_state: st.session_state['target_col'] = 'Class' if 'trained_models' not in st.session_state: st.session_state['trained_models'] = {} if 'predictions' not in st.session_state: st.session_state['predictions'] = None if 'progress' not in st.session_state: st.session_state['progress'] = 0 # Main title st.markdown("
# Main title
st.markdown(
    "<h1 style='text-align: center;'>TechMatrix Fraud Detection System</h1>",
    unsafe_allow_html=True
)

# Team information
st.markdown(
    """
    <div style='text-align: center;'>
        <b>Team TechMatrix Solvers</b><br>
        Abhay Gupta | Jay Kumar | Kripanshu Gupta | Bhumika Patel
    </div>
    """,
    unsafe_allow_html=True
)

# Home Page
if st.session_state['current_page'] == 'home':
    # Introduction section
    st.markdown(
        "<h2>Welcome to TechMatrix Fraud Detection System</h2>",
        unsafe_allow_html=True
    )
", unsafe_allow_html=True) col1, col2 = st.columns([2, 1]) with col1: st.markdown(""" Our advanced fraud detection system leverages cutting-edge machine learning algorithms to identify and prevent fraudulent transactions in real-time. ### Understanding Financial Fraud Financial fraud encompasses various deceptive practices aimed at unauthorized acquisition of funds or assets. Our system specifically addresses: - Credit card transaction fraud - Identity theft incidents - Account compromise attempts - Suspicious transaction patterns ### Machine Learning Implementation Our system employs sophisticated machine learning models that analyze transaction patterns and behavioral data. The models are trained on historical fraud data and continuously updated to adapt to emerging fraud patterns. ### System Advantages: - **Real-time Monitoring**: Instant detection of suspicious activities - **Scalable Processing**: Efficient handling of large transaction volumes - **Pattern Recognition**: Advanced detection of complex fraud patterns - **Risk Assessment**: Probability-based fraud scoring system """) with col2: # Create a unique visualization of the fraud detection process fig = go.Figure() # Create a hexagonal flow diagram angles = np.linspace(0, 2*np.pi, 6, endpoint=False) x = 0.5 + 0.4 * np.cos(angles) y = 0.5 + 0.4 * np.sin(angles) # Add connecting lines with gradient effect for i in range(len(angles)): next_i = (i + 1) % len(angles) fig.add_trace(go.Scatter( x=[x[i], x[next_i]], y=[y[i], y[next_i]], mode='lines', line=dict( color='rgba(46, 125, 50, 0.5)', width=2, dash='dot' ), showlegend=False )) # Add nodes with updated colors and labels node_labels = ['Input Data', 'Validation', 'Processing', 'Analysis', 'Detection', 'Action'] node_colors = ['#2E7D32', '#43A047', '#81C784', '#1976D2', '#64B5F6', '#D32F2F'] for i in range(len(angles)): fig.add_trace(go.Scatter( x=[x[i]], y=[y[i]], mode='markers+text', marker=dict( size=30, color=node_colors[i], symbol='hexagon' ), text=node_labels[i], textposition="middle center", textfont=dict(color='white', size=12), showlegend=False )) # Add title in the center with updated styling fig.add_trace(go.Scatter( x=[0.5], y=[0.5], mode='text', text='Fraud
Detection
Pipeline', textposition="middle center", textfont=dict( color='#212121', size=14, family='Arial, bold' ), showlegend=False )) fig.update_layout( height=400, width=400, margin=dict(l=0, r=0, t=0, b=0), xaxis=dict( showgrid=False, zeroline=False, showticklabels=False, range=[0, 1] ), yaxis=dict( showgrid=False, zeroline=False, showticklabels=False, range=[0, 1] ), plot_bgcolor='rgba(0,0,0,0)' ) st.plotly_chart(fig) # Workflow section st.markdown("

System Workflow

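    # The step icons below are fetched from an external CDN (flaticon.com) at
    # render time, so they appear only when the app has internet access.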
", unsafe_allow_html=True) col1, col2, col3, col4 = st.columns(4) with col1: st.markdown("### 1. Data Ingestion") st.markdown("Secure upload and validation of transaction data in CSV format.") st.image("https://cdn-icons-png.flaticon.com/512/4208/4208479.png", width=100) with col2: st.markdown("### 2. Data Processing") st.markdown("Advanced data cleaning and preparation for analysis.") st.image("https://cdn-icons-png.flaticon.com/512/1875/1875627.png", width=100) with col3: st.markdown("### 3. Feature Extraction") st.markdown("Intelligent feature engineering and pattern recognition.") st.image("https://cdn-icons-png.flaticon.com/512/2103/2103633.png", width=100) with col4: st.markdown("### 4. Model Deployment") st.markdown("Real-time fraud detection and risk assessment.") st.image("https://cdn-icons-png.flaticon.com/512/2103/2103658.png", width=100) # Sample visualizations section st.markdown("

System Analytics

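    # The two charts below are illustrative mock-ups built from hard-coded
    # values (hence the fixed AUC labels). Real curves for the user's own data
    # are produced later, on the Model Training page.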
", unsafe_allow_html=True) col1, col2 = st.columns(2) with col1: # Sample ROC curve with improved styling fig = go.Figure() fpr = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] tpr_lr = [0, 0.4, 0.55, 0.68, 0.75, 0.8, 0.85, 0.9, 0.94, 0.98, 1.0] tpr_rf = [0, 0.5, 0.65, 0.78, 0.85, 0.88, 0.91, 0.95, 0.97, 0.99, 1.0] tpr_xgb = [0, 0.55, 0.7, 0.8, 0.87, 0.9, 0.93, 0.96, 0.98, 0.99, 1.0] fig.add_trace(go.Scatter( x=fpr, y=tpr_lr, mode='lines', name='Logistic Regression (AUC = 0.85)', line=dict(color='#2E7D32', width=3) )) fig.add_trace(go.Scatter( x=fpr, y=tpr_rf, mode='lines', name='Random Forest (AUC = 0.92)', line=dict(color='#1976D2', width=3) )) fig.add_trace(go.Scatter( x=fpr, y=tpr_xgb, mode='lines', name='XGBoost (AUC = 0.94)', line=dict(color='#D32F2F', width=3) )) fig.add_trace(go.Scatter( x=[0, 1], y=[0, 1], mode='lines', name='Random', line=dict(dash='dash', color='#757575', width=2) )) fig.update_layout( title='Model Performance Comparison', xaxis_title='False Positive Rate', yaxis_title='True Positive Rate', legend=dict(x=0.01, y=0.99), width=600, height=400, template='plotly_white', margin=dict(l=40, r=40, t=40, b=40) ) st.plotly_chart(fig) with col2: # Sample feature importance with improved styling features = ['Transaction Amount', 'Time of Day', 'Merchant Category', 'Location', 'Transaction Frequency', 'Device Used', 'IP Address', 'Account Age', 'Previous Fraud Flag', 'Transaction Type'] importance = [0.23, 0.18, 0.15, 0.12, 0.09, 0.08, 0.06, 0.04, 0.03, 0.02] fig = px.bar( x=importance, y=features, orientation='h', title='Feature Importance Analysis', labels={'x': 'Importance Score', 'y': 'Feature'}, color=importance, color_continuous_scale=['#2E7D32', '#43A047', '#81C784'] ) fig.update_layout( width=600, height=400, template='plotly_white', margin=dict(l=40, r=40, t=40, b=40) ) st.plotly_chart(fig) # Get started button st.markdown("
", unsafe_allow_html=True) if st.button("Get Started", key="get_started", help="Begin the fraud detection process"): st.session_state['current_page'] = 'upload' st.rerun() st.markdown("
", unsafe_allow_html=True) # Data Upload Page elif st.session_state['current_page'] == 'upload': st.markdown("

Step 1: Data Ingestion

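    # Streamlit's file_uploader is itself capped by the server.maxUploadSize
    # config option (200 MB by default); the explicit size check below also
    # covers deployments where that limit has been raised.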
", unsafe_allow_html=True) # File uploader with size limit warning st.markdown(""" ### Secure Data Upload Upload your transaction data securely in CSV format. The system supports the following: - Transaction details (amount, timestamp, location, etc.) - Target column for fraud classification (default: 'Class' with 0 for normal, 1 for fraud) - **Maximum file size: 200 MB** For testing purposes, you can use the [Credit Card Fraud Detection dataset](https://www.kaggle.com/mlg-ulb/creditcardfraud) from Kaggle. ### Data Requirements: - CSV format with UTF-8 encoding - No missing values in critical fields - Proper date/time formatting - Numeric values for transaction amounts """) uploaded_file = st.file_uploader( "Upload transaction data (CSV file)", type="csv", help="Maximum file size: 200 MB" ) if uploaded_file is not None: # Check file size (200 MB limit) file_details = {"FileName": uploaded_file.name, "FileType": uploaded_file.type} # Read the file into a buffer to check its size file_buffer = uploaded_file.getvalue() file_size_mb = len(file_buffer) / (1024 * 1024) if file_size_mb > 200: st.error(f"File size exceeds the 200 MB limit. Your file is {file_size_mb:.2f} MB. Please upload a smaller file.") st.stop() else: st.info(f"File size: {file_size_mb:.2f} MB") # Load data with progress bar progress_bar = st.progress(0) status_text = st.empty() status_text.text("Initializing data ingestion...") progress_bar.progress(25) time.sleep(0.3) try: # Use BytesIO to avoid loading the file twice from io import BytesIO df = pd.read_csv(BytesIO(file_buffer)) st.session_state['data'] = df progress_bar.progress(50) status_text.text("Validating data structure...") time.sleep(0.3) progress_bar.progress(75) status_text.text("Preparing data preview...") time.sleep(0.3) progress_bar.progress(100) status_text.text("Data ingestion completed!") time.sleep(0.3) status_text.empty() progress_bar.empty() # Show basic data info st.success(f"Data ingested successfully! Shape: {df.shape[0]} rows and {df.shape[1]} columns") col1, col2 = st.columns(2) with col1: st.subheader("Data Preview") st.dataframe(df.head()) with col2: st.subheader("Data Structure") # Display data types and missing values data_info = pd.DataFrame({ 'Data Type': df.dtypes, 'Non-Null Count': df.count(), 'Missing Values': df.isnull().sum(), 'Unique Values': [df[col].nunique() for col in df.columns] }) st.dataframe(data_info) # Check for target column if 'Class' in df.columns: fraud_count = df['Class'].sum() total_count = len(df) fraud_percentage = (fraud_count / total_count) * 100 st.info(f"Target column 'Class' detected with {fraud_count} fraud cases ({fraud_percentage:.2f}% of data)") else: st.warning("No 'Class' column detected. You'll need to specify the target column in the next step.") except Exception as e: st.error(f"Error during data ingestion: {str(e)}") st.info("Please ensure the file is a valid CSV with proper formatting.") # Navigation buttons col1, col2 = st.columns([1, 5]) with col1: if st.button("← Back to Home", key="back_to_home"): st.session_state['current_page'] = 'home' st.rerun() with col2: if st.session_state['data'] is not None: if st.button("Continue to Data Processing →", key="to_preprocess"): st.session_state['current_page'] = 'preprocess' st.rerun() # Data Preprocessing Page elif st.session_state['current_page'] == 'preprocess': st.markdown("

Step 2: Data Processing

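    # Imputation note: mean-filling (used below) is simple but sensitive to the
    # heavy right skew typical of transaction amounts; median imputation is a
    # common, more robust alternative if that matters for your data.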
", unsafe_allow_html=True) if st.session_state['data'] is None: st.error("No data found. Please upload data first.") if st.button("Go back to Data Ingestion"): st.session_state['current_page'] = 'upload' st.rerun() else: df = st.session_state['data'] st.markdown(""" ### Advanced Data Processing Enhance your data quality through our comprehensive processing pipeline. The system will: - Handle missing values intelligently - Remove statistical outliers - Normalize numerical features - Balance class distribution Select the processing options below to customize the pipeline. """) # Target column selection if 'Class' in df.columns: target_col = 'Class' st.info(f"Target column 'Class' detected with values: {df[target_col].unique()}") else: target_col = st.selectbox("Select the target column (fraud indicator)", df.columns) st.session_state['target_col'] = target_col # Preprocessing options st.subheader("Processing Options") col1, col2 = st.columns(2) with col1: handle_missing = st.checkbox("Handle Missing Values", value=True, help="Fill missing numerical values with mean and categorical values with mode") remove_outliers = st.checkbox("Remove Outliers", value=False, help="Remove extreme values that might affect model performance") with col2: normalize_data = st.checkbox("Normalize Data", value=True, help="Scale numerical features to have zero mean and unit variance") balance_classes = st.checkbox("Balance Classes", value=True, help="Handle class imbalance using SMOTE in the training phase") # Handle missing values if st.button("Process Data"): with st.spinner("Processing data..."): # Create a copy of the dataframe df_processed = df.copy() # Progress bar progress_bar = st.progress(0) status_text = st.empty() # Handle missing values if handle_missing: status_text.text("Processing missing values...") progress_bar.progress(25) time.sleep(0.3) for col in df_processed.columns: if df_processed[col].dtype in ['int64', 'float64']: df_processed[col] = df_processed[col].fillna(df_processed[col].mean()) else: df_processed[col] = df_processed[col].fillna(df_processed[col].mode()[0]) # Remove outliers if selected if remove_outliers: status_text.text("Processing outliers...") progress_bar.progress(50) time.sleep(0.3) # Only apply to numerical columns num_cols = df_processed.select_dtypes(include=['int64', 'float64']).columns for col in num_cols: if col != target_col: # Don't remove outliers from target column Q1 = df_processed[col].quantile(0.25) Q3 = df_processed[col].quantile(0.75) IQR = Q3 - Q1 lower_bound = Q1 - 3 * IQR upper_bound = Q3 + 3 * IQR df_processed = df_processed[(df_processed[col] >= lower_bound) & (df_processed[col] <= upper_bound)] # Store the processed data status_text.text("Finalizing data processing...") progress_bar.progress(100) time.sleep(0.3) st.session_state['preprocessed_data'] = df_processed status_text.empty() progress_bar.empty() st.success("Data processing completed!") # Show class distribution if target_col in df_processed.columns: st.subheader("Class Distribution After Processing") col1, col2 = st.columns(2) with col1: # Create pie chart with improved styling labels = ['Normal', 'Fraud'] values = [len(df_processed[df_processed[target_col] == 0]), len(df_processed[df_processed[target_col] == 1])] fig = px.pie( values=values, names=labels, title='Transaction Distribution', color_discrete_sequence=['#2E7D32', '#D32F2F'], hole=0.4 ) fig.update_traces(textposition='inside', textinfo='percent+label') fig.update_layout( template='plotly_white', margin=dict(l=20, r=20, t=30, b=20) ) 
st.plotly_chart(fig) with col2: # Calculate statistics fraud_count = df_processed[target_col].sum() total_count = len(df_processed) fraud_percentage = (fraud_count / total_count) * 100 st.metric("Total Transactions", f"{total_count:,}") st.metric("Fraud Transactions", f"{fraud_count:,}") st.metric("Fraud Percentage", f"{fraud_percentage:.2f}%") if fraud_percentage < 1: st.warning("Your dataset is highly imbalanced. Class balancing will be applied during model training.") # Navigation buttons col1, col2 = st.columns([1, 5]) with col1: if st.button("← Back to Upload", key="back_to_upload"): st.session_state['current_page'] = 'upload' st.rerun() with col2: if st.session_state['preprocessed_data'] is not None: if st.button("Continue to Feature Extraction →", key="to_feature_eng"): st.session_state['current_page'] = 'feature_engineering' st.rerun() # Feature Engineering Page elif st.session_state['current_page'] == 'feature_engineering': st.markdown("

Step 3: Feature Extraction

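    # Dataset note: in the Kaggle credit-card dataset suggested on the upload
    # page, 'Time' holds seconds elapsed since the first transaction, which is
    # why hour-of-day below is derived as (Time / 3600) % 24.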
", unsafe_allow_html=True) if st.session_state['preprocessed_data'] is None: st.error("No processed data found. Please complete data processing first.") if st.button("Go back to Data Processing"): st.session_state['current_page'] = 'preprocess' st.rerun() else: df_processed = st.session_state['preprocessed_data'] target_col = st.session_state['target_col'] st.markdown(""" ### Intelligent Feature Extraction Enhance your fraud detection capabilities through advanced feature engineering. Our system provides: - Time-based pattern analysis - Transaction amount profiling - Behavioral feature extraction - Cross-feature interaction analysis Select the features to extract below to optimize your model's performance. """) # Feature engineering options st.subheader("Feature Extraction Options") col1, col2 = st.columns(2) with col1: create_time_features = st.checkbox("Time-based Features", value=True, help="Extract temporal patterns and behavioral indicators") create_amount_features = st.checkbox("Amount-based Features", value=True, help="Generate transaction amount profiles and risk indicators") with col2: create_aggregations = st.checkbox("Aggregation Features", value=False, help="Create aggregated metrics for transaction patterns") create_interactions = st.checkbox("Interaction Features", value=False, help="Generate cross-feature interactions for complex pattern detection") # Apply feature engineering if st.button("Extract Features"): with st.spinner("Extracting features..."): # Create a copy of the dataframe df_engineered = df_processed.copy() # Progress bar progress_bar = st.progress(0) status_text = st.empty() # Time-based features if create_time_features and 'Time' in df_engineered.columns: status_text.text("Extracting temporal features...") progress_bar.progress(25) time.sleep(0.3) # Hour of day df_engineered['Hour'] = (df_engineered['Time'] / 3600) % 24 # Flag for transactions during odd hours (midnight to 5 AM) df_engineered['Odd_Hour'] = ((df_engineered['Hour'] >= 0) & (df_engineered['Hour'] < 5)).astype(int) # Part of day df_engineered['Part_of_Day'] = pd.cut( df_engineered['Hour'], bins=[0, 6, 12, 18, 24], labels=['Night', 'Morning', 'Afternoon', 'Evening'] ) # Amount-based features if create_amount_features and 'Amount' in df_engineered.columns: status_text.text("Extracting amount-based features...") progress_bar.progress(50) time.sleep(0.3) # Log transform for amount (to handle skewed distribution) df_engineered['Log_Amount'] = np.log1p(df_engineered['Amount']) # Flag for high-value transactions (top 5%) threshold = df_engineered['Amount'].quantile(0.95) df_engineered['High_Value'] = (df_engineered['Amount'] > threshold).astype(int) # Amount bins df_engineered['Amount_Bin'] = pd.qcut( df_engineered['Amount'], q=5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'] ) # Aggregation features if create_aggregations: status_text.text("Generating aggregation features...") progress_bar.progress(75) time.sleep(0.3) # Check if there's a card ID or similar column potential_id_cols = [col for col in df_engineered.columns if 'id' in col.lower() or 'card' in col.lower()] if potential_id_cols: id_col = potential_id_cols[0] # Number of transactions per card tx_count = df_engineered.groupby(id_col).size().reset_index(name='Tx_Count') df_engineered = df_engineered.merge(tx_count, on=id_col, how='left') # Average transaction amount per card if 'Amount' in df_engineered.columns: avg_amount = df_engineered.groupby(id_col)['Amount'].mean().reset_index(name='Avg_Amount') df_engineered = 
df_engineered.merge(avg_amount, on=id_col, how='left') # Transaction amount deviation from average df_engineered['Amount_Deviation'] = df_engineered['Amount'] - df_engineered['Avg_Amount'] # Interaction features if create_interactions: status_text.text("Generating interaction features...") progress_bar.progress(90) time.sleep(0.3) # Only create interactions between numerical features num_cols = df_engineered.select_dtypes(include=['int64', 'float64']).columns num_cols = [col for col in num_cols if col != target_col and 'id' not in col.lower()] # Limit to a few important features to avoid explosion of features if len(num_cols) > 3: num_cols = num_cols[:3] # Create interactions for i in range(len(num_cols)): for j in range(i+1, len(num_cols)): col1_name = num_cols[i] col2_name = num_cols[j] df_engineered[f'{col1_name}_x_{col2_name}'] = df_engineered[col1_name] * df_engineered[col2_name] # Convert categorical columns to one-hot encoding cat_cols = df_engineered.select_dtypes(include=['object', 'category']).columns for col in cat_cols: dummies = pd.get_dummies(df_engineered[col], prefix=col, drop_first=True) df_engineered = pd.concat([df_engineered, dummies], axis=1) df_engineered.drop(columns=[col], inplace=True) # Store the engineered data status_text.text("Finalizing feature extraction...") progress_bar.progress(100) time.sleep(0.3) st.session_state['engineered_data'] = df_engineered status_text.empty() progress_bar.empty() st.success("Feature extraction completed!") # Show correlation with target if target_col in df_engineered.columns: st.subheader("Feature Correlation Analysis") # Get correlation with target corr_with_target = df_engineered.corr()[target_col].sort_values(ascending=False) # Remove target's correlation with itself corr_with_target = corr_with_target.drop(target_col) # Get top 10 positive and negative correlations top_pos = corr_with_target.head(10) top_neg = corr_with_target.tail(10).iloc[::-1] # Reverse to show strongest negative first col1, col2 = st.columns(2) with col1: # Plot top positive correlations with improved styling fig = px.bar( x=top_pos.values, y=top_pos.index, orientation='h', title='Top Positive Correlations with Fraud', labels={'x': 'Correlation', 'y': 'Feature'}, color=top_pos.values, color_continuous_scale=['#2E7D32', '#43A047', '#81C784'] ) fig.update_layout( height=400, template='plotly_white', margin=dict(l=20, r=20, t=40, b=20) ) st.plotly_chart(fig) with col2: # Plot top negative correlations with improved styling fig = px.bar( x=top_neg.values, y=top_neg.index, orientation='h', title='Top Negative Correlations with Fraud', labels={'x': 'Correlation', 'y': 'Feature'}, color=top_neg.values, color_continuous_scale=['#81C784', '#43A047', '#2E7D32'] ) fig.update_layout( height=400, template='plotly_white', margin=dict(l=20, r=20, t=40, b=20) ) st.plotly_chart(fig) # Correlation heatmap st.subheader("Feature Correlation Matrix") # Get top correlated features corr_matrix = df_engineered.corr() top_corr_features = corr_with_target.abs().sort_values(ascending=False).head(15).index # Create heatmap with selected features top_corr_matrix = corr_matrix.loc[top_corr_features, top_corr_features] fig = px.imshow( top_corr_matrix, text_auto='.2f', color_continuous_scale=['#2E7D32', 'white', '#1976D2'], title='Feature Correlation Matrix' ) fig.update_layout( height=600, width=800, template='plotly_white', margin=dict(l=20, r=20, t=40, b=20) ) st.plotly_chart(fig) # Feature distributions st.subheader("Feature Distribution Analysis") # Select a feature to visualize 
numeric_cols = df_engineered.select_dtypes(include=['int64', 'float64']).columns numeric_cols = [col for col in numeric_cols if col != target_col] selected_feature = st.selectbox("Select feature to analyze", numeric_cols) # Create distribution plot with improved styling fig = px.histogram( df_engineered, x=selected_feature, color=target_col, marginal="box", opacity=0.7, barmode="overlay", color_discrete_map={0: "#2E7D32", 1: "#D32F2F"}, labels={target_col: "Class", "0": "Normal", "1": "Fraud"} ) fig.update_layout( title=f"Distribution Analysis of {selected_feature}", template='plotly_white', margin=dict(l=20, r=20, t=40, b=20) ) st.plotly_chart(fig) # Navigation buttons col1, col2 = st.columns([1, 5]) with col1: if st.button("← Back to Processing", key="back_to_preprocess"): st.session_state['current_page'] = 'preprocess' st.rerun() with col2: if st.session_state['engineered_data'] is not None: if st.button("Continue to Model Training →", key="to_model_training"): st.session_state['current_page'] = 'model_training' st.rerun() # Model Training Page elif st.session_state['current_page'] == 'model_training': st.markdown("

Step 4: Model Training

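    # Pipeline-order note for the training step below: the data is sampled,
    # split, scaled, and only then oversampled. SMOTE is fit on the training
    # split alone; resampling before the split would leak synthetic copies of
    # test-set fraud cases into training and inflate the reported metrics.
    # Sketch of the intended order (names as used below):
    #
    #   X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)
    #   X_train_scaled = scaler.fit_transform(X_train)  # fit on train only
    #   X_test_scaled = scaler.transform(X_test)
    #   X_res, y_res = SMOTE(random_state=42).fit_resample(X_train_scaled, y_train)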
", unsafe_allow_html=True) if st.session_state['engineered_data'] is None: st.error("No engineered data found. Please complete feature extraction first.") if st.button("Go back to Feature Extraction"): st.session_state['current_page'] = 'feature_engineering' st.rerun() else: df_engineered = st.session_state['engineered_data'] target_col = st.session_state['target_col'] st.markdown(""" ### Advanced Model Training Train sophisticated machine learning models for fraud detection. Our system provides: - Multiple model architectures - Automated hyperparameter optimization - Cross-validation for robust evaluation - Performance metrics visualization Select your preferred models and training parameters below. """) # Training options st.subheader("Training Configuration") col1, col2 = st.columns(2) with col1: # Data sampling for faster training - default to a smaller sample for speed use_sample = st.checkbox("Use Data Sample for Faster Training", value=True, help="Use a sample of the data to speed up training (recommended for large datasets)") if use_sample: sample_size = st.slider("Sample Size (%)", min_value=10, max_value=100, value=20, help="Percentage of data to use for training") # Test size test_size = st.slider("Test Set Size (%)", min_value=10, max_value=50, value=20, help="Percentage of data to use for testing") # Class balancing use_smote = st.checkbox("Apply SMOTE for Class Balancing", value=True, help="Use SMOTE to handle class imbalance") with col2: # Model selection st.write("Select Models to Train:") train_lr = st.checkbox("Logistic Regression", value=True) train_rf = st.checkbox("Random Forest", value=True) train_xgb = st.checkbox("XGBoost", value=True) # Advanced options - reduced default values for faster training show_advanced = st.checkbox("Show Advanced Options", value=False) if show_advanced: # Number of estimators for tree models - reduced for speed n_estimators = st.slider("Number of Estimators", min_value=10, max_value=200, value=50, help="Number of trees for Random Forest and XGBoost (higher = more accurate but slower)") # Max depth for tree models max_depth = st.slider("Max Tree Depth", min_value=3, max_value=15, value=6, help="Maximum depth of trees (higher = more complex model)") # Start training if st.button("Train Models"): with st.spinner("Training models..."): status_container = st.empty() status_container.markdown( '
Training in progress... This may take a few minutes.
', unsafe_allow_html=True ) # Prepare data for training X = df_engineered.drop(columns=[target_col]) y = df_engineered[target_col] # Use sample if selected if use_sample and sample_size < 100: sample_frac = sample_size / 100 # Stratified sampling to maintain class distribution X_sample = pd.DataFrame() y_sample = pd.Series() for class_value in y.unique(): X_class = X[y == class_value] y_class = y[y == class_value] n_samples = int(len(X_class) * sample_frac) indices = np.random.choice(X_class.index, size=n_samples, replace=False) X_sample = pd.concat([X_sample, X_class.loc[indices]]) y_sample = pd.concat([y_sample, y_class.loc[indices]]) X = X_sample y = y_sample # Progress bar progress_bar = st.progress(0) status_text = st.empty() status_text.text("Preparing training data...") progress_bar.progress(10) # Split data X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=test_size/100, random_state=42, stratify=y ) status_text.text("Scaling features...") progress_bar.progress(20) # Scale features scaler = StandardScaler() X_train_scaled = scaler.fit_transform(X_train) X_test_scaled = scaler.transform(X_test) # Handle class imbalance with SMOTE if selected if use_smote: status_text.text("Applying SMOTE for class balancing...") progress_bar.progress(30) smote = SMOTE(random_state=42) X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train) else: X_train_resampled, y_train_resampled = X_train_scaled, y_train # Save preprocessor with open("models/scaler.pkl", "wb") as f: pickle.dump(scaler, f) # Save feature columns with open("models/feature_columns.pkl", "wb") as f: pickle.dump(X.columns.tolist(), f) # Initialize results list results = [] trained_models = {} # Train selected models if train_lr: status_text.text("Training Logistic Regression...") progress_bar.progress(40) # Train Logistic Regression lr_model = LogisticRegression(max_iter=1000, class_weight='balanced') lr_model.fit(X_train_resampled, y_train_resampled) # Make predictions y_pred = lr_model.predict(X_test_scaled) y_pred_proba = lr_model.predict_proba(X_test_scaled)[:, 1] # Calculate metrics accuracy = accuracy_score(y_test, y_pred) precision = precision_score(y_test, y_pred) recall = recall_score(y_test, y_pred) f1 = f1_score(y_test, y_pred) auc = roc_auc_score(y_test, y_pred_proba) cm = confusion_matrix(y_test, y_pred) # Store results lr_results = { 'model_name': 'Logistic Regression', 'model': lr_model, 'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1_score': f1, 'auc': auc, 'confusion_matrix': cm, 'y_test': y_test, 'y_pred_proba': y_pred_proba } results.append(lr_results) trained_models['lr'] = lr_model # Save model with open("models/logistic_regression.pkl", "wb") as f: pickle.dump(lr_model, f) if train_rf: status_text.text("Training Random Forest...") progress_bar.progress(60) # Get parameters - use smaller values for speed n_est = n_estimators if show_advanced else 50 m_depth = max_depth if show_advanced else 6 # Train Random Forest rf_model = RandomForestClassifier( n_estimators=n_est, max_depth=m_depth, class_weight='balanced', random_state=42 ) rf_model.fit(X_train_resampled, y_train_resampled) # Make predictions y_pred = rf_model.predict(X_test_scaled) y_pred_proba = rf_model.predict_proba(X_test_scaled)[:, 1] # Calculate metrics accuracy = accuracy_score(y_test, y_pred) precision = precision_score(y_test, y_pred) recall = recall_score(y_test, y_pred) f1 = f1_score(y_test, y_pred) auc = roc_auc_score(y_test, y_pred_proba) cm = confusion_matrix(y_test, y_pred) # 
                    # Store results
                    rf_results = {
                        'model_name': 'Random Forest',
                        'model': rf_model,
                        'accuracy': accuracy,
                        'precision': precision,
                        'recall': recall,
                        'f1_score': f1,
                        'auc': auc,
                        'confusion_matrix': cm,
                        'y_test': y_test,
                        'y_pred_proba': y_pred_proba
                    }
                    results.append(rf_results)
                    trained_models['rf'] = rf_model

                    # Save model
                    with open("models/random_forest.pkl", "wb") as f:
                        pickle.dump(rf_model, f)

                if train_xgb:
                    status_text.text("Training XGBoost...")
                    progress_bar.progress(80)

                    # Get parameters - use smaller values for speed
                    n_est = n_estimators if show_advanced else 50
                    m_depth = max_depth if show_advanced else 6

                    # Train XGBoost. scale_pos_weight up-weights the positive
                    # (fraud) class; 10 is a fixed default here, though a
                    # common heuristic is the negative-to-positive class ratio.
                    # use_label_encoder is accepted by xgboost 1.x but removed
                    # in 2.x; drop it when running newer versions.
                    xgb_model = XGBClassifier(
                        n_estimators=n_est,
                        max_depth=m_depth,
                        scale_pos_weight=10,
                        random_state=42,
                        use_label_encoder=False,
                        eval_metric='logloss'
                    )
                    xgb_model.fit(X_train_resampled, y_train_resampled)

                    # Make predictions
                    y_pred = xgb_model.predict(X_test_scaled)
                    y_pred_proba = xgb_model.predict_proba(X_test_scaled)[:, 1]

                    # Calculate metrics
                    accuracy = accuracy_score(y_test, y_pred)
                    precision = precision_score(y_test, y_pred)
                    recall = recall_score(y_test, y_pred)
                    f1 = f1_score(y_test, y_pred)
                    auc = roc_auc_score(y_test, y_pred_proba)
                    cm = confusion_matrix(y_test, y_pred)

                    # Store results
                    xgb_results = {
                        'model_name': 'XGBoost',
                        'model': xgb_model,
                        'accuracy': accuracy,
                        'precision': precision,
                        'recall': recall,
                        'f1_score': f1,
                        'auc': auc,
                        'confusion_matrix': cm,
                        'y_test': y_test,
                        'y_pred_proba': y_pred_proba
                    }
                    results.append(xgb_results)
                    trained_models['xgb'] = xgb_model

                    # Save model
                    with open("models/xgboost.pkl", "wb") as f:
                        pickle.dump(xgb_model, f)

                # Save test data
                with open("models/test_data.pkl", "wb") as f:
                    pickle.dump({"X_test": X_test_scaled, "y_test": y_test}, f)

                st.session_state['trained_models'] = trained_models

                # Automatically make predictions on the original dataset
                status_text.text("Generating predictions...")
                progress_bar.progress(90)

                # Find the best model based on F1 score, which balances
                # precision and recall and is a better criterion than accuracy
                # for imbalanced data
                best_model = None
                best_f1 = -1
                best_model_name = ""

                for result in results:
                    if result['f1_score'] > best_f1:
                        best_f1 = result['f1_score']
                        best_model = result['model']
                        best_model_name = result['model_name']

                if best_model is not None:
                    # Prepare full dataset for prediction
                    X_full = df_engineered.drop(columns=[target_col])

                    # Scale the data
                    X_full_scaled = scaler.transform(X_full)

                    # Make predictions
                    y_pred = best_model.predict(X_full_scaled)
                    y_pred_proba = best_model.predict_proba(X_full_scaled)[:, 1]

                    # Add predictions to the dataframe
                    df_with_predictions = df_engineered.copy()
                    df_with_predictions['Fraud_Probability'] = y_pred_proba
                    df_with_predictions['Predicted_Fraud'] = y_pred

                    # Store predictions
                    st.session_state['predictions'] = {
                        'df': df_with_predictions,
                        'model_name': best_model_name,
                        'results': results
                    }

                status_text.text("Training completed!")
                progress_bar.progress(100)
                time.sleep(0.3)

                status_text.empty()
                progress_bar.empty()
                st.success("Models trained successfully!")

                # Display comparison of results
                if results:
                    st.subheader("Model Performance Analysis")

                    # Create comparison table
                    comparison_df = pd.DataFrame([
                        {
                            'Model': r['model_name'],
                            'Accuracy': r['accuracy'],
                            'Precision': r['precision'],
                            'Recall': r['recall'],
                            'F1 Score': r['f1_score'],
                            'AUC': r['auc']
                        }
                        for r in results
                    ])
                    st.dataframe(comparison_df.style.highlight_max(axis=0, color='#81C784'))

                    # Plot metrics comparison with improved styling
                    fig = px.bar(
                        comparison_df.melt(id_vars=['Model'], var_name='Metric', value_name='Value'),
                        x='Model', y='Value',
                        color='Metric',
                        barmode='group',
                        title='Model Performance Comparison',
                        labels={'Value': 'Score', 'Model': 'Model'},
                        color_discrete_sequence=['#2E7D32', '#43A047', '#81C784', '#1976D2', '#D32F2F']
                    )
                    fig.update_layout(
                        height=500,
                        template='plotly_white',
                        margin=dict(l=20, r=20, t=40, b=20)
                    )
                    st.plotly_chart(fig)

                    # Plot ROC curves with improved styling
                    st.subheader("ROC Curve Analysis")

                    fig = go.Figure()
                    colors = ['#2E7D32', '#1976D2', '#D32F2F']

                    for i, result in enumerate(results):
                        model_name = result['model_name']
                        y_test = result['y_test']
                        y_pred_proba = result['y_pred_proba']

                        fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
                        auc = result['auc']

                        fig.add_trace(go.Scatter(
                            x=fpr, y=tpr, mode='lines',
                            name=f'{model_name} (AUC = {auc:.3f})',
                            line=dict(color=colors[i % len(colors)], width=3)
                        ))

                    fig.add_trace(go.Scatter(
                        x=[0, 1], y=[0, 1], mode='lines',
                        name='Random',
                        line=dict(dash='dash', color='#757575', width=2)
                    ))

                    fig.update_layout(
                        title='ROC Curve Analysis',
                        xaxis_title='False Positive Rate',
                        yaxis_title='True Positive Rate',
                        legend=dict(x=0.01, y=0.99),
                        height=500,
                        template='plotly_white',
                        margin=dict(l=20, r=20, t=40, b=20)
                    )
                    st.plotly_chart(fig)

                    # Show confusion matrices with improved styling
                    st.subheader("Confusion Matrix Analysis")

                    cols = st.columns(len(results))
                    for idx, result in enumerate(results):
                        with cols[idx]:
                            model_name = result['model_name']
                            cm = result['confusion_matrix']

                            # Calculate percentages
                            cm_percent = cm / cm.sum()

                            # Create annotation text; Plotly uses <br> for line
                            # breaks. (The cell loops use r/c rather than i/j so
                            # they don't shadow the enumerate index.)
                            annotations = []
                            for r in range(cm.shape[0]):
                                for c in range(cm.shape[1]):
                                    annotations.append({
                                        'x': c, 'y': r,
                                        'text': f"{cm[r, c]}<br>({cm_percent[r, c]:.1%})",
                                        'showarrow': False,
                                        'font': {'color': 'white' if cm_percent[r, c] > 0.5 else 'black'}
                                    })

                            # Create heatmap
                            fig = go.Figure(data=go.Heatmap(
                                z=cm,
                                x=['Predicted Normal', 'Predicted Fraud'],
                                y=['Actual Normal', 'Actual Fraud'],
                                colorscale=[[0, '#81C784'], [1, '#2E7D32']],
                                showscale=False
                            ))
                            fig.update_layout(
                                title=f"{model_name}",
                                annotations=annotations,
                                height=300,
                                template='plotly_white',
                                margin=dict(l=20, r=20, t=40, b=20)
                            )
                            st.plotly_chart(fig)

                    # Feature importance for tree-based models with improved styling
                    st.subheader("Feature Importance Analysis")

                    for result in results:
                        model_name = result['model_name']
                        model = result['model']

                        if model_name in ['Random Forest', 'XGBoost']:
                            # Get feature importance
                            if hasattr(model, 'feature_importances_'):
                                importances = model.feature_importances_
                                feature_names = X.columns

                                # Sort by importance
                                indices = np.argsort(importances)[::-1]
                                top_indices = indices[:10]  # Show top 10 features for speed

                                # Create bar chart
                                fig = px.bar(
                                    x=importances[top_indices],
                                    y=[feature_names[i] for i in top_indices],
                                    orientation='h',
                                    title=f'Top Features - {model_name}',
                                    labels={'x': 'Importance', 'y': 'Feature'},
                                    color=importances[top_indices],
                                    color_continuous_scale=['#81C784', '#43A047', '#2E7D32']
                                )
                                fig.update_layout(
                                    height=400,
                                    template='plotly_white',
                                    margin=dict(l=20, r=20, t=40, b=20)
                                )
                                st.plotly_chart(fig)

    # Navigation buttons
    col1, col2 = st.columns([1, 5])
    with col1:
        if st.button("← Back to Feature Extraction", key="back_to_feature_eng"):
            st.session_state['current_page'] = 'feature_engineering'
            st.rerun()
    with col2:
        if st.session_state['predictions'] is not None:
            if st.button("Continue to Results →", key="to_results"):
                st.session_state['current_page'] = 'results'
                st.rerun()

# Fraud Detection Results Page
elif st.session_state['current_page'] == 'results':
    st.markdown("<h2>Step 5: Fraud Detection Results</h2>", unsafe_allow_html=True)
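    # Threshold note: the stored predictions come from predict() at the default
    # 0.5 cut-off, while the slider on this page filters the stored
    # probabilities at a stricter threshold; raising it yields fewer,
    # higher-confidence alerts (more precision, less recall).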
", unsafe_allow_html=True) if st.session_state['predictions'] is None: st.error("No predictions found. Please complete model training first.") if st.button("Go back to Model Training"): st.session_state['current_page'] = 'model_training' st.rerun() else: predictions = st.session_state['predictions'] df_with_predictions = predictions['df'] model_name = predictions['model_name'] st.markdown(f"

Fraud Detection Results using {model_name}

", unsafe_allow_html=True) # Summary of predictions fraud_count = df_with_predictions['Predicted_Fraud'].sum() total_count = len(df_with_predictions) fraud_percentage = (fraud_count / total_count) * 100 # Create metrics display with improved styling col1, col2, col3 = st.columns(3) with col1: st.metric( label="Total Transactions", value=f"{total_count:,}", delta=None ) with col2: st.metric( label="Detected Frauds", value=f"{fraud_count:,}", delta=None ) with col3: st.metric( label="Fraud Percentage", value=f"{fraud_percentage:.2f}%", delta=None ) # Visualization of fraud distribution with improved styling st.subheader("Fraud Probability Distribution") fig = px.histogram( df_with_predictions, x='Fraud_Probability', nbins=50, color='Predicted_Fraud', color_discrete_map={0: "#6200EA", 1: "#D50000"}, labels={'Predicted_Fraud': 'Prediction', '0': 'Normal', '1': 'Fraud'}, title='Distribution of Fraud Probabilities', marginal='box' ) fig.update_layout( height=500, template='plotly_white', margin=dict(l=20, r=20, t=40, b=20) ) st.plotly_chart(fig) # Show high probability transactions st.subheader("High Fraud Probability Transactions") # Slider for probability threshold threshold = st.slider( "Fraud Probability Threshold", min_value=0.5, max_value=0.95, value=0.7, step=0.05, help="Transactions with fraud probability above this threshold will be shown" ) high_prob_df = df_with_predictions[df_with_predictions['Fraud_Probability'] > threshold] if len(high_prob_df) > 0: st.write(f"Found {len(high_prob_df)} transactions with fraud probability > {threshold}") # Sort by probability high_prob_df = high_prob_df.sort_values('Fraud_Probability', ascending=False) # Select columns to display display_cols = ['Fraud_Probability', 'Predicted_Fraud'] # Add original features if 'Amount' in high_prob_df.columns: display_cols.insert(0, 'Amount') if 'Time' in high_prob_df.columns: display_cols.insert(0, 'Time') # Add target column if it exists if st.session_state['target_col'] in high_prob_df.columns: display_cols.append(st.session_state['target_col']) # Display dataframe st.dataframe(high_prob_df[display_cols]) # Download button csv = high_prob_df.to_csv(index=False) st.download_button( label="Download High Risk Transactions", data=csv, file_name="high_risk_transactions.csv", mime="text/csv" ) else: st.info(f"No transactions found with fraud probability > {threshold}") # Show top 10 highest probability transactions instead st.write("Top 10 highest fraud probability transactions:") st.dataframe(df_with_predictions.sort_values('Fraud_Probability', ascending=False).head(10)) # Compare actual vs predicted (if actual labels exist) target_col = st.session_state['target_col'] if target_col in df_with_predictions.columns: st.subheader("Actual vs Predicted Fraud") # Confusion matrix with improved styling cm = confusion_matrix(df_with_predictions[target_col], df_with_predictions['Predicted_Fraud']) # Calculate percentages cm_percent = cm / cm.sum() # Create annotation text annotations = [] for i in range(cm.shape[0]): for j in range(cm.shape[1]): annotations.append({ 'x': j, 'y': i, 'text': f"{cm[i, j]}
({cm_percent[i, j]:.1%})", 'showarrow': False, 'font': {'color': 'white' if cm_percent[i, j] > 0.5 else 'black'} }) # Create heatmap fig = go.Figure(data=go.Heatmap( z=cm, x=['Predicted Normal', 'Predicted Fraud'], y=['Actual Normal', 'Actual Fraud'], colorscale=[[0, '#81C784'], [1, '#2E7D32']], showscale=False )) fig.update_layout( title=f"Confusion Matrix - {model_name}", annotations=annotations, height=400, template='plotly_white', margin=dict(l=20, r=20, t=40, b=20) ) st.plotly_chart(fig) # Calculate metrics accuracy = accuracy_score(df_with_predictions[target_col], df_with_predictions['Predicted_Fraud']) # Calculate metrics precision = precision_score(df_with_predictions[target_col], df_with_predictions['Predicted_Fraud']) recall = recall_score(df_with_predictions[target_col], df_with_predictions['Predicted_Fraud']) f1 = f1_score(df_with_predictions[target_col], df_with_predictions['Predicted_Fraud']) # Display metrics with improved styling st.subheader("Performance Metrics on Full Dataset") col1, col2, col3, col4 = st.columns(4) with col1: st.metric( label="Accuracy", value=f"{accuracy:.4f}", delta=None ) with col2: st.metric( label="Precision", value=f"{precision:.4f}", delta=None ) with col3: st.metric( label="Recall", value=f"{recall:.4f}", delta=None ) with col4: st.metric( label="F1 Score", value=f"{f1:.4f}", delta=None ) # Download all predictions st.subheader("Download Results") csv = df_with_predictions.to_csv(index=False) st.download_button( label="Download All Predictions as CSV", data=csv, file_name="fraud_predictions.csv", mime="text/csv" ) # Navigation buttons col1, col2 = st.columns([1, 5]) with col1: if st.button("← Back to Model Training", key="back_to_model_training"): st.session_state['current_page'] = 'model_training' st.rerun() with col2: if st.button("Start Over", key="start_over"): # Reset session state for key in list(st.session_state.keys()): del st.session_state[key] st.session_state['current_page'] = 'home' st.rerun()