Abs6187 committed
Commit c5ec08c · verified · 1 Parent(s): b78fe73

Upload 12 files

.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+engineered_data.csv filter=lfs diff=lfs merge=lfs -text
+preprocessed_data.csv filter=lfs diff=lfs merge=lfs -text
+uploaded_data.csv filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,13 +1,13 @@
 ---
-title: Fraud Detection API Excecute4 Part2
-emoji: 😻
-colorFrom: red
+title: Financial Fraud Detection
+emoji: 👁
+colorFrom: yellow
 colorTo: yellow
 sdk: streamlit
 sdk_version: 1.43.2
 app_file: app.py
 pinned: false
-short_description: SabPaisa_financial_frauds
+short_description: Detects Financial Frauds
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,1807 @@
1
+ """
2
+ Financial Fraud Detection System - TechMatrix Solvers
3
+ Team Members:
4
+ - Abhay Gupta
5
+ - Jay Kumar
6
+ - Kripanshu Gupta
7
+ - Bhumika Patel
8
+
9
+ A comprehensive fraud detection system using machine learning algorithms.
10
+ """
11
+
12
+ import streamlit as st
13
+ import pandas as pd
14
+ import numpy as np
15
+ import matplotlib.pyplot as plt
16
+ import seaborn as sns
17
+ import plotly.express as px
18
+ import plotly.graph_objects as go
19
+ import os
20
+ import pickle
21
+ import time
22
+ import warnings
23
+ from sklearn.preprocessing import StandardScaler
24
+ from sklearn.model_selection import train_test_split
25
+ from sklearn.linear_model import LogisticRegression
26
+ from sklearn.ensemble import RandomForestClassifier
27
+ from xgboost import XGBClassifier
28
+ from sklearn.metrics import (
29
+ accuracy_score, precision_score, recall_score, f1_score,
30
+ roc_auc_score, confusion_matrix, classification_report, roc_curve
31
+ )
32
+ from imblearn.over_sampling import SMOTE
33
+
34
+ # Suppress warnings
35
+ warnings.filterwarnings('ignore')
36
+
37
+ # Set page configuration
38
+ st.set_page_config(
39
+ page_title="TechMatrix Fraud Detection System",
40
+ page_icon="🔒",
41
+ layout="wide",
42
+ initial_sidebar_state="collapsed"
43
+ )
44
+
45
+ # Custom CSS for better styling
46
+ st.markdown("""
47
+ <style>
48
+ /* Main theme colors */
49
+ :root {
50
+ --primary: #2E7D32;
51
+ --primary-light: #81C784;
52
+ --primary-dark: #1B5E20;
53
+ --secondary: #1976D2;
54
+ --secondary-light: #64B5F6;
55
+ --text-on-primary: #FFFFFF;
56
+ --text-primary: #212121;
57
+ --text-secondary: #757575;
58
+ --background: #F5F5F5;
59
+ --card-bg: #FFFFFF;
60
+ --success: #43A047;
61
+ --warning: #FFA000;
62
+ --error: #D32F2F;
63
+ --info: #1976D2;
64
+ }
65
+
66
+ /* Base styles */
67
+ .main-header {
68
+ font-size: 2.8rem;
69
+ color: var(--primary);
70
+ text-align: center;
71
+ margin-bottom: 1.5rem;
72
+ font-weight: 700;
73
+ background: linear-gradient(90deg, var(--primary), var(--secondary));
74
+ -webkit-background-clip: text;
75
+ -webkit-text-fill-color: transparent;
76
+ padding: 0.5rem 0;
77
+ }
78
+
79
+ .sub-header {
80
+ font-size: 2rem;
81
+ color: var(--primary-dark);
82
+ margin-top: 2rem;
83
+ margin-bottom: 1rem;
84
+ font-weight: 600;
85
+ border-bottom: 2px solid var(--primary-light);
86
+ padding-bottom: 0.5rem;
87
+ }
88
+
89
+ .metric-card {
90
+ text-align: center;
91
+ padding: 1.2rem;
92
+ border-radius: 0.8rem;
93
+ background-color: rgba(46, 125, 50, 0.1);
94
+ transition: transform 0.3s ease;
95
+ border-left: 4px solid var(--primary);
96
+ }
97
+
98
+ .metric-card:hover {
99
+ transform: translateY(-5px);
100
+ background-color: rgba(46, 125, 50, 0.15);
101
+ }
102
+
103
+ .metric-value {
104
+ font-size: 2.5rem;
105
+ font-weight: 700;
106
+ color: var(--primary);
107
+ margin: 0.5rem 0;
108
+ }
109
+
110
+ .metric-label {
111
+ font-size: 1rem;
112
+ color: var(--text-secondary);
113
+ margin-bottom: 0.5rem;
114
+ }
115
+
116
+ div[data-testid="stMetric"] {
117
+ background-color: rgba(46, 125, 50, 0.1);
118
+ padding: 1rem;
119
+ border-radius: 0.8rem;
120
+ border-left: 4px solid var(--primary);
121
+ transition: transform 0.3s ease;
122
+ }
123
+
124
+ div[data-testid="stMetric"]:hover {
125
+ transform: translateY(-5px);
126
+ background-color: rgba(46, 125, 50, 0.15);
127
+ }
128
+
129
+ div[data-testid="stMetric"] > div {
130
+ gap: 0.2rem;
131
+ }
132
+
133
+ div[data-testid="stMetric"] label {
134
+ color: var(--text-secondary) !important;
135
+ }
136
+
137
+ div[data-testid="stMetric"] .css-1wivap2 {
138
+ color: var(--primary) !important;
139
+ }
140
+
141
+ .stButton > button {
142
+ background-color: var(--primary);
143
+ color: var(--text-on-primary);
144
+ border-radius: 0.5rem;
145
+ padding: 0.5rem 1rem;
146
+ font-weight: 600;
147
+ border: none;
148
+ transition: all 0.3s ease;
149
+ }
150
+
151
+ .stButton > button:hover {
152
+ background-color: var(--primary-dark);
153
+ box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
154
+ transform: translateY(-2px);
155
+ }
156
+
157
+ .stProgress > div > div > div {
158
+ background-color: var(--primary);
159
+ background-image: linear-gradient(45deg,
160
+ rgba(255,255,255,.15) 25%,
161
+ transparent 25%,
162
+ transparent 50%,
163
+ rgba(255,255,255,.15) 50%,
164
+ rgba(255,255,255,.15) 75%,
165
+ transparent 75%,
166
+ transparent
167
+ );
168
+ background-size: 1rem 1rem;
169
+ animation: progress-animation 1s linear infinite;
170
+ }
171
+
172
+ @keyframes progress-animation {
173
+ 0% { background-position: 0 0; }
174
+ 100% { background-position: 1rem 0; }
175
+ }
176
+
177
+ .success-text {
178
+ color: var(--success);
179
+ font-weight: bold;
180
+ }
181
+
182
+ .warning-text {
183
+ color: var(--warning);
184
+ font-weight: bold;
185
+ }
186
+
187
+ .error-text {
188
+ color: var(--error);
189
+ font-weight: bold;
190
+ }
191
+
192
+ .info-text {
193
+ color: var(--info);
194
+ font-weight: bold;
195
+ }
196
+
197
+ @keyframes fadeIn {
198
+ from { opacity: 0; }
199
+ to { opacity: 1; }
200
+ }
201
+
202
+ .animate-fade-in {
203
+ animation: fadeIn 0.8s ease-in-out;
204
+ }
205
+
206
+ [data-testid="stSidebarNav"] ul li:nth-child(2) {
207
+ display: none;
208
+ }
209
+
210
+ .dataframe {
211
+ border-collapse: collapse;
212
+ border: none;
213
+ font-size: 0.9rem;
214
+ }
215
+
216
+ .dataframe th {
217
+ background-color: var(--primary-light);
218
+ color: var(--text-primary);
219
+ padding: 0.5rem;
220
+ text-align: left;
221
+ }
222
+
223
+ .dataframe td {
224
+ padding: 0.5rem;
225
+ border-bottom: 1px solid #eee;
226
+ }
227
+
228
+ .dataframe tr:hover {
229
+ background-color: #f5f5f5;
230
+ }
231
+
232
+ .stSlider > div > div {
233
+ background-color: var(--primary-light);
234
+ }
235
+
236
+ .stSelectbox > div > div {
237
+ background-color: var(--card-bg);
238
+ border-radius: 0.5rem;
239
+ border: 1px solid var(--primary-light);
240
+ }
241
+
242
+ @keyframes pulse {
243
+ 0% { opacity: 0.6; }
244
+ 50% { opacity: 1; }
245
+ 100% { opacity: 0.6; }
246
+ }
247
+
248
+ .loading-pulse {
249
+ animation: pulse 1.5s infinite ease-in-out;
250
+ }
251
+ </style>
252
+ """, unsafe_allow_html=True)
253
+
254
+ # Create necessary directories
255
+ os.makedirs("data", exist_ok=True)
256
+ os.makedirs("models", exist_ok=True)
257
+
258
+ # Initialize session state
259
+ if 'current_page' not in st.session_state:
260
+ st.session_state['current_page'] = 'home'
261
+
262
+ if 'data' not in st.session_state:
263
+ st.session_state['data'] = None
264
+
265
+ if 'preprocessed_data' not in st.session_state:
266
+ st.session_state['preprocessed_data'] = None
267
+
268
+ if 'engineered_data' not in st.session_state:
269
+ st.session_state['engineered_data'] = None
270
+
271
+ if 'target_col' not in st.session_state:
272
+ st.session_state['target_col'] = 'Class'
273
+
274
+ if 'trained_models' not in st.session_state:
275
+ st.session_state['trained_models'] = {}
276
+
277
+ if 'predictions' not in st.session_state:
278
+ st.session_state['predictions'] = None
279
+
280
+ if 'progress' not in st.session_state:
281
+ st.session_state['progress'] = 0
282
+
283
+ # Main title
284
+ st.markdown("<div class='animate-fade-in'><h1 class='main-header'>TechMatrix Fraud Detection System</h1></div>", unsafe_allow_html=True)
285
+
286
+ # Team information
287
+ st.markdown("""
288
+ <div style='text-align: center; margin-bottom: 2rem;'>
289
+ <h3>Team TechMatrix Solvers</h3>
290
+ <p>Abhay Gupta | Jay Kumar | Kripanshu Gupta | Bhumika Patel</p>
291
+ </div>
292
+ """, unsafe_allow_html=True)
293
+
294
+ # Home Page
295
+ if st.session_state['current_page'] == 'home':
296
+ # Introduction section
297
+ st.markdown("<div class='animate-fade-in'><h2 class='sub-header'>Welcome to TechMatrix Fraud Detection System</h2></div>", unsafe_allow_html=True)
298
+
299
+ col1, col2 = st.columns([2, 1])
300
+
301
+ with col1:
302
+ st.markdown("""
303
+ Our advanced fraud detection system leverages cutting-edge machine learning algorithms to identify and prevent fraudulent transactions in real-time.
304
+
305
+ ### Understanding Financial Fraud
306
+
307
+ Financial fraud encompasses various deceptive practices aimed at unauthorized acquisition of funds or assets.
308
+ Our system specifically addresses:
309
+ - Credit card transaction fraud
310
+ - Identity theft incidents
311
+ - Account compromise attempts
312
+ - Suspicious transaction patterns
313
+
314
+ ### Machine Learning Implementation
315
+
316
+ Our system employs sophisticated machine learning models that analyze transaction patterns and behavioral data.
317
+ The models are trained on historical fraud data and continuously updated to adapt to emerging fraud patterns.
318
+
319
+ ### System Advantages:
320
+ - **Real-time Monitoring**: Instant detection of suspicious activities
321
+ - **Scalable Processing**: Efficient handling of large transaction volumes
322
+ - **Pattern Recognition**: Advanced detection of complex fraud patterns
323
+ - **Risk Assessment**: Probability-based fraud scoring system
324
+ """)
325
+
326
+ with col2:
327
+ # Create a unique visualization of the fraud detection process
328
+ fig = go.Figure()
329
+
330
+ # Create a hexagonal flow diagram
331
+ angles = np.linspace(0, 2*np.pi, 6, endpoint=False)
332
+ x = 0.5 + 0.4 * np.cos(angles)
333
+ y = 0.5 + 0.4 * np.sin(angles)
334
+
335
+ # Add connecting lines with gradient effect
336
+ for i in range(len(angles)):
337
+ next_i = (i + 1) % len(angles)
338
+ fig.add_trace(go.Scatter(
339
+ x=[x[i], x[next_i]],
340
+ y=[y[i], y[next_i]],
341
+ mode='lines',
342
+ line=dict(
343
+ color='rgba(46, 125, 50, 0.5)',
344
+ width=2,
345
+ dash='dot'
346
+ ),
347
+ showlegend=False
348
+ ))
349
+
350
+ # Add nodes with updated colors and labels
351
+ node_labels = ['Input Data', 'Validation', 'Processing', 'Analysis', 'Detection', 'Action']
352
+ node_colors = ['#2E7D32', '#43A047', '#81C784', '#1976D2', '#64B5F6', '#D32F2F']
353
+
354
+ for i in range(len(angles)):
355
+ fig.add_trace(go.Scatter(
356
+ x=[x[i]],
357
+ y=[y[i]],
358
+ mode='markers+text',
359
+ marker=dict(
360
+ size=30,
361
+ color=node_colors[i],
362
+ symbol='hexagon'
363
+ ),
364
+ text=node_labels[i],
365
+ textposition="middle center",
366
+ textfont=dict(color='white', size=12),
367
+ showlegend=False
368
+ ))
369
+
370
+ # Add title in the center with updated styling
371
+ fig.add_trace(go.Scatter(
372
+ x=[0.5],
373
+ y=[0.5],
374
+ mode='text',
375
+ text='Fraud<br>Detection<br>Pipeline',
376
+ textposition="middle center",
377
+ textfont=dict(
378
+ color='#212121',
379
+ size=14,
380
+ family='Arial, bold'
381
+ ),
382
+ showlegend=False
383
+ ))
384
+
385
+ fig.update_layout(
386
+ height=400,
387
+ width=400,
388
+ margin=dict(l=0, r=0, t=0, b=0),
389
+ xaxis=dict(
390
+ showgrid=False,
391
+ zeroline=False,
392
+ showticklabels=False,
393
+ range=[0, 1]
394
+ ),
395
+ yaxis=dict(
396
+ showgrid=False,
397
+ zeroline=False,
398
+ showticklabels=False,
399
+ range=[0, 1]
400
+ ),
401
+ plot_bgcolor='rgba(0,0,0,0)'
402
+ )
403
+
404
+ st.plotly_chart(fig)
405
+
406
+ # Workflow section
407
+ st.markdown("<div class='animate-fade-in'><h2 class='sub-header'>System Workflow</h2></div>", unsafe_allow_html=True)
408
+
409
+ col1, col2, col3, col4 = st.columns(4)
410
+
411
+ with col1:
412
+ st.markdown("### 1. Data Ingestion")
413
+ st.markdown("Secure upload and validation of transaction data in CSV format.")
414
+ st.image("https://cdn-icons-png.flaticon.com/512/4208/4208479.png", width=100)
415
+
416
+ with col2:
417
+ st.markdown("### 2. Data Processing")
418
+ st.markdown("Advanced data cleaning and preparation for analysis.")
419
+ st.image("https://cdn-icons-png.flaticon.com/512/1875/1875627.png", width=100)
420
+
421
+ with col3:
422
+ st.markdown("### 3. Feature Extraction")
423
+ st.markdown("Intelligent feature engineering and pattern recognition.")
424
+ st.image("https://cdn-icons-png.flaticon.com/512/2103/2103633.png", width=100)
425
+
426
+ with col4:
427
+ st.markdown("### 4. Model Deployment")
428
+ st.markdown("Real-time fraud detection and risk assessment.")
429
+ st.image("https://cdn-icons-png.flaticon.com/512/2103/2103658.png", width=100)
430
+
431
+ # Sample visualizations section
432
+ st.markdown("<div class='animate-fade-in'><h2 class='sub-header'>System Analytics</h2></div>", unsafe_allow_html=True)
433
+
434
+ col1, col2 = st.columns(2)
435
+
436
+ with col1:
437
+ # Sample ROC curve with improved styling
438
+ fig = go.Figure()
439
+ fpr = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
440
+ tpr_lr = [0, 0.4, 0.55, 0.68, 0.75, 0.8, 0.85, 0.9, 0.94, 0.98, 1.0]
441
+ tpr_rf = [0, 0.5, 0.65, 0.78, 0.85, 0.88, 0.91, 0.95, 0.97, 0.99, 1.0]
442
+ tpr_xgb = [0, 0.55, 0.7, 0.8, 0.87, 0.9, 0.93, 0.96, 0.98, 0.99, 1.0]
443
+
444
+ fig.add_trace(go.Scatter(
445
+ x=fpr,
446
+ y=tpr_lr,
447
+ mode='lines',
448
+ name='Logistic Regression (AUC = 0.85)',
449
+ line=dict(color='#2E7D32', width=3)
450
+ ))
451
+ fig.add_trace(go.Scatter(
452
+ x=fpr,
453
+ y=tpr_rf,
454
+ mode='lines',
455
+ name='Random Forest (AUC = 0.92)',
456
+ line=dict(color='#1976D2', width=3)
457
+ ))
458
+ fig.add_trace(go.Scatter(
459
+ x=fpr,
460
+ y=tpr_xgb,
461
+ mode='lines',
462
+ name='XGBoost (AUC = 0.94)',
463
+ line=dict(color='#D32F2F', width=3)
464
+ ))
465
+ fig.add_trace(go.Scatter(
466
+ x=[0, 1],
467
+ y=[0, 1],
468
+ mode='lines',
469
+ name='Random',
470
+ line=dict(dash='dash', color='#757575', width=2)
471
+ ))
472
+
473
+ fig.update_layout(
474
+ title='Model Performance Comparison',
475
+ xaxis_title='False Positive Rate',
476
+ yaxis_title='True Positive Rate',
477
+ legend=dict(x=0.01, y=0.99),
478
+ width=600,
479
+ height=400,
480
+ template='plotly_white',
481
+ margin=dict(l=40, r=40, t=40, b=40)
482
+ )
483
+
484
+ st.plotly_chart(fig)
485
+
486
+ with col2:
487
+ # Sample feature importance with improved styling
488
+ features = ['Transaction Amount', 'Time of Day', 'Merchant Category', 'Location', 'Transaction Frequency',
489
+ 'Device Used', 'IP Address', 'Account Age', 'Previous Fraud Flag', 'Transaction Type']
490
+ importance = [0.23, 0.18, 0.15, 0.12, 0.09, 0.08, 0.06, 0.04, 0.03, 0.02]
491
+
492
+ fig = px.bar(
493
+ x=importance,
494
+ y=features,
495
+ orientation='h',
496
+ title='Feature Importance Analysis',
497
+ labels={'x': 'Importance Score', 'y': 'Feature'},
498
+ color=importance,
499
+ color_continuous_scale=['#2E7D32', '#43A047', '#81C784']
500
+ )
501
+
502
+ fig.update_layout(
503
+ width=600,
504
+ height=400,
505
+ template='plotly_white',
506
+ margin=dict(l=40, r=40, t=40, b=40)
507
+ )
508
+ st.plotly_chart(fig)
509
+
510
+ # Get started button
511
+ st.markdown("<div style='text-align: center; margin-top: 2rem;'>", unsafe_allow_html=True)
512
+ if st.button("Get Started", key="get_started", help="Begin the fraud detection process"):
513
+ st.session_state['current_page'] = 'upload'
514
+ st.rerun()
515
+ st.markdown("</div>", unsafe_allow_html=True)
516
+
517
+ # Data Upload Page
518
+ elif st.session_state['current_page'] == 'upload':
519
+ st.markdown("<div class='animate-fade-in'><h2 class='sub-header'>Step 1: Data Ingestion</h2></div>", unsafe_allow_html=True)
520
+
521
+ # File uploader with size limit warning
522
+ st.markdown("""
523
+ ### Secure Data Upload
524
+
525
+ Upload your transaction data securely in CSV format. The system supports the following:
526
+
527
+ - Transaction details (amount, timestamp, location, etc.)
528
+ - Target column for fraud classification (default: 'Class' with 0 for normal, 1 for fraud)
529
+ - **Maximum file size: 200 MB**
530
+
531
+ For testing purposes, you can use the [Credit Card Fraud Detection dataset](https://www.kaggle.com/mlg-ulb/creditcardfraud) from Kaggle.
532
+
533
+ ### Data Requirements:
534
+ - CSV format with UTF-8 encoding
535
+ - No missing values in critical fields
536
+ - Proper date/time formatting
537
+ - Numeric values for transaction amounts
538
+ """)
539
+
540
+ uploaded_file = st.file_uploader(
541
+ "Upload transaction data (CSV file)",
542
+ type="csv",
543
+ help="Maximum file size: 200 MB"
544
+ )
545
+
546
+ if uploaded_file is not None:
547
+ # Check file size (200 MB limit)
548
+ file_details = {"FileName": uploaded_file.name, "FileType": uploaded_file.type}
549
+
550
+ # Read the file into a buffer to check its size
551
+ file_buffer = uploaded_file.getvalue()
552
+ file_size_mb = len(file_buffer) / (1024 * 1024)
553
+
554
+ if file_size_mb > 200:
555
+ st.error(f"File size exceeds the 200 MB limit. Your file is {file_size_mb:.2f} MB. Please upload a smaller file.")
556
+ st.stop()
557
+ else:
558
+ st.info(f"File size: {file_size_mb:.2f} MB")
559
+
560
+ # Load data with progress bar
561
+ progress_bar = st.progress(0)
562
+ status_text = st.empty()
563
+
564
+ status_text.text("Initializing data ingestion...")
565
+ progress_bar.progress(25)
566
+ time.sleep(0.3)
567
+
568
+ try:
569
+ # Use BytesIO to avoid loading the file twice
570
+ from io import BytesIO
571
+ df = pd.read_csv(BytesIO(file_buffer))
572
+ st.session_state['data'] = df
573
+
574
+ progress_bar.progress(50)
575
+ status_text.text("Validating data structure...")
576
+ time.sleep(0.3)
577
+
578
+ progress_bar.progress(75)
579
+ status_text.text("Preparing data preview...")
580
+ time.sleep(0.3)
581
+
582
+ progress_bar.progress(100)
583
+ status_text.text("Data ingestion completed!")
584
+ time.sleep(0.3)
585
+
586
+ status_text.empty()
587
+ progress_bar.empty()
588
+
589
+ # Show basic data info
590
+ st.success(f"Data ingested successfully! Shape: {df.shape[0]} rows and {df.shape[1]} columns")
591
+
592
+ col1, col2 = st.columns(2)
593
+
594
+ with col1:
595
+ st.subheader("Data Preview")
596
+ st.dataframe(df.head())
597
+
598
+ with col2:
599
+ st.subheader("Data Structure")
600
+
601
+ # Display data types and missing values
602
+ data_info = pd.DataFrame({
603
+ 'Data Type': df.dtypes,
604
+ 'Non-Null Count': df.count(),
605
+ 'Missing Values': df.isnull().sum(),
606
+ 'Unique Values': [df[col].nunique() for col in df.columns]
607
+ })
608
+
609
+ st.dataframe(data_info)
610
+
611
+ # Check for target column
612
+ if 'Class' in df.columns:
613
+ fraud_count = df['Class'].sum()
614
+ total_count = len(df)
615
+ fraud_percentage = (fraud_count / total_count) * 100
616
+
617
+ st.info(f"Target column 'Class' detected with {fraud_count} fraud cases ({fraud_percentage:.2f}% of data)")
618
+ else:
619
+ st.warning("No 'Class' column detected. You'll need to specify the target column in the next step.")
620
+ except Exception as e:
621
+ st.error(f"Error during data ingestion: {str(e)}")
622
+ st.info("Please ensure the file is a valid CSV with proper formatting.")
623
+
624
+ # Navigation buttons
625
+ col1, col2 = st.columns([1, 5])
626
+
627
+ with col1:
628
+ if st.button("← Back to Home", key="back_to_home"):
629
+ st.session_state['current_page'] = 'home'
630
+ st.rerun()
631
+
632
+ with col2:
633
+ if st.session_state['data'] is not None:
634
+ if st.button("Continue to Data Processing →", key="to_preprocess"):
635
+ st.session_state['current_page'] = 'preprocess'
636
+ st.rerun()
637
+
638
+ # Data Preprocessing Page
639
+ elif st.session_state['current_page'] == 'preprocess':
640
+ st.markdown("<div class='animate-fade-in'><h2 class='sub-header'>Step 2: Data Processing</h2></div>", unsafe_allow_html=True)
641
+
642
+ if st.session_state['data'] is None:
643
+ st.error("No data found. Please upload data first.")
644
+ if st.button("Go back to Data Ingestion"):
645
+ st.session_state['current_page'] = 'upload'
646
+ st.rerun()
647
+ else:
648
+ df = st.session_state['data']
649
+
650
+ st.markdown("""
651
+ ### Advanced Data Processing
652
+
653
+ Enhance your data quality through our comprehensive processing pipeline. The system will:
654
+ - Handle missing values intelligently
655
+ - Remove statistical outliers
656
+ - Normalize numerical features
657
+ - Balance class distribution
658
+
659
+ Select the processing options below to customize the pipeline.
660
+ """)
661
+
662
+ # Target column selection
663
+ if 'Class' in df.columns:
664
+ target_col = 'Class'
665
+ st.info(f"Target column 'Class' detected with values: {df[target_col].unique()}")
666
+ else:
667
+ target_col = st.selectbox("Select the target column (fraud indicator)", df.columns)
668
+
669
+ st.session_state['target_col'] = target_col
670
+
671
+ # Preprocessing options
672
+ st.subheader("Processing Options")
673
+
674
+ col1, col2 = st.columns(2)
675
+
676
+ with col1:
677
+ handle_missing = st.checkbox("Handle Missing Values", value=True,
678
+ help="Fill missing numerical values with mean and categorical values with mode")
679
+ remove_outliers = st.checkbox("Remove Outliers", value=False,
680
+ help="Remove extreme values that might affect model performance")
681
+
682
+ with col2:
683
+ normalize_data = st.checkbox("Normalize Data", value=True,
684
+ help="Scale numerical features to have zero mean and unit variance")
685
+ balance_classes = st.checkbox("Balance Classes", value=True,
686
+ help="Handle class imbalance using SMOTE in the training phase")
687
+
688
+ # Handle missing values
689
+ if st.button("Process Data"):
690
+ with st.spinner("Processing data..."):
691
+ # Create a copy of the dataframe
692
+ df_processed = df.copy()
693
+
694
+ # Progress bar
695
+ progress_bar = st.progress(0)
696
+ status_text = st.empty()
697
+
698
+ # Handle missing values
699
+ if handle_missing:
700
+ status_text.text("Processing missing values...")
701
+ progress_bar.progress(25)
702
+ time.sleep(0.3)
703
+
704
+ for col in df_processed.columns:
705
+ if df_processed[col].dtype in ['int64', 'float64']:
706
+ df_processed[col] = df_processed[col].fillna(df_processed[col].mean())
707
+ else:
708
+ df_processed[col] = df_processed[col].fillna(df_processed[col].mode()[0])
709
+
710
+ # Remove outliers if selected
711
+ if remove_outliers:
712
+ status_text.text("Processing outliers...")
713
+ progress_bar.progress(50)
714
+ time.sleep(0.3)
715
+
716
+ # Only apply to numerical columns
717
+ num_cols = df_processed.select_dtypes(include=['int64', 'float64']).columns
718
+ for col in num_cols:
719
+ if col != target_col: # Don't remove outliers from target column
720
+ Q1 = df_processed[col].quantile(0.25)
721
+ Q3 = df_processed[col].quantile(0.75)
722
+ IQR = Q3 - Q1
723
+ lower_bound = Q1 - 3 * IQR
724
+ upper_bound = Q3 + 3 * IQR
725
+ df_processed = df_processed[(df_processed[col] >= lower_bound) &
726
+ (df_processed[col] <= upper_bound)]
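# Worked example of the 3*IQR fence above (illustrative sketch, not part of
# the committed file): for s = [1, 12, 15, 20, 28, 500], Q1 = 12.75 and
# Q3 = 26.0, so IQR = 13.25 and the keep-range is [-27.0, 65.75] -- the 500
# is dropped while the low value 1 is kept, since 3*IQR is a deliberately
# loose fence that only removes extreme points.
import pandas as pd
s = pd.Series([1, 12, 15, 20, 28, 500])
q1, q3 = s.quantile(0.25), s.quantile(0.75)
kept = s[(s >= q1 - 3 * (q3 - q1)) & (s <= q3 + 3 * (q3 - q1))]
print(kept.tolist())  # [1, 12, 15, 20, 28]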
727
+
728
+ # Store the processed data
729
+ status_text.text("Finalizing data processing...")
730
+ progress_bar.progress(100)
731
+ time.sleep(0.3)
732
+
733
+ st.session_state['preprocessed_data'] = df_processed
734
+
735
+ status_text.empty()
736
+ progress_bar.empty()
737
+
738
+ st.success("Data processing completed!")
739
+
740
+ # Show class distribution
741
+ if target_col in df_processed.columns:
742
+ st.subheader("Class Distribution After Processing")
743
+
744
+ col1, col2 = st.columns(2)
745
+
746
+ with col1:
747
+ # Create pie chart with improved styling
748
+ labels = ['Normal', 'Fraud']
749
+ values = [len(df_processed[df_processed[target_col] == 0]),
750
+ len(df_processed[df_processed[target_col] == 1])]
751
+
752
+ fig = px.pie(
753
+ values=values,
754
+ names=labels,
755
+ title='Transaction Distribution',
756
+ color_discrete_sequence=['#2E7D32', '#D32F2F'],
757
+ hole=0.4
758
+ )
759
+
760
+ fig.update_traces(textposition='inside', textinfo='percent+label')
761
+ fig.update_layout(
762
+ template='plotly_white',
763
+ margin=dict(l=20, r=20, t=30, b=20)
764
+ )
765
+ st.plotly_chart(fig)
766
+
767
+ with col2:
768
+ # Calculate statistics
769
+ fraud_count = df_processed[target_col].sum()
770
+ total_count = len(df_processed)
771
+ fraud_percentage = (fraud_count / total_count) * 100
772
+
773
+ st.metric("Total Transactions", f"{total_count:,}")
774
+ st.metric("Fraud Transactions", f"{fraud_count:,}")
775
+ st.metric("Fraud Percentage", f"{fraud_percentage:.2f}%")
776
+
777
+ if fraud_percentage < 1:
778
+ st.warning("Your dataset is highly imbalanced. Class balancing will be applied during model training.")
779
+
780
+ # Navigation buttons
781
+ col1, col2 = st.columns([1, 5])
782
+
783
+ with col1:
784
+ if st.button("← Back to Upload", key="back_to_upload"):
785
+ st.session_state['current_page'] = 'upload'
786
+ st.rerun()
787
+
788
+ with col2:
789
+ if st.session_state['preprocessed_data'] is not None:
790
+ if st.button("Continue to Feature Extraction →", key="to_feature_eng"):
791
+ st.session_state['current_page'] = 'feature_engineering'
792
+ st.rerun()
793
+
794
+ # Feature Engineering Page
795
+ elif st.session_state['current_page'] == 'feature_engineering':
796
+ st.markdown("<div class='animate-fade-in'><h2 class='sub-header'>Step 3: Feature Extraction</h2></div>", unsafe_allow_html=True)
797
+
798
+ if st.session_state['preprocessed_data'] is None:
799
+ st.error("No processed data found. Please complete data processing first.")
800
+ if st.button("Go back to Data Processing"):
801
+ st.session_state['current_page'] = 'preprocess'
802
+ st.rerun()
803
+ else:
804
+ df_processed = st.session_state['preprocessed_data']
805
+ target_col = st.session_state['target_col']
806
+
807
+ st.markdown("""
808
+ ### Intelligent Feature Extraction
809
+
810
+ Enhance your fraud detection capabilities through advanced feature engineering. Our system provides:
811
+ - Time-based pattern analysis
812
+ - Transaction amount profiling
813
+ - Behavioral feature extraction
814
+ - Cross-feature interaction analysis
815
+
816
+ Select the features to extract below to optimize your model's performance.
817
+ """)
818
+
819
+ # Feature engineering options
820
+ st.subheader("Feature Extraction Options")
821
+
822
+ col1, col2 = st.columns(2)
823
+
824
+ with col1:
825
+ create_time_features = st.checkbox("Time-based Features", value=True,
826
+ help="Extract temporal patterns and behavioral indicators")
827
+ create_amount_features = st.checkbox("Amount-based Features", value=True,
828
+ help="Generate transaction amount profiles and risk indicators")
829
+
830
+ with col2:
831
+ create_aggregations = st.checkbox("Aggregation Features", value=False,
832
+ help="Create aggregated metrics for transaction patterns")
833
+ create_interactions = st.checkbox("Interaction Features", value=False,
834
+ help="Generate cross-feature interactions for complex pattern detection")
835
+
836
+ # Apply feature engineering
837
+ if st.button("Extract Features"):
838
+ with st.spinner("Extracting features..."):
839
+ # Create a copy of the dataframe
840
+ df_engineered = df_processed.copy()
841
+
842
+ # Progress bar
843
+ progress_bar = st.progress(0)
844
+ status_text = st.empty()
845
+
846
+ # Time-based features
847
+ if create_time_features and 'Time' in df_engineered.columns:
848
+ status_text.text("Extracting temporal features...")
849
+ progress_bar.progress(25)
850
+ time.sleep(0.3)
851
+
852
+ # Hour of day
853
+ df_engineered['Hour'] = (df_engineered['Time'] / 3600) % 24
854
+
855
+ # Flag for transactions during odd hours (midnight to 5 AM)
856
+ df_engineered['Odd_Hour'] = ((df_engineered['Hour'] >= 0) & (df_engineered['Hour'] < 5)).astype(int)
857
+
858
+ # Part of day
859
+ df_engineered['Part_of_Day'] = pd.cut(
+ df_engineered['Hour'],
+ bins=[0, 6, 12, 18, 24],
+ labels=['Night', 'Morning', 'Afternoon', 'Evening'],
+ include_lowest=True  # without this, Hour == 0 falls outside the first bin and becomes NaN
+ )
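# Worked example for the Time-derived features above (illustrative; assumes
# 'Time' is seconds elapsed since the first transaction, as in the Kaggle
# creditcard dataset): Time = 90000 s -> 90000 / 3600 = 25 h -> 25 % 24 = 1,
# so Hour = 1, Odd_Hour = 1 (between midnight and 5 AM), Part_of_Day = 'Night'.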
864
+
865
+ # Amount-based features
866
+ if create_amount_features and 'Amount' in df_engineered.columns:
867
+ status_text.text("Extracting amount-based features...")
868
+ progress_bar.progress(50)
869
+ time.sleep(0.3)
870
+
871
+ # Log transform for amount (to handle skewed distribution)
872
+ df_engineered['Log_Amount'] = np.log1p(df_engineered['Amount'])
873
+
874
+ # Flag for high-value transactions (top 5%)
875
+ threshold = df_engineered['Amount'].quantile(0.95)
876
+ df_engineered['High_Value'] = (df_engineered['Amount'] > threshold).astype(int)
877
+
878
+ # Amount bins
879
+ df_engineered['Amount_Bin'] = pd.qcut(
880
+ df_engineered['Amount'],
881
+ q=5,
882
+ labels=['Very Low', 'Low', 'Medium', 'High', 'Very High']
883
+ )
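# Why log1p rather than log for the skewed Amount column (illustrative sketch,
# not part of the committed file): log1p(x) = log(1 + x), so zero-value
# transactions map to 0 instead of the -inf that np.log(0) would produce,
# while large amounts are still strongly compressed.
import numpy as np
print(np.log1p([0.0, 99.0, 9999.0]))  # [0.0, ~4.605, ~9.210]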
884
+
885
+ # Aggregation features
886
+ if create_aggregations:
887
+ status_text.text("Generating aggregation features...")
888
+ progress_bar.progress(75)
889
+ time.sleep(0.3)
890
+
891
+ # Check if there's a card ID or similar column
892
+ potential_id_cols = [col for col in df_engineered.columns if 'id' in col.lower() or 'card' in col.lower()]
893
+
894
+ if potential_id_cols:
895
+ id_col = potential_id_cols[0]
896
+
897
+ # Number of transactions per card
898
+ tx_count = df_engineered.groupby(id_col).size().reset_index(name='Tx_Count')
899
+ df_engineered = df_engineered.merge(tx_count, on=id_col, how='left')
900
+
901
+ # Average transaction amount per card
902
+ if 'Amount' in df_engineered.columns:
903
+ avg_amount = df_engineered.groupby(id_col)['Amount'].mean().reset_index(name='Avg_Amount')
904
+ df_engineered = df_engineered.merge(avg_amount, on=id_col, how='left')
905
+
906
+ # Transaction amount deviation from average
907
+ df_engineered['Amount_Deviation'] = df_engineered['Amount'] - df_engineered['Avg_Amount']
908
+
909
+ # Interaction features
910
+ if create_interactions:
911
+ status_text.text("Generating interaction features...")
912
+ progress_bar.progress(90)
913
+ time.sleep(0.3)
914
+
915
+ # Only create interactions between numerical features
916
+ num_cols = df_engineered.select_dtypes(include=['int64', 'float64']).columns
917
+ num_cols = [col for col in num_cols if col != target_col and 'id' not in col.lower()]
918
+
919
+ # Limit to a few important features to avoid explosion of features
920
+ if len(num_cols) > 3:
921
+ num_cols = num_cols[:3]
922
+
923
+ # Create interactions
924
+ for i in range(len(num_cols)):
925
+ for j in range(i+1, len(num_cols)):
926
+ col1_name = num_cols[i]
927
+ col2_name = num_cols[j]
928
+ df_engineered[f'{col1_name}_x_{col2_name}'] = df_engineered[col1_name] * df_engineered[col2_name]
929
+
930
+ # Convert categorical columns to one-hot encoding
931
+ cat_cols = df_engineered.select_dtypes(include=['object', 'category']).columns
932
+ for col in cat_cols:
933
+ dummies = pd.get_dummies(df_engineered[col], prefix=col, drop_first=True)
934
+ df_engineered = pd.concat([df_engineered, dummies], axis=1)
935
+ df_engineered.drop(columns=[col], inplace=True)
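# Sketch of what the one-hot loop above produces (illustrative only): with
# drop_first=True, a k-level category becomes k-1 indicator columns, which
# avoids perfectly collinear dummies for the linear model.
import pandas as pd
demo = pd.Series(['Night', 'Morning', 'Night'], name='Part_of_Day')
print(pd.get_dummies(demo, prefix='Part_of_Day', drop_first=True))
# Only Part_of_Day_Night remains; 'Morning' (first alphabetically) is the baseline.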
936
+
937
+ # Store the engineered data
938
+ status_text.text("Finalizing feature extraction...")
939
+ progress_bar.progress(100)
940
+ time.sleep(0.3)
941
+
942
+ st.session_state['engineered_data'] = df_engineered
943
+
944
+ status_text.empty()
945
+ progress_bar.empty()
946
+
947
+ st.success("Feature extraction completed!")
948
+
949
+ # Show correlation with target
950
+ if target_col in df_engineered.columns:
951
+ st.subheader("Feature Correlation Analysis")
952
+
953
+ # Get correlation with target
954
+ corr_with_target = df_engineered.corr()[target_col].sort_values(ascending=False)
955
+
956
+ # Remove target's correlation with itself
957
+ corr_with_target = corr_with_target.drop(target_col)
958
+
959
+ # Get top 10 positive and negative correlations
960
+ top_pos = corr_with_target.head(10)
961
+ top_neg = corr_with_target.tail(10).iloc[::-1] # Reverse to show strongest negative first
962
+
963
+ col1, col2 = st.columns(2)
964
+
965
+ with col1:
966
+ # Plot top positive correlations with improved styling
967
+ fig = px.bar(
968
+ x=top_pos.values,
969
+ y=top_pos.index,
970
+ orientation='h',
971
+ title='Top Positive Correlations with Fraud',
972
+ labels={'x': 'Correlation', 'y': 'Feature'},
973
+ color=top_pos.values,
974
+ color_continuous_scale=['#2E7D32', '#43A047', '#81C784']
975
+ )
976
+
977
+ fig.update_layout(
978
+ height=400,
979
+ template='plotly_white',
980
+ margin=dict(l=20, r=20, t=40, b=20)
981
+ )
982
+ st.plotly_chart(fig)
983
+
984
+ with col2:
985
+ # Plot top negative correlations with improved styling
986
+ fig = px.bar(
987
+ x=top_neg.values,
988
+ y=top_neg.index,
989
+ orientation='h',
990
+ title='Top Negative Correlations with Fraud',
991
+ labels={'x': 'Correlation', 'y': 'Feature'},
992
+ color=top_neg.values,
993
+ color_continuous_scale=['#81C784', '#43A047', '#2E7D32']
994
+ )
995
+
996
+ fig.update_layout(
997
+ height=400,
998
+ template='plotly_white',
999
+ margin=dict(l=20, r=20, t=40, b=20)
1000
+ )
1001
+ st.plotly_chart(fig)
1002
+
1003
+ # Correlation heatmap
1004
+ st.subheader("Feature Correlation Matrix")
1005
+
1006
+ # Get top correlated features
1007
+ corr_matrix = df_engineered.corr()
1008
+ top_corr_features = corr_with_target.abs().sort_values(ascending=False).head(15).index
1009
+
1010
+ # Create heatmap with selected features
1011
+ top_corr_matrix = corr_matrix.loc[top_corr_features, top_corr_features]
1012
+
1013
+ fig = px.imshow(
1014
+ top_corr_matrix,
1015
+ text_auto='.2f',
1016
+ color_continuous_scale=['#2E7D32', 'white', '#1976D2'],
1017
+ title='Feature Correlation Matrix'
1018
+ )
1019
+
1020
+ fig.update_layout(
1021
+ height=600,
1022
+ width=800,
1023
+ template='plotly_white',
1024
+ margin=dict(l=20, r=20, t=40, b=20)
1025
+ )
1026
+ st.plotly_chart(fig)
1027
+
1028
+ # Feature distributions
1029
+ st.subheader("Feature Distribution Analysis")
1030
+
1031
+ # Select a feature to visualize
1032
+ numeric_cols = df_engineered.select_dtypes(include=['int64', 'float64']).columns
1033
+ numeric_cols = [col for col in numeric_cols if col != target_col]
1034
+
1035
+ selected_feature = st.selectbox("Select feature to analyze", numeric_cols)
1036
+
1037
+ # Create distribution plot with improved styling
1038
+ fig = px.histogram(
1039
+ df_engineered,
1040
+ x=selected_feature,
1041
+ color=target_col,
1042
+ marginal="box",
1043
+ opacity=0.7,
1044
+ barmode="overlay",
1045
+ color_discrete_map={0: "#2E7D32", 1: "#D32F2F"},
1046
+ labels={target_col: "Class", "0": "Normal", "1": "Fraud"}
1047
+ )
1048
+
1049
+ fig.update_layout(
1050
+ title=f"Distribution Analysis of {selected_feature}",
1051
+ template='plotly_white',
1052
+ margin=dict(l=20, r=20, t=40, b=20)
1053
+ )
1054
+ st.plotly_chart(fig)
1055
+
1056
+ # Navigation buttons
1057
+ col1, col2 = st.columns([1, 5])
1058
+
1059
+ with col1:
1060
+ if st.button("← Back to Processing", key="back_to_preprocess"):
1061
+ st.session_state['current_page'] = 'preprocess'
1062
+ st.rerun()
1063
+
1064
+ with col2:
1065
+ if st.session_state['engineered_data'] is not None:
1066
+ if st.button("Continue to Model Training →", key="to_model_training"):
1067
+ st.session_state['current_page'] = 'model_training'
1068
+ st.rerun()
1069
+
1070
+ # Model Training Page
1071
+ elif st.session_state['current_page'] == 'model_training':
1072
+ st.markdown("<div class='animate-fade-in'><h2 class='sub-header'>Step 4: Model Training</h2></div>", unsafe_allow_html=True)
1073
+
1074
+ if st.session_state['engineered_data'] is None:
1075
+ st.error("No engineered data found. Please complete feature extraction first.")
1076
+ if st.button("Go back to Feature Extraction"):
1077
+ st.session_state['current_page'] = 'feature_engineering'
1078
+ st.rerun()
1079
+ else:
1080
+ df_engineered = st.session_state['engineered_data']
1081
+ target_col = st.session_state['target_col']
1082
+
1083
+ st.markdown("""
1084
+ ### Advanced Model Training
1085
+
1086
+ Train sophisticated machine learning models for fraud detection. Our system provides:
1087
+ - Multiple model architectures
1088
+ - Automated hyperparameter optimization
1089
+ - Cross-validation for robust evaluation
1090
+ - Performance metrics visualization
1091
+
1092
+ Select your preferred models and training parameters below.
1093
+ """)
1094
+
1095
+ # Training options
1096
+ st.subheader("Training Configuration")
1097
+
1098
+ col1, col2 = st.columns(2)
1099
+
1100
+ with col1:
1101
+ # Data sampling for faster training - default to a smaller sample for speed
1102
+ use_sample = st.checkbox("Use Data Sample for Faster Training", value=True,
1103
+ help="Use a sample of the data to speed up training (recommended for large datasets)")
1104
+
1105
+ if use_sample:
1106
+ sample_size = st.slider("Sample Size (%)", min_value=10, max_value=100, value=20,
1107
+ help="Percentage of data to use for training")
1108
+
1109
+ # Test size
1110
+ test_size = st.slider("Test Set Size (%)", min_value=10, max_value=50, value=20,
1111
+ help="Percentage of data to use for testing")
1112
+
1113
+ # Class balancing
1114
+ use_smote = st.checkbox("Apply SMOTE for Class Balancing", value=True,
1115
+ help="Use SMOTE to handle class imbalance")
1116
+
1117
+ with col2:
1118
+ # Model selection
1119
+ st.write("Select Models to Train:")
1120
+ train_lr = st.checkbox("Logistic Regression", value=True)
1121
+ train_rf = st.checkbox("Random Forest", value=True)
1122
+ train_xgb = st.checkbox("XGBoost", value=True)
1123
+
1124
+ # Advanced options - reduced default values for faster training
1125
+ show_advanced = st.checkbox("Show Advanced Options", value=False)
1126
+
1127
+ if show_advanced:
1128
+ # Number of estimators for tree models - reduced for speed
1129
+ n_estimators = st.slider("Number of Estimators", min_value=10, max_value=200, value=50,
1130
+ help="Number of trees for Random Forest and XGBoost (higher = more accurate but slower)")
1131
+
1132
+ # Max depth for tree models
1133
+ max_depth = st.slider("Max Tree Depth", min_value=3, max_value=15, value=6,
1134
+ help="Maximum depth of trees (higher = more complex model)")
1135
+
1136
+ # Start training
1137
+ if st.button("Train Models"):
1138
+ with st.spinner("Training models..."):
1139
+ status_container = st.empty()
1140
+ status_container.markdown(
1141
+ '<div class="loading-pulse">Training in progress... This may take a few minutes.</div>',
1142
+ unsafe_allow_html=True
1143
+ )
1144
+ # Prepare data for training
1145
+ X = df_engineered.drop(columns=[target_col])
1146
+ y = df_engineered[target_col]
1147
+
1148
+ # Use sample if selected
1149
+ if use_sample and sample_size < 100:
1150
+ sample_frac = sample_size / 100
1151
+ # Stratified sampling to maintain class distribution
1152
+ X_sample = pd.DataFrame()
1153
+ y_sample = pd.Series(dtype=y.dtype)  # give the empty Series a dtype to avoid pandas' FutureWarning
1154
+
1155
+ for class_value in y.unique():
1156
+ X_class = X[y == class_value]
1157
+ y_class = y[y == class_value]
1158
+
1159
+ n_samples = int(len(X_class) * sample_frac)
1160
+ indices = np.random.choice(X_class.index, size=n_samples, replace=False)
1161
+
1162
+ X_sample = pd.concat([X_sample, X_class.loc[indices]])
1163
+ y_sample = pd.concat([y_sample, y_class.loc[indices]])
1164
+
1165
+ X = X_sample
1166
+ y = y_sample
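# Design note (illustrative alternative, not part of the committed file): the
# per-class loop above is equivalent to pandas' grouped sampling, which also
# preserves the class ratio:
#   idx = y.groupby(y).sample(frac=sample_frac, random_state=42).index
#   X, y = X.loc[idx], y.loc[idx]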
1167
+
1168
+ # Progress bar
1169
+ progress_bar = st.progress(0)
1170
+ status_text = st.empty()
1171
+
1172
+ status_text.text("Preparing training data...")
1173
+ progress_bar.progress(10)
1174
+
1175
+ # Split data
1176
+ X_train, X_test, y_train, y_test = train_test_split(
1177
+ X, y, test_size=test_size/100, random_state=42, stratify=y
1178
+ )
1179
+
1180
+ status_text.text("Scaling features...")
1181
+ progress_bar.progress(20)
1182
+
1183
+ # Scale features
1184
+ scaler = StandardScaler()
1185
+ X_train_scaled = scaler.fit_transform(X_train)
1186
+ X_test_scaled = scaler.transform(X_test)
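# Note on the order of operations above: the scaler is fit on the training
# split only and merely applied to the test split, so test-set statistics
# cannot leak into training; fitting on the full data before splitting would
# bias the reported metrics upward.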
1187
+
1188
+ # Handle class imbalance with SMOTE if selected
1189
+ if use_smote:
1190
+ status_text.text("Applying SMOTE for class balancing...")
1191
+ progress_bar.progress(30)
1192
+
1193
+ smote = SMOTE(random_state=42)
1194
+ X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)
1195
+ else:
1196
+ X_train_resampled, y_train_resampled = X_train_scaled, y_train
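# What SMOTE does here, roughly: it synthesizes new minority-class rows by
# interpolating between a fraud sample and its nearest fraud-class neighbors
# until the classes are balanced. A quick way to see the effect (illustrative):
from collections import Counter
print("before:", Counter(y_train))             # heavily skewed toward class 0
print("after: ", Counter(y_train_resampled))   # minority upsampled to match class 0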
1197
+
1198
+ # Save preprocessor
1199
+ with open("models/scaler.pkl", "wb") as f:
1200
+ pickle.dump(scaler, f)
1201
+
1202
+ # Save feature columns
1203
+ with open("models/feature_columns.pkl", "wb") as f:
1204
+ pickle.dump(X.columns.tolist(), f)
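# Inference-side counterpart of the pickles saved above -- a minimal sketch of
# how a separate process could reuse them (illustrative; 'new_df' is a
# hypothetical DataFrame of incoming transactions):
#   with open("models/scaler.pkl", "rb") as f:
#       scaler = pickle.load(f)
#   with open("models/feature_columns.pkl", "rb") as f:
#       cols = pickle.load(f)
#   X_new = scaler.transform(new_df[cols])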
1205
+
1206
+ # Initialize results list
1207
+ results = []
1208
+ trained_models = {}
1209
+
1210
+ # Train selected models
1211
+ if train_lr:
1212
+ status_text.text("Training Logistic Regression...")
1213
+ progress_bar.progress(40)
1214
+
1215
+ # Train Logistic Regression
1216
+ lr_model = LogisticRegression(max_iter=1000, class_weight='balanced')
1217
+ lr_model.fit(X_train_resampled, y_train_resampled)
1218
+
1219
+ # Make predictions
1220
+ y_pred = lr_model.predict(X_test_scaled)
1221
+ y_pred_proba = lr_model.predict_proba(X_test_scaled)[:, 1]
1222
+
1223
+ # Calculate metrics
1224
+ accuracy = accuracy_score(y_test, y_pred)
1225
+ precision = precision_score(y_test, y_pred)
1226
+ recall = recall_score(y_test, y_pred)
1227
+ f1 = f1_score(y_test, y_pred)
1228
+ auc = roc_auc_score(y_test, y_pred_proba)
1229
+ cm = confusion_matrix(y_test, y_pred)
1230
+
1231
+ # Store results
1232
+ lr_results = {
1233
+ 'model_name': 'Logistic Regression',
1234
+ 'model': lr_model,
1235
+ 'accuracy': accuracy,
1236
+ 'precision': precision,
1237
+ 'recall': recall,
1238
+ 'f1_score': f1,
1239
+ 'auc': auc,
1240
+ 'confusion_matrix': cm,
1241
+ 'y_test': y_test,
1242
+ 'y_pred_proba': y_pred_proba
1243
+ }
1244
+
1245
+ results.append(lr_results)
1246
+ trained_models['lr'] = lr_model
1247
+
1248
+ # Save model
1249
+ with open("models/logistic_regression.pkl", "wb") as f:
1250
+ pickle.dump(lr_model, f)
1251
+
1252
+ if train_rf:
1253
+ status_text.text("Training Random Forest...")
1254
+ progress_bar.progress(60)
1255
+
1256
+ # Get parameters - use smaller values for speed
1257
+ n_est = n_estimators if show_advanced else 50
1258
+ m_depth = max_depth if show_advanced else 6
1259
+
1260
+ # Train Random Forest
1261
+ rf_model = RandomForestClassifier(
1262
+ n_estimators=n_est,
1263
+ max_depth=m_depth,
1264
+ class_weight='balanced',
1265
+ random_state=42
1266
+ )
1267
+ rf_model.fit(X_train_resampled, y_train_resampled)
1268
+
1269
+ # Make predictions
1270
+ y_pred = rf_model.predict(X_test_scaled)
1271
+ y_pred_proba = rf_model.predict_proba(X_test_scaled)[:, 1]
1272
+
1273
+ # Calculate metrics
1274
+ accuracy = accuracy_score(y_test, y_pred)
1275
+ precision = precision_score(y_test, y_pred)
1276
+ recall = recall_score(y_test, y_pred)
1277
+ f1 = f1_score(y_test, y_pred)
1278
+ auc = roc_auc_score(y_test, y_pred_proba)
1279
+ cm = confusion_matrix(y_test, y_pred)
1280
+
1281
+ # Store results
1282
+ rf_results = {
1283
+ 'model_name': 'Random Forest',
1284
+ 'model': rf_model,
1285
+ 'accuracy': accuracy,
1286
+ 'precision': precision,
1287
+ 'recall': recall,
1288
+ 'f1_score': f1,
1289
+ 'auc': auc,
1290
+ 'confusion_matrix': cm,
1291
+ 'y_test': y_test,
1292
+ 'y_pred_proba': y_pred_proba
1293
+ }
1294
+
1295
+ results.append(rf_results)
1296
+ trained_models['rf'] = rf_model
1297
+
1298
+ # Save model
1299
+ with open("models/random_forest.pkl", "wb") as f:
1300
+ pickle.dump(rf_model, f)
1301
+
1302
+ if train_xgb:
1303
+ status_text.text("Training XGBoost...")
1304
+ progress_bar.progress(80)
1305
+
1306
+ # Get parameters - use smaller values for speed
1307
+ n_est = n_estimators if show_advanced else 50
1308
+ m_depth = max_depth if show_advanced else 6
1309
+
1310
+ # Train XGBoost
1311
+ xgb_model = XGBClassifier(
1312
+ n_estimators=n_est,
1313
+ max_depth=m_depth,
1314
+ scale_pos_weight=10,  # fixed guess; the negative/positive count ratio is the usual data-driven choice
+ random_state=42,
+ use_label_encoder=False,  # deprecated in recent XGBoost releases and ignored there; harmless for older versions
1317
+ eval_metric='logloss'
1318
+ )
1319
+ xgb_model.fit(X_train_resampled, y_train_resampled)
1320
+
1321
+ # Make predictions
1322
+ y_pred = xgb_model.predict(X_test_scaled)
1323
+ y_pred_proba = xgb_model.predict_proba(X_test_scaled)[:, 1]
1324
+
1325
+ # Calculate metrics
1326
+ accuracy = accuracy_score(y_test, y_pred)
1327
+ precision = precision_score(y_test, y_pred)
1328
+ recall = recall_score(y_test, y_pred)
1329
+ f1 = f1_score(y_test, y_pred)
1330
+ auc = roc_auc_score(y_test, y_pred_proba)
1331
+ cm = confusion_matrix(y_test, y_pred)
1332
+
1333
+ # Store results
1334
+ xgb_results = {
1335
+ 'model_name': 'XGBoost',
1336
+ 'model': xgb_model,
1337
+ 'accuracy': accuracy,
1338
+ 'precision': precision,
1339
+ 'recall': recall,
1340
+ 'f1_score': f1,
1341
+ 'auc': auc,
1342
+ 'confusion_matrix': cm,
1343
+ 'y_test': y_test,
1344
+ 'y_pred_proba': y_pred_proba
1345
+ }
1346
+
1347
+ results.append(xgb_results)
1348
+ trained_models['xgb'] = xgb_model
1349
+
1350
+ # Save model
1351
+ with open("models/xgboost.pkl", "wb") as f:
1352
+ pickle.dump(xgb_model, f)
1353
+
1354
+ # Save test data
1355
+ with open("models/test_data.pkl", "wb") as f:
1356
+ pickle.dump({"X_test": X_test_scaled, "y_test": y_test}, f)
1357
+
1358
+ st.session_state['trained_models'] = trained_models
1359
+
1360
+ # Automatically make predictions on the original dataset
1361
+ status_text.text("Generating predictions...")
1362
+ progress_bar.progress(90)
1363
+
1364
+ # Find the best model based on F1 score (good for imbalanced data)
1365
+ best_model = None
1366
+ best_f1 = -1
1367
+ best_model_name = ""
1368
+
1369
+ for result in results:
1370
+ if result['f1_score'] > best_f1:
1371
+ best_f1 = result['f1_score']
1372
+ best_model = result['model']
1373
+ best_model_name = result['model_name']
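# Why F1 drives the selection above: F1 = 2*P*R / (P + R), the harmonic mean
# of precision and recall, so neither can be traded away cheaply. Worked
# example: P = 0.90, R = 0.60 gives F1 = 1.08 / 1.50 = 0.72, whereas plain
# accuracy can approach 99%+ on data this imbalanced even for a model that
# predicts "normal" for every transaction.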
1374
+
1375
+ if best_model is not None:
1376
+ # Prepare full dataset for prediction
1377
+ X_full = df_engineered.drop(columns=[target_col])
1378
+
1379
+ # Scale the data
1380
+ X_full_scaled = scaler.transform(X_full)
1381
+
1382
+ # Make predictions
1383
+ y_pred = best_model.predict(X_full_scaled)
1384
+ y_pred_proba = best_model.predict_proba(X_full_scaled)[:, 1]
1385
+
1386
+ # Add predictions to the dataframe
1387
+ df_with_predictions = df_engineered.copy()
1388
+ df_with_predictions['Fraud_Probability'] = y_pred_proba
1389
+ df_with_predictions['Predicted_Fraud'] = y_pred
1390
+
1391
+ # Store predictions
1392
+ st.session_state['predictions'] = {
1393
+ 'df': df_with_predictions,
1394
+ 'model_name': best_model_name,
1395
+ 'results': results
1396
+ }
1397
+
1398
+ status_text.text("Training completed!")
1399
+ progress_bar.progress(100)
1400
+ time.sleep(0.3)
1401
+
1402
+ status_text.empty()
1403
+ progress_bar.empty()
1404
+
1405
+ st.success("Models trained successfully!")
1406
+
1407
+ # Display comparison of results
1408
+ if results:
1409
+ st.subheader("Model Performance Analysis")
1410
+
1411
+ # Create comparison table
1412
+ comparison_df = pd.DataFrame([
1413
+ {
1414
+ 'Model': r['model_name'],
1415
+ 'Accuracy': r['accuracy'],
1416
+ 'Precision': r['precision'],
1417
+ 'Recall': r['recall'],
1418
+ 'F1 Score': r['f1_score'],
1419
+ 'AUC': r['auc']
1420
+ } for r in results
1421
+ ])
1422
+
1423
+ st.dataframe(comparison_df.style.highlight_max(axis=0, color='#81C784'))
1424
+
1425
+ # Plot metrics comparison with improved styling
1426
+ fig = px.bar(
1427
+ comparison_df.melt(id_vars=['Model'], var_name='Metric', value_name='Value'),
1428
+ x='Model',
1429
+ y='Value',
1430
+ color='Metric',
1431
+ barmode='group',
1432
+ title='Model Performance Comparison',
1433
+ labels={'Value': 'Score', 'Model': 'Model'},
1434
+ color_discrete_sequence=['#2E7D32', '#43A047', '#81C784', '#1976D2', '#D32F2F']
1435
+ )
1436
+
1437
+ fig.update_layout(
1438
+ height=500,
1439
+ template='plotly_white',
1440
+ margin=dict(l=20, r=20, t=40, b=20)
1441
+ )
1442
+ st.plotly_chart(fig)
1443
+
1444
+ # Plot ROC curves with improved styling
1445
+ st.subheader("ROC Curve Analysis")
1446
+
1447
+ fig = go.Figure()
1448
+
1449
+ colors = ['#2E7D32', '#1976D2', '#D32F2F']
1450
+
1451
+ for i, result in enumerate(results):
1452
+ model_name = result['model_name']
1453
+ y_test = result['y_test']
1454
+ y_pred_proba = result['y_pred_proba']
1455
+
1456
+ fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
1457
+ auc = result['auc']
1458
+
1459
+ fig.add_trace(go.Scatter(
1460
+ x=fpr,
1461
+ y=tpr,
1462
+ mode='lines',
1463
+ name=f'{model_name} (AUC = {auc:.3f})',
1464
+ line=dict(color=colors[i % len(colors)], width=3)
1465
+ ))
1466
+
1467
+ fig.add_trace(go.Scatter(
1468
+ x=[0, 1],
1469
+ y=[0, 1],
1470
+ mode='lines',
1471
+ name='Random',
1472
+ line=dict(dash='dash', color='#757575', width=2)
1473
+ ))
1474
+
1475
+ fig.update_layout(
1476
+ title='ROC Curve Analysis',
1477
+ xaxis_title='False Positive Rate',
1478
+ yaxis_title='True Positive Rate',
1479
+ legend=dict(x=0.01, y=0.99),
1480
+ height=500,
1481
+ template='plotly_white',
1482
+ margin=dict(l=20, r=20, t=40, b=20)
1483
+ )
1484
+
1485
+ st.plotly_chart(fig)
1486
+
1487
+ # Show confusion matrices with improved styling
1488
+ st.subheader("Confusion Matrix Analysis")
1489
+
1490
+ cols = st.columns(len(results))
1491
+
1492
+ for i, result in enumerate(results):
1493
+ with cols[i]:
1494
+ model_name = result['model_name']
1495
+ cm = result['confusion_matrix']
1496
+
1497
+ # Calculate percentages
1498
+ cm_percent = cm / cm.sum()
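# Note: cm / cm.sum() normalizes by the grand total, so each cell shows its
# share of all test transactions; row-wise normalization,
# cm / cm.sum(axis=1, keepdims=True), would instead show per-class recall.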
1499
+
1500
+ # Create annotation text
1501
+ annotations = []
1502
+ for i in range(cm.shape[0]):
1503
+ for j in range(cm.shape[1]):
1504
+ annotations.append({
1505
+ 'x': j,
1506
+ 'y': i,
1507
+ 'text': f"{cm[i, j]}<br>({cm_percent[i, j]:.1%})",
1508
+ 'showarrow': False,
1509
+ 'font': {'color': 'white' if cm_percent[i, j] > 0.5 else 'black'}
1510
+ })
1511
+
1512
+ # Create heatmap
1513
+ fig = go.Figure(data=go.Heatmap(
1514
+ z=cm,
1515
+ x=['Predicted Normal', 'Predicted Fraud'],
1516
+ y=['Actual Normal', 'Actual Fraud'],
1517
+ colorscale=[[0, '#81C784'], [1, '#2E7D32']],
1518
+ showscale=False
1519
+ ))
1520
+
1521
+ fig.update_layout(
1522
+ title=f"{model_name}",
1523
+ annotations=annotations,
1524
+ height=300,
1525
+ template='plotly_white',
1526
+ margin=dict(l=20, r=20, t=40, b=20)
1527
+ )
1528
+
1529
+ st.plotly_chart(fig)
1530
+
1531
+ # Feature importance for tree-based models with improved styling
1532
+ st.subheader("Feature Importance Analysis")
1533
+
1534
+ for result in results:
1535
+ model_name = result['model_name']
1536
+ model = result['model']
1537
+
1538
+ if model_name in ['Random Forest', 'XGBoost']:
1539
+ # Get feature importance
1540
+ if hasattr(model, 'feature_importances_'):
1541
+ importances = model.feature_importances_
1542
+ feature_names = X.columns
1543
+
1544
+ # Sort by importance
1545
+ indices = np.argsort(importances)[::-1]
1546
+ top_indices = indices[:10] # Show top 10 features for speed
1547
+
1548
+ # Create bar chart
1549
+ fig = px.bar(
1550
+ x=importances[top_indices],
1551
+ y=[feature_names[i] for i in top_indices],
1552
+ orientation='h',
1553
+ title=f'Top Features - {model_name}',
1554
+ labels={'x': 'Importance', 'y': 'Feature'},
1555
+ color=importances[top_indices],
1556
+ color_continuous_scale=['#81C784', '#43A047', '#2E7D32']
1557
+ )
1558
+
1559
+ fig.update_layout(
1560
+ height=400,
1561
+ template='plotly_white',
1562
+ margin=dict(l=20, r=20, t=40, b=20)
1563
+ )
1564
+ st.plotly_chart(fig)
1565
+
1566
+ # Navigation buttons
1567
+ col1, col2 = st.columns([1, 5])
1568
+
1569
+ with col1:
1570
+ if st.button("← Back to Feature Extraction", key="back_to_feature_eng"):
1571
+ st.session_state['current_page'] = 'feature_engineering'
1572
+ st.rerun()
1573
+
1574
+ with col2:
1575
+ if st.session_state['predictions'] is not None:
1576
+ if st.button("Continue to Results →", key="to_results"):
1577
+ st.session_state['current_page'] = 'results'
1578
+ st.rerun()
1579
+
1580
+ # Fraud Detection Results Page
1581
+ elif st.session_state['current_page'] == 'results':
1582
+ st.markdown("<div class='animate-fade-in'><h2 class='sub-header'>Step 5: Fraud Detection Results</h2></div>", unsafe_allow_html=True)
1583
+
1584
+ if st.session_state['predictions'] is None:
1585
+ st.error("No predictions found. Please complete model training first.")
1586
+ if st.button("Go back to Model Training"):
1587
+ st.session_state['current_page'] = 'model_training'
1588
+ st.rerun()
1589
+ else:
1590
+ predictions = st.session_state['predictions']
1591
+ df_with_predictions = predictions['df']
1592
+ model_name = predictions['model_name']
1593
+
1594
+ st.markdown(f"<h3 class='sub-header'>Fraud Detection Results using {model_name}</h3>", unsafe_allow_html=True)
1595
+
1596
+ # Summary of predictions
1597
+ fraud_count = df_with_predictions['Predicted_Fraud'].sum()
1598
+ total_count = len(df_with_predictions)
1599
+ fraud_percentage = (fraud_count / total_count) * 100
1600
+
1601
+ # Create metrics display with improved styling
1602
+ col1, col2, col3 = st.columns(3)
1603
+
1604
+ with col1:
1605
+ st.metric(
1606
+ label="Total Transactions",
1607
+ value=f"{total_count:,}",
1608
+ delta=None
1609
+ )
1610
+
1611
+ with col2:
1612
+ st.metric(
1613
+ label="Detected Frauds",
1614
+ value=f"{fraud_count:,}",
1615
+ delta=None
1616
+ )
1617
+
1618
+ with col3:
1619
+ st.metric(
1620
+ label="Fraud Percentage",
1621
+ value=f"{fraud_percentage:.2f}%",
1622
+ delta=None
1623
+ )
1624
+
1625
+ # Visualization of fraud distribution with improved styling
1626
+ st.subheader("Fraud Probability Distribution")
1627
+
1628
+ fig = px.histogram(
1629
+ df_with_predictions,
1630
+ x='Fraud_Probability',
1631
+ nbins=50,
1632
+ color='Predicted_Fraud',
1633
+ color_discrete_map={0: "#6200EA", 1: "#D50000"},
1634
+ labels={'Predicted_Fraud': 'Prediction', '0': 'Normal', '1': 'Fraud'},
1635
+ title='Distribution of Fraud Probabilities',
1636
+ marginal='box'
1637
+ )
1638
+
1639
+ fig.update_layout(
1640
+ height=500,
1641
+ template='plotly_white',
1642
+ margin=dict(l=20, r=20, t=40, b=20)
1643
+ )
1644
+ st.plotly_chart(fig)
1645
+
1646
+ # Show high probability transactions
1647
+ st.subheader("High Fraud Probability Transactions")
1648
+
1649
+ # Slider for probability threshold
1650
+ threshold = st.slider(
1651
+ "Fraud Probability Threshold",
1652
+ min_value=0.5,
1653
+ max_value=0.95,
1654
+ value=0.7,
1655
+ step=0.05,
1656
+ help="Transactions with fraud probability above this threshold will be shown"
1657
+ )
1658
+
1659
+ high_prob_df = df_with_predictions[df_with_predictions['Fraud_Probability'] > threshold]
1660
+
1661
+ if len(high_prob_df) > 0:
1662
+ st.write(f"Found {len(high_prob_df)} transactions with fraud probability > {threshold}")
1663
+
1664
+ # Sort by probability
1665
+ high_prob_df = high_prob_df.sort_values('Fraud_Probability', ascending=False)
1666
+
1667
+ # Select columns to display
1668
+ display_cols = ['Fraud_Probability', 'Predicted_Fraud']
1669
+
1670
+ # Add original features
1671
+ if 'Amount' in high_prob_df.columns:
1672
+ display_cols.insert(0, 'Amount')
1673
+
1674
+ if 'Time' in high_prob_df.columns:
1675
+ display_cols.insert(0, 'Time')
1676
+
1677
+ # Add target column if it exists
1678
+ if st.session_state['target_col'] in high_prob_df.columns:
1679
+ display_cols.append(st.session_state['target_col'])
1680
+
1681
+ # Display dataframe
1682
+ st.dataframe(high_prob_df[display_cols])
1683
+
1684
+ # Download button
1685
+ csv = high_prob_df.to_csv(index=False)
1686
+ st.download_button(
1687
+ label="Download High Risk Transactions",
1688
+ data=csv,
1689
+ file_name="high_risk_transactions.csv",
1690
+ mime="text/csv"
1691
+ )
1692
+ else:
1693
+ st.info(f"No transactions found with fraud probability > {threshold}")
1694
+ # Show top 10 highest probability transactions instead
1695
+ st.write("Top 10 highest fraud probability transactions:")
1696
+ st.dataframe(df_with_predictions.sort_values('Fraud_Probability', ascending=False).head(10))
1697
+
1698
+ # Compare actual vs predicted (if actual labels exist)
1699
+ target_col = st.session_state['target_col']
1700
+ if target_col in df_with_predictions.columns:
1701
+ st.subheader("Actual vs Predicted Fraud")
1702
+
1703
+ # Confusion matrix with improved styling
1704
+ cm = confusion_matrix(df_with_predictions[target_col], df_with_predictions['Predicted_Fraud'])
1705
+
1706
+ # Calculate percentages
1707
+ cm_percent = cm / cm.sum()
1708
+
1709
+ # Create annotation text
1710
+ annotations = []
1711
+ for i in range(cm.shape[0]):
1712
+ for j in range(cm.shape[1]):
1713
+ annotations.append({
1714
+ 'x': j,
1715
+ 'y': i,
1716
+ 'text': f"{cm[i, j]}<br>({cm_percent[i, j]:.1%})",
1717
+ 'showarrow': False,
1718
+ 'font': {'color': 'white' if cm_percent[i, j] > 0.5 else 'black'}
1719
+ })
1720
+
1721
+ # Create heatmap
1722
+ fig = go.Figure(data=go.Heatmap(
1723
+ z=cm,
1724
+ x=['Predicted Normal', 'Predicted Fraud'],
1725
+ y=['Actual Normal', 'Actual Fraud'],
1726
+ colorscale=[[0, '#81C784'], [1, '#2E7D32']],
1727
+ showscale=False
1728
+ ))
1729
+
1730
+ fig.update_layout(
1731
+ title=f"Confusion Matrix - {model_name}",
1732
+ annotations=annotations,
1733
+ height=400,
1734
+ template='plotly_white',
1735
+ margin=dict(l=20, r=20, t=40, b=20)
1736
+ )
1737
+
1738
+ st.plotly_chart(fig)
1739
+
1740
+ # Calculate metrics
1741
+ accuracy = accuracy_score(df_with_predictions[target_col], df_with_predictions['Predicted_Fraud'])
1742
+
1743
+ # Calculate metrics
1744
+ precision = precision_score(df_with_predictions[target_col], df_with_predictions['Predicted_Fraud'])
1745
+ recall = recall_score(df_with_predictions[target_col], df_with_predictions['Predicted_Fraud'])
1746
+ f1 = f1_score(df_with_predictions[target_col], df_with_predictions['Predicted_Fraud'])
1747
+
1748
+ # Display metrics with improved styling
1749
+ st.subheader("Performance Metrics on Full Dataset")
1750
+
1751
+ col1, col2, col3, col4 = st.columns(4)
1752
+
1753
+ with col1:
1754
+ st.metric(
1755
+ label="Accuracy",
1756
+ value=f"{accuracy:.4f}",
1757
+ delta=None
1758
+ )
1759
+
1760
+ with col2:
1761
+ st.metric(
1762
+ label="Precision",
1763
+ value=f"{precision:.4f}",
1764
+ delta=None
1765
+ )
1766
+
1767
+ with col3:
1768
+ st.metric(
1769
+ label="Recall",
1770
+ value=f"{recall:.4f}",
1771
+ delta=None
1772
+ )
1773
+
1774
+ with col4:
1775
+ st.metric(
1776
+ label="F1 Score",
1777
+ value=f"{f1:.4f}",
1778
+ delta=None
1779
+ )
1780
+
1781
+ # Download all predictions
1782
+ st.subheader("Download Results")
1783
+
1784
+ csv = df_with_predictions.to_csv(index=False)
1785
+ st.download_button(
1786
+ label="Download All Predictions as CSV",
1787
+ data=csv,
1788
+ file_name="fraud_predictions.csv",
1789
+ mime="text/csv"
1790
+ )
1791
+
1792
+ # Navigation buttons
1793
+ col1, col2 = st.columns([1, 5])
1794
+
1795
+ with col1:
1796
+ if st.button("← Back to Model Training", key="back_to_model_training"):
1797
+ st.session_state['current_page'] = 'model_training'
1798
+ st.rerun()
1799
+
1800
+ with col2:
1801
+ if st.button("Start Over", key="start_over"):
1802
+ # Reset session state
1803
+ for key in list(st.session_state.keys()):
1804
+ del st.session_state[key]
1805
+ st.session_state['current_page'] = 'home'
1806
+ st.rerun()
1807
+
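The per-cell count/percentage annotation logic above is written out twice in app.py (once per heatmap). As a sketch only, it could be factored into a small helper; the name cm_annotations is hypothetical and not part of this upload:

    def cm_annotations(cm):
        # Hypothetical helper, not part of this upload. Builds the Plotly
        # annotation dicts used by the confusion-matrix heatmaps above;
        # cm is assumed to be a 2D numpy array from sklearn's confusion_matrix.
        cm_percent = cm / cm.sum()
        annotations = []
        for r in range(cm.shape[0]):
            for c in range(cm.shape[1]):
                annotations.append({
                    'x': c,
                    'y': r,
                    'text': f"{cm[r, c]}<br>({cm_percent[r, c]:.1%})",
                    'showarrow': False,
                    'font': {'color': 'white' if cm_percent[r, c] > 0.5 else 'black'},
                })
        return annotations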
data_exploration.py ADDED
@@ -0,0 +1,139 @@
+ # pages/data_exploration.py
+ import streamlit as st
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import os
+ from utils.data_processor import DataProcessor
+ from utils.visualizer import Visualizer
+
+ def app():
+     st.title("Data Exploration")
+
+     # Initialize classes
+     data_processor = DataProcessor()
+     visualizer = Visualizer()
+
+     # Load data function
+     @st.cache_data
+     def load_data():
+         # Check if the default dataset exists in the data directory
+         data_path = "data/creditcard.csv"
+         if os.path.exists(data_path):
+             return pd.read_csv(data_path)
+         else:
+             st.warning("Default dataset not found. Please upload a dataset.")
+             return None
+
+     # Load data
+     df = load_data()
+     if df is None:
+         uploaded_file = st.file_uploader("Upload a CSV file", type="csv")
+         if uploaded_file is not None:
+             df = pd.read_csv(uploaded_file)
+             os.makedirs("data", exist_ok=True)  # ensure the data directory exists
+             df.to_csv("data/uploaded_data.csv", index=False)
+
+     if df is not None:
+         st.write(f"Dataset shape: {df.shape[0]} rows and {df.shape[1]} columns")
+
+         # Data overview
+         st.header("Data Overview")
+         st.write(df.head())
+
+         # Data information
+         st.header("Data Information")
+         buffer = pd.DataFrame({
+             'Column': df.columns,
+             'Type': df.dtypes,
+             'Non-Null Count': df.count(),
+             'Null Count': df.isnull().sum(),
+             'Unique Values': [df[col].nunique() for col in df.columns]
+         })
+         st.write(buffer)
+
+         # Statistical summary
+         st.header("Statistical Summary")
+         st.write(df.describe())
+
+         # Class distribution
+         st.header("Class Distribution")
+         if 'Class' in df.columns:
+             fig = visualizer.plot_class_distribution(df)
+             st.pyplot(fig)
+
+             # Calculate fraud percentage
+             fraud_percentage = df['Class'].mean() * 100
+             st.write(f"Fraud transactions: {fraud_percentage:.2f}% of the dataset")
+         else:
+             st.warning("No 'Class' column found in the dataset. Please ensure your target variable is named 'Class'.")
+
+         # Feature distributions
+         st.header("Feature Distributions")
+         num_features = st.slider("Number of features to display", 1, min(10, len(df.columns) - 1), 5)
+         fig = visualizer.plot_feature_distributions(df, n_features=num_features)
+         st.pyplot(fig)
+
+         # Correlation matrix
+         st.header("Correlation Matrix")
+         fig = visualizer.plot_correlation_matrix(df)
+         st.pyplot(fig)
+
+         # Transaction amount analysis
+         if 'Amount' in df.columns:
+             st.header("Transaction Amount Analysis")
+
+             col1, col2 = st.columns(2)
+
+             with col1:
+                 st.subheader("Amount Distribution")
+                 fig, ax = plt.subplots(figsize=(10, 6))
+                 sns.histplot(data=df, x='Amount', bins=50, kde=True, ax=ax)
+                 st.pyplot(fig)
+
+             with col2:
+                 if 'Class' in df.columns:
+                     st.subheader("Amount by Class")
+                     fig, ax = plt.subplots(figsize=(10, 6))
+                     sns.boxplot(x='Class', y='Amount', data=df, ax=ax)
+                     st.pyplot(fig)
+
+         # Time analysis
+         if 'Time' in df.columns:
+             st.header("Transaction Time Analysis")
+
+             # Convert seconds to hour of day
+             df_time = df.copy()
+             df_time['Hour'] = (df_time['Time'] / 3600) % 24
+
+             fig, ax = plt.subplots(figsize=(12, 6))
+             if 'Class' in df.columns:
+                 sns.histplot(data=df_time, x='Hour', hue='Class', bins=24, kde=True, ax=ax)
+             else:
+                 sns.histplot(data=df_time, x='Hour', bins=24, kde=True, ax=ax)
+             plt.title('Transaction Distribution by Hour of Day')
+             plt.xlabel('Hour of Day')
+             plt.ylabel('Number of Transactions')
+             st.pyplot(fig)
+
+         # Feature analysis for fraud detection
+         if 'Class' in df.columns:
+             st.header("Feature Analysis for Fraud Detection")
+
+             # Select top features correlated with fraud (numeric columns only,
+             # so uploads containing non-numeric columns don't break corr())
+             corr_with_fraud = df.select_dtypes(include=np.number).corr()['Class'].sort_values(ascending=False)
+             top_features = corr_with_fraud[1:6].index.tolist()  # skip 'Class' itself
+
+             st.subheader("Top Features Correlated with Fraud")
+             st.write(corr_with_fraud[1:11])  # show top 10 correlations
+
+             # Plot distributions of top features by fraud/non-fraud
+             st.subheader("Distributions of Top Features by Class")
+             for feature in top_features:
+                 fig, ax = plt.subplots(figsize=(10, 6))
+                 sns.histplot(data=df, x=feature, hue='Class', bins=50, kde=True, ax=ax)
+                 plt.title(f'Distribution of {feature} by Class')
+                 st.pyplot(fig)
+
+ if __name__ == "__main__":
+     app()
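Since each page module exposes an app() entry point, a top-level router can dispatch to it. A minimal sketch, assuming pages/ is importable as a package (the router itself is not part of this upload):

    import streamlit as st
    from pages import data_exploration  # assumed package layout with __init__.py

    PAGES = {"Data Exploration": data_exploration.app}

    choice = st.sidebar.selectbox("Page", list(PAGES))
    PAGES[choice]()  # run the selected page's app() entry point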
data_processor.py ADDED
@@ -0,0 +1,115 @@
+ import pandas as pd
+ import numpy as np
+ from sklearn.preprocessing import StandardScaler, OneHotEncoder
+ from sklearn.compose import ColumnTransformer
+ from sklearn.pipeline import Pipeline
+ from imblearn.over_sampling import SMOTE
+ from sklearn.model_selection import train_test_split
+ from sklearn import __version__ as sklearn_version
+ from packaging import version
+
+ class DataProcessor:
+     def __init__(self):
+         self.scaler = StandardScaler()
+
+         # Handle the sparse -> sparse_output rename across scikit-learn versions
+         if version.parse(sklearn_version) >= version.parse('1.2.0'):
+             self.encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
+         else:
+             self.encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
+
+     def load_data(self, file_path):
+         """Load the dataset from a CSV file"""
+         try:
+             df = pd.read_csv(file_path)
+             return df
+         except Exception as e:
+             print(f"Error loading data: {e}")
+             return None
+
+     def preprocess_data(self, df, target_col='Class'):
+         """Preprocess the data for model training"""
+         # Impute missing values with column means (numeric columns only)
+         df = df.fillna(df.mean(numeric_only=True))
+
+         # Separate features and target
+         X = df.drop(columns=[target_col])
+         y = df[target_col]
+
+         # Split data into train and test sets
+         X_train, X_test, y_train, y_test = train_test_split(
+             X, y, test_size=0.2, random_state=42, stratify=y
+         )
+
+         # Identify numerical and categorical features
+         num_features = X.select_dtypes(include=['int64', 'float64']).columns
+         cat_features = X.select_dtypes(include=['object', 'category']).columns
+
+         # Build the preprocessing pipeline, reusing the version-appropriate
+         # encoder constructed in __init__
+         transformers = [('num', StandardScaler(), num_features)]
+         if len(cat_features) > 0:
+             transformers.append(('cat', self.encoder, cat_features))
+         preprocessor = ColumnTransformer(transformers=transformers)
+
+         # Fit and transform the training data
+         X_train_processed = preprocessor.fit_transform(X_train)
+         X_test_processed = preprocessor.transform(X_test)
+
+         # Handle class imbalance using SMOTE (training split only)
+         smote = SMOTE(random_state=42)
+         X_train_resampled, y_train_resampled = smote.fit_resample(X_train_processed, y_train)
+
+         return X_train_resampled, X_test_processed, y_train_resampled, y_test, preprocessor
+
+     def engineer_features(self, df):
+         """Create new features for fraud detection"""
+         # Copy the dataframe to avoid modifying the original
+         df_new = df.copy()
+
+         # If a Time column exists, create time-based features
+         if 'Time' in df_new.columns:
+             # Convert seconds to hour of day (assuming Time is seconds from a reference point)
+             df_new['Hour'] = (df_new['Time'] / 3600) % 24
+
+             # Flag transactions during odd hours (midnight to 5 AM)
+             df_new['Odd_Hour'] = ((df_new['Hour'] >= 0) & (df_new['Hour'] < 5)).astype(int)
+
+         # If an Amount column exists, create amount-based features
+         if 'Amount' in df_new.columns:
+             # Log transform the amount to handle its skewed distribution
+             df_new['Log_Amount'] = np.log1p(df_new['Amount'])
+
+             # Flag high-value transactions (top 5%)
+             threshold = df_new['Amount'].quantile(0.95)
+             df_new['High_Value'] = (df_new['Amount'] > threshold).astype(int)
+
+         # Transaction frequency features (assuming a card or account ID column)
+         if 'card_id' in df_new.columns:
+             # Number of transactions per card
+             tx_count = df_new.groupby('card_id').size().reset_index(name='Tx_Count')
+             df_new = df_new.merge(tx_count, on='card_id', how='left')
+
+             # Average transaction amount per card
+             avg_amount = df_new.groupby('card_id')['Amount'].mean().reset_index(name='Avg_Amount')
+             df_new = df_new.merge(avg_amount, on='card_id', how='left')
+
+             # Transaction amount deviation from the card's average
+             df_new['Amount_Deviation'] = df_new['Amount'] - df_new['Avg_Amount']
+
+         return df_new
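A minimal end-to-end sketch of how this class is meant to be used, assuming a CSV with a 'Class' target column (the file path is illustrative):

    from utils.data_processor import DataProcessor

    processor = DataProcessor()
    df = processor.load_data("data/creditcard.csv")  # illustrative path
    df = processor.engineer_features(df)             # adds Hour, Log_Amount, ...
    X_train, X_test, y_train, y_test, preprocessor = processor.preprocess_data(df, target_col='Class')
    print(X_train.shape, X_test.shape)               # SMOTE balances only the training split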
engineered_data.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3bb2af06deaefb7427a0878982917cbf2ee8270aa79339730a01b1e1972a3c00
+ size 162508357
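Note: this and the other two .csv entries in the commit are Git LFS pointer stubs, not the data itself; the ~150 MB files live in LFS storage, per the filter=lfs rules in the gitattributes file that follows.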
gitattributes ADDED
@@ -0,0 +1,38 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ engineered_data.csv filter=lfs diff=lfs merge=lfs -text
+ preprocessed_data.csv filter=lfs diff=lfs merge=lfs -text
+ uploaded_data.csv filter=lfs diff=lfs merge=lfs -text
gitkeep ADDED
File without changes
model_trainer.py ADDED
@@ -0,0 +1,121 @@
+ # utils/model_trainer.py (updated)
+ import pandas as pd
+ import numpy as np
+ import pickle
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.ensemble import RandomForestClassifier
+ from xgboost import XGBClassifier
+ from tensorflow.keras.models import Sequential
+ from tensorflow.keras.layers import Dense, Dropout
+ from sklearn.metrics import (
+     accuracy_score, precision_score, recall_score, f1_score,
+     roc_auc_score, confusion_matrix, classification_report
+ )
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import warnings
+
+ # Suppress warnings
+ warnings.filterwarnings('ignore')
+
+ class ModelTrainer:
+     def __init__(self):
+         self.models = {
+             'Logistic Regression': LogisticRegression(max_iter=1000, class_weight='balanced'),
+             'Random Forest': RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
+             'XGBoost': XGBClassifier(scale_pos_weight=10, n_estimators=100, random_state=42, use_label_encoder=False, eval_metric='logloss')
+         }
+         self.neural_net = None
+
+     def train_models(self, X_train, y_train):
+         """Train multiple machine learning models"""
+         trained_models = {}
+
+         for name, model in self.models.items():
+             print(f"Training {name}...")
+             model.fit(X_train, y_train)
+             trained_models[name] = model
+
+         return trained_models
+
+     def train_neural_network(self, X_train, y_train, input_dim):
+         """Train a neural network model"""
+         model = Sequential([
+             Dense(64, activation='relu', input_dim=input_dim),
+             Dropout(0.3),
+             Dense(32, activation='relu'),
+             Dropout(0.3),
+             Dense(16, activation='relu'),
+             Dense(1, activation='sigmoid')
+         ])
+
+         model.compile(
+             optimizer='adam',
+             loss='binary_crossentropy',
+             metrics=['accuracy']
+         )
+
+         history = model.fit(
+             X_train, y_train,
+             epochs=20,
+             batch_size=64,
+             validation_split=0.2,
+             verbose=1
+         )
+
+         self.neural_net = model
+         return model, history
+
+     def evaluate_model(self, model, X_test, y_test, model_name="Model"):
+         """Evaluate model performance with various metrics"""
+         if model_name == "Neural Network":
+             # Keras returns an (n, 1) array; flatten it for the metric functions
+             y_pred_proba = model.predict(X_test).ravel()
+             y_pred = (y_pred_proba > 0.5).astype(int)
+         else:
+             y_pred = model.predict(X_test)
+             y_pred_proba = model.predict_proba(X_test)[:, 1]
+
+         # Calculate metrics
+         accuracy = accuracy_score(y_test, y_pred)
+         precision = precision_score(y_test, y_pred)
+         recall = recall_score(y_test, y_pred)
+         f1 = f1_score(y_test, y_pred)
+         auc = roc_auc_score(y_test, y_pred_proba)
+
+         # Create confusion matrix
+         cm = confusion_matrix(y_test, y_pred)
+
+         # Detailed classification report
+         report = classification_report(y_test, y_pred)
+
+         results = {
+             'model_name': model_name,
+             'accuracy': accuracy,
+             'precision': precision,
+             'recall': recall,
+             'f1_score': f1,
+             'auc': auc,
+             'confusion_matrix': cm,
+             'classification_report': report,
+             'y_test': y_test,
+             'y_pred_proba': y_pred_proba
+         }
+
+         return results
+
+     def save_model(self, model, file_path):
+         """Save the trained model to a file"""
+         if isinstance(model, Sequential):
+             model.save(file_path)
+         else:
+             with open(file_path, 'wb') as f:
+                 pickle.dump(model, f)
+
+     def load_model(self, file_path, model_type='sklearn'):
+         """Load a trained model from a file"""
+         if model_type == 'keras':
+             from tensorflow.keras.models import load_model
+             return load_model(file_path)
+         else:
+             with open(file_path, 'rb') as f:
+                 return pickle.load(f)
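A minimal sketch of the intended training/evaluation loop, using the split produced by DataProcessor.preprocess_data (variable names assumed from that module):

    from utils.model_trainer import ModelTrainer

    trainer = ModelTrainer()
    trained = trainer.train_models(X_train, y_train)
    for name, model in trained.items():
        results = trainer.evaluate_model(model, X_test, y_test, model_name=name)
        print(name, f"AUC={results['auc']:.3f}", f"F1={results['f1_score']:.3f}")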
preprocessed_data.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:895dc3dad2840ac9e05c12d6442bd739d879c2d405e9b065efc0f1973be46a84
+ size 151102405
requirements.txt ADDED
@@ -0,0 +1,11 @@
+ streamlit
+ pandas
+ numpy
+ scikit-learn
+ matplotlib
+ seaborn
+ plotly
+ imbalanced-learn
+ xgboost
+ tensorflow
+ shap
uploaded_data.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:895dc3dad2840ac9e05c12d6442bd739d879c2d405e9b065efc0f1973be46a84
+ size 151102405
visualizer.py ADDED
@@ -0,0 +1,162 @@
+ import pandas as pd
+ import numpy as np
+ import matplotlib.pyplot as plt
+ import seaborn as sns
+ import plotly.express as px
+ import plotly.graph_objects as go
+ from sklearn.metrics import roc_curve, precision_recall_curve
+ import shap
+
+ class Visualizer:
+     def __init__(self):
+         pass
+
+     def plot_class_distribution(self, df, target_col='Class'):
+         """Plot the distribution of fraud vs non-fraud transactions"""
+         plt.figure(figsize=(10, 6))
+         sns.countplot(x=target_col, data=df)
+         plt.title('Class Distribution (Fraud vs Non-Fraud)')
+         plt.xlabel('Class (0: Normal, 1: Fraud)')
+         plt.ylabel('Count')
+
+         # Add percentage labels
+         total = len(df)
+         for p in plt.gca().patches:
+             height = p.get_height()
+             plt.text(p.get_x() + p.get_width() / 2.,
+                      height + 3,
+                      '{:.2f}%'.format(100 * height / total),
+                      ha="center")
+
+         # Return the Figure object so callers can pass it to st.pyplot()
+         return plt.gcf()
+
+     def plot_feature_distributions(self, df, target_col='Class', n_features=5):
+         """Plot distributions of the first n numerical features by class"""
+         # Select numerical columns only
+         num_cols = df.select_dtypes(include=['int64', 'float64']).columns
+         num_cols = [col for col in num_cols if col != target_col]
+
+         # If there are too many features, select a subset
+         if len(num_cols) > n_features:
+             num_cols = num_cols[:n_features]
+
+         # Create subplots
+         fig, axes = plt.subplots(len(num_cols), 1, figsize=(12, 4 * len(num_cols)))
+
+         # If there's only one feature, axes won't be an array
+         if len(num_cols) == 1:
+             axes = [axes]
+
+         for i, col in enumerate(num_cols):
+             sns.histplot(data=df, x=col, hue=target_col, bins=50, ax=axes[i], kde=True)
+             axes[i].set_title(f'Distribution of {col} by Class')
+
+         plt.tight_layout()
+         return fig
+
+     def plot_correlation_matrix(self, df, target_col='Class'):
+         """Plot correlation matrix of features"""
+         # Calculate correlation matrix
+         corr_matrix = df.corr()
+
+         # Create heatmap (upper triangle masked to avoid duplication)
+         plt.figure(figsize=(12, 10))
+         mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
+         sns.heatmap(corr_matrix, mask=mask, annot=False, cmap='coolwarm',
+                     linewidths=0.5, vmin=-1, vmax=1)
+         plt.title('Feature Correlation Matrix')
+
+         return plt.gcf()
+
+     def plot_feature_importance(self, model, feature_names, model_name="Model"):
+         """Plot feature importance for tree-based models"""
+         if hasattr(model, 'feature_importances_'):
+             # Get feature importances
+             importances = model.feature_importances_
+
+             # Sort feature importances in descending order
+             indices = np.argsort(importances)[::-1]
+
+             # Rearrange feature names so they match the sorted importances
+             names = [feature_names[i] for i in indices]
+
+             # Create plot
+             plt.figure(figsize=(12, 8))
+             plt.title(f"Feature Importance ({model_name})")
+             plt.bar(range(len(importances)), importances[indices])
+             plt.xticks(range(len(importances)), names, rotation=90)
+             plt.tight_layout()
+
+             return plt.gcf()
+         else:
+             print(f"Model {model_name} doesn't have a feature_importances_ attribute")
+             return None
+
+     def plot_roc_curve(self, models_results):
+         """Plot ROC curves for multiple models"""
+         plt.figure(figsize=(10, 8))
+
+         for result in models_results:
+             model_name = result['model_name']
+             y_test = result['y_test']
+             y_pred_proba = result['y_pred_proba']
+
+             fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
+             auc = result['auc']
+
+             plt.plot(fpr, tpr, label=f'{model_name} (AUC = {auc:.3f})')
+
+         plt.plot([0, 1], [0, 1], 'k--')
+         plt.xlabel('False Positive Rate')
+         plt.ylabel('True Positive Rate')
+         plt.title('ROC Curve')
+         plt.legend(loc='best')
+
+         return plt.gcf()
+
+     def plot_precision_recall_curve(self, models_results):
+         """Plot Precision-Recall curves for multiple models"""
+         plt.figure(figsize=(10, 8))
+
+         for result in models_results:
+             model_name = result['model_name']
+             y_test = result['y_test']
+             y_pred_proba = result['y_pred_proba']
+
+             precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
+
+             plt.plot(recall, precision, label=f'{model_name}')
+
+         plt.xlabel('Recall')
+         plt.ylabel('Precision')
+         plt.title('Precision-Recall Curve')
+         plt.legend(loc='best')
+
+         return plt.gcf()
+
+     def plot_confusion_matrix(self, cm, model_name="Model"):
+         """Plot confusion matrix"""
+         plt.figure(figsize=(8, 6))
+         sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
+         plt.title(f'Confusion Matrix - {model_name}')
+         plt.ylabel('Actual')
+         plt.xlabel('Predicted')
+
+         return plt.gcf()
+
+     def plot_shap_values(self, model, X_test, feature_names, model_name="Model"):
+         """Plot SHAP values to explain model predictions"""
+         # Create explainer
+         if model_name == "XGBoost":
+             explainer = shap.TreeExplainer(model)
+         else:
+             explainer = shap.Explainer(model)
+
+         # Calculate SHAP values
+         shap_values = explainer.shap_values(X_test)
+
+         # Summary plot
+         plt.figure(figsize=(12, 8))
+         shap.summary_plot(shap_values, X_test, feature_names=feature_names)
+
+         return plt.gcf()
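A minimal usage sketch: the curve methods return a matplotlib Figure built from the result dicts produced by ModelTrainer.evaluate_model, so they drop straight into Streamlit:

    import streamlit as st
    from utils.visualizer import Visualizer

    viz = Visualizer()
    fig = viz.plot_roc_curve([results])  # `results` from ModelTrainer.evaluate_model
    st.pyplot(fig)                       # or fig.savefig("roc_curve.png") outside Streamlit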