Spaces:

saherPervaiz
/

Depression

Sleeping

File size: 8,454 Bytes

import streamlit as st
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import io

# Import custom functions from your utils
from utils.data_cleaning import preprocess_data, remove_outliers_iqr, cap_extreme_values, convert_string_to_numeric
from utils.model_training import train_all_models

# New Function: Combined Histogram and Bar Plot Comparison
def combined_histogram_barplot(df):
    """
    Create one tall figure with a histogram per numeric column and a bar plot
    per categorical column.

    Args:
        df: DataFrame to plot. Columns of dtype float64/int64 get a histogram,
            columns of dtype object get a bar plot of their value counts;
            columns of any other dtype are skipped.

    Returns:
        The matplotlib Figure holding one subplot row per plotted column.
        When no column qualifies, a figure containing only a short notice
        is returned instead of raising.
    """
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    categorical_columns = df.select_dtypes(include=['object']).columns
    n_plots = len(numeric_columns) + len(categorical_columns)

    # plt.subplots rejects a zero-row grid, so handle "nothing to plot" up front.
    if n_plots == 0:
        fig, ax = plt.subplots(figsize=(10, 5))
        ax.text(0.5, 0.5, "No numeric or categorical columns to plot",
                ha='center', va='center')
        ax.set_axis_off()
        return fig

    # squeeze=False keeps `axes` a 2-D array even when there is a single
    # subplot; without it a lone Axes object is returned and indexing fails.
    fig, axes = plt.subplots(n_plots, 1, figsize=(10, 5 * n_plots), squeeze=False)
    axes = axes[:, 0]

    # Histogram for numeric columns (NaNs dropped: hist cannot bin them)
    for ax, col in zip(axes, numeric_columns):
        ax.hist(df[col].dropna(), bins=20, color='blue', alpha=0.7, edgecolor='black')
        ax.set_title(f"Histogram of {col}")
        ax.set_xlabel(col)
        ax.set_ylabel("Frequency")

    # Bar plots for categorical columns fill the remaining subplot rows
    for ax, col in zip(axes[len(numeric_columns):], categorical_columns):
        df[col].value_counts().plot(kind='bar', ax=ax, color='orange', alpha=0.7, edgecolor='black')
        ax.set_title(f"Bar Plot of {col}")
        ax.set_xlabel(col)
        ax.set_ylabel("Count")

    fig.tight_layout()
    return fig

# Plotting Functions
def plot_correlation_heatmap(df):
    """
    Plot a correlation heatmap for the numeric columns in the dataframe.

    Args:
        df: DataFrame whose numeric (and boolean) columns are correlated.
            Columns of any other dtype are ignored.

    Returns:
        The matplotlib Figure containing the annotated heatmap.
    """
    # Restrict to numeric data explicitly: recent pandas versions no longer
    # drop non-numeric columns silently in DataFrame.corr() and raise instead.
    numeric_df = df.select_dtypes(include=["number", "bool"])
    corr = numeric_df.corr()

    # Draw on an explicit Axes so the result does not depend on whichever
    # figure happens to be "current" in pyplot's global state.
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5, ax=ax)
    ax.set_title("Correlation Heatmap")
    return fig

def save_figure_as_png(fig):
    """
    Render the given figure as PNG into an in-memory buffer.

    Returns the io.BytesIO buffer, rewound to position 0 so it can be read
    (or handed to a download widget) immediately.
    """
    png_stream = io.BytesIO()
    fig.savefig(png_stream, format="png")
    # Rewind: savefig leaves the cursor at the end of the written data.
    png_stream.seek(0)
    return png_stream

def plot_histogram(df, column):
    """
    Draw a 30-bin histogram with a KDE overlay for one column of the
    dataframe and return the figure it was drawn on.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.histplot(df[column], kde=True, bins=30, color="skyblue", ax=ax)
    ax.set_title(f"Histogram of {column}")
    ax.set_xlabel(column)
    ax.set_ylabel("Frequency")
    return fig

def plot_box_plot(df, column):
    """
    Draw a horizontal box plot of one dataframe column on a fresh 8x6 figure
    and return that figure.
    """
    figure = plt.figure(figsize=(8, 6))
    values = df[column]
    sns.boxplot(x=values)
    plt.title(f"Box Plot of {column}")
    return figure

def plot_pair_plot(df):
    """
    Build a seaborn pair plot over the float64/int64 columns of the dataframe
    and return the object produced by sns.pairplot.
    """
    wanted_dtypes = ['float64', 'int64']
    selected = df.select_dtypes(include=wanted_dtypes).columns
    numeric_frame = df[selected]
    return sns.pairplot(numeric_frame)

def plot_scatter_plot(df, x_col, y_col):
    """
    Draw a green scatter plot of column y_col against column x_col on a new
    8x6 figure and return that figure.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    xs = df[x_col]
    ys = df[y_col]
    sns.scatterplot(x=xs, y=ys, color="green", ax=ax)
    ax.set_title(f"Scatter Plot between {x_col} and {y_col}")
    return fig

def plot_bar_plot(df, column):
    """
    Draw a count plot (one bar per distinct value) for a column on a new
    8x6 figure and return that figure.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.countplot(x=df[column], ax=ax)
    ax.set_title(f"Bar Plot of {column}")
    return fig

# Streamlit App Title
st.title("Data Analysis, Model Training, and Visualization")


def _show_and_close(fig):
    """
    Render a matplotlib figure in the app, then close it.

    pyplot keeps a reference to every figure it creates until it is closed,
    and this script is re-executed on each widget interaction, so figures
    that are never closed pile up in memory.
    """
    st.pyplot(fig)
    plt.close(fig)


# File Uploader
uploaded_file = st.file_uploader("Upload a CSV file for data analysis", type=["csv"])

if uploaded_file is not None:
    try:
        # Load dataset. Kept inside the try so an empty or malformed CSV is
        # reported through st.error instead of surfacing as a raw traceback.
        df = pd.read_csv(uploaded_file)
        st.write("### Dataset Preview")
        st.dataframe(df)

        # Data Cleaning
        st.subheader("Data Cleaning")
        st.write("Handling missing values, removing outliers, and capping extreme values...")
        df_cleaned = preprocess_data(df)
        df_cleaned = remove_outliers_iqr(df_cleaned)
        df_cleaned = cap_extreme_values(df_cleaned)

        # Convert string columns to numeric (if any)
        st.subheader("String to Numeric Conversion")
        st.write("Converting string categorical values to numeric using Label Encoding...")
        df_cleaned = convert_string_to_numeric(df_cleaned)

        st.write("### Cleaned Dataset")
        st.dataframe(df_cleaned)

        # Download option for cleaned dataset
        st.download_button(
            label="Download Cleaned Dataset (CSV)",
            data=df_cleaned.to_csv(index=False),
            file_name="cleaned_dataset.csv",
            mime="text/csv"
        )

        # Correlation Heatmap
        st.subheader("Correlation Heatmap")
        st.write("Visualizing correlations between numeric features...")
        heatmap_fig = plot_correlation_heatmap(df_cleaned)
        st.pyplot(heatmap_fig)  # Display the heatmap using Streamlit

        # Save the heatmap as PNG for download; the figure is closed only
        # after it has been serialized into the buffer.
        heatmap_buffer = save_figure_as_png(heatmap_fig)
        plt.close(heatmap_fig)

        st.download_button(
            label="Download Correlation Heatmap (PNG)",
            data=heatmap_buffer,
            file_name="correlation_heatmap.png",
            mime="image/png"
        )

        # Additional Visualizations
        st.subheader("Additional Visualizations")

        numeric_columns = df_cleaned.select_dtypes(include=['float64', 'int64']).columns.tolist()
        categorical_columns = df_cleaned.select_dtypes(include=['object']).columns.tolist()

        # Combined Histogram and Bar Plot
        st.subheader("Combined Histogram and Bar Plot")
        _show_and_close(combined_histogram_barplot(df_cleaned))

        if numeric_columns:
            # Distribution Plot
            st.write("### Distribution Plots (Histograms)")
            for col in numeric_columns:
                st.write(f"#### {col}")
                _show_and_close(plot_histogram(df_cleaned, col))

            # Box Plot
            st.write("### Box Plots (Outlier Detection)")
            for col in numeric_columns:
                st.write(f"#### {col}")
                _show_and_close(plot_box_plot(df_cleaned, col))

        if len(numeric_columns) > 1:
            # Pair Plot
            st.write("### Pair Plot")
            pair_plot = plot_pair_plot(df_cleaned)
            st.pyplot(pair_plot)
            # plt.close needs the underlying Figure, not the seaborn grid.
            # Newer seaborn exposes it as `.figure`, older releases as `.fig`.
            pair_fig = getattr(pair_plot, "figure", None)
            if pair_fig is None:
                pair_fig = pair_plot.fig
            plt.close(pair_fig)

            # Scatter Plot
            st.write("### Scatter Plot")
            x_col = st.selectbox("Select X-axis for Scatter Plot", numeric_columns)
            # Default the Y axis to the second column so the initial plot is
            # not a column plotted against itself.
            y_col = st.selectbox("Select Y-axis for Scatter Plot", numeric_columns, index=1)
            if x_col and y_col:
                _show_and_close(plot_scatter_plot(df_cleaned, x_col, y_col))

        # Bar Plot
        if categorical_columns:
            st.write("### Bar Plots (For Categorical Data)")
            for col in categorical_columns:
                st.write(f"#### {col}")
                _show_and_close(plot_bar_plot(df_cleaned, col))

        # Select Target and Features
        st.subheader("Feature and Target Selection")
        target = st.selectbox("Select Target Variable", df_cleaned.columns)
        features = [col for col in df_cleaned.columns if col != target]

        if not features:
            st.warning("No features available after removing the target variable.")
        else:
            X = df_cleaned[features]
            y = df_cleaned[target]

            # Train and Evaluate Models
            st.subheader("Model Training and Evaluation")
            st.write("Training models and calculating metrics...")
            model_results = train_all_models(X, y)

            st.write("### Model Training Results")
            st.dataframe(model_results)

            # Download option for model results
            st.download_button(
                label="Download Model Results (CSV)",
                data=model_results.to_csv(index=False),
                file_name="model_results.csv",
                mime="text/csv"
            )
    except Exception as e:
        # Top-level UI boundary: show any failure to the user rather than
        # letting the app crash with a traceback.
        st.error(f"An error occurred: {e}")
else:
    st.info("Please upload a CSV file to proceed.")