# Hugging Face Spaces: AbdramaneB / Pima_reporting (status: Sleeping)
# File size: 6,242 Bytes

import streamlit as st
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd

# Load data
def load_data(path="./processed_data.csv") -> pd.DataFrame:
    """Read the dashboard dataset from a CSV file.

    The dashboard expects the file to contain the columns
    preg, plas, pres, skin, test, mass, pedi, age and class.

    Args:
        path: Location of the CSV file, in any form ``pd.read_csv``
            accepts. Defaults to ``./processed_data.csv``, which is
            resolved against the current working directory.

    Returns:
        The parsed CSV as a DataFrame.

    Raises:
        FileNotFoundError: If no file exists at ``path``.
    """
    return pd.read_csv(path)

# --- Streamlit app: constants, rendering helpers and the app() entry point ---

# Columns the dashboard reads; app() stops with an error when any is absent.
_EXPECTED_COLUMNS = frozenset(
    {"preg", "plas", "pres", "skin", "test", "mass", "pedi", "age", "class"}
)

# Shared x-axis label for the plots that put the outcome on the x-axis.
_OUTCOME_AXIS_LABEL = "Outcome (0 = No diabetes, 1 = Diabetes)"

# Column name -> display label for the stacked histograms.
_HISTOGRAM_FEATURES = {
    "preg": "Pregnancies",
    "plas": "Plasma glucose concentration",
    "pres": "Diastolic blood pressure (mm Hg)",
    "mass": "Body mass index (BMI)",
    "age": "Age (years)",
}

# Column name -> display label for the boxplots.
_BOXPLOT_FEATURES = {
    "plas": "Plasma glucose concentration",
    "mass": "Body mass index (BMI)",
    "pedi": "Diabetes pedigree function",
}


def _render_figure(fig) -> None:
    """Draw ``fig`` on the Streamlit page, then close it.

    The close runs in ``finally`` so the figure is released even when
    ``st.pyplot`` raises.
    """
    try:
        st.pyplot(fig)
    finally:
        plt.close(fig)


def _widen_sidebar() -> None:
    """Inject CSS that sets the sidebar width and min-width to 600px."""
    st.markdown(
        """
        <style>
            /* Sidebar width */
            [data-testid="stSidebar"] {
                width: 600px;
                min-width: 600px;
            }
        </style>
        """,
        unsafe_allow_html=True
    )


def _show_sidebar_metrics(data: pd.DataFrame) -> None:
    """Compute summary statistics from ``data`` and show them in the sidebar.

    Expects the columns class, age, preg, mass, plas, pres and pedi;
    app() validates them before calling this helper.
    """
    total_obs = len(data)
    # Summing the column counts positives only if class is coded
    # 1 = diabetes, 0 = no diabetes (assumption carried over from the data prep).
    n_diabetes = int(data["class"].sum())
    # Guard the division so a header-only CSV shows 0 instead of raising.
    diabetes_rate = (n_diabetes / total_obs * 100) if total_obs > 0 else 0

    avg_age = round(data["age"].mean(), 1)
    median_age = round(data["age"].median(), 1)

    avg_preg = round(data["preg"].mean(), 1)
    avg_bmi = round(data["mass"].mean(), 1)
    avg_glucose = round(data["plas"].mean(), 1)
    avg_bp = round(data["pres"].mean(), 1)
    avg_pedi = round(data["pedi"].mean(), 3)

    st.sidebar.header("Key Metrics")
    st.sidebar.metric("Total patients", total_obs)
    st.sidebar.metric("Patients with diabetes", n_diabetes)
    st.sidebar.metric("Diabetes prevalence (%)", f"{diabetes_rate:.1f}")

    st.sidebar.markdown("---")
    st.sidebar.metric("Avg age (years)", avg_age)
    st.sidebar.metric("Median age (years)", median_age)

    st.sidebar.markdown("---")
    st.sidebar.metric("Avg pregnancies", avg_preg)
    st.sidebar.metric("Avg BMI", avg_bmi)
    st.sidebar.metric("Avg plasma glucose", avg_glucose)
    st.sidebar.metric("Avg blood pressure (mm Hg)", avg_bp)
    st.sidebar.metric("Avg diabetes pedigree", avg_pedi)


def _plot_outcome_counts(data: pd.DataFrame) -> None:
    """Render a bar chart of the row count for each ``class`` value."""
    st.header("Diabetes outcome distribution")
    fig, ax = plt.subplots()
    outcome_counts = data["class"].value_counts().sort_index()
    sns.barplot(x=outcome_counts.index, y=outcome_counts.values, ax=ax)
    ax.set_xlabel(_OUTCOME_AXIS_LABEL)
    ax.set_ylabel("Count")
    ax.set_title("Diabetes outcome count")
    # Gap between a bar's top and its count label: 1% of the tallest bar.
    # Computed once here instead of on every loop iteration.
    label_gap = outcome_counts.max() * 0.01
    for position, count in enumerate(outcome_counts.values):
        ax.text(position, count + label_gap, str(count), ha="center", va="bottom", fontsize=9)
    _render_figure(fig)


def _plot_feature_histograms(data: pd.DataFrame) -> None:
    """Render one stacked histogram per feature, split by ``class``."""
    st.header("Key feature distributions by outcome")
    for col, label in _HISTOGRAM_FEATURES.items():
        st.subheader(f"{label} by outcome")
        fig, ax = plt.subplots()
        sns.histplot(
            data=data,
            x=col,
            hue="class",
            multiple="stack",
            bins=30,
            ax=ax,
        )
        ax.set_xlabel(label)
        ax.set_title(f"{label} distribution (Diabetes vs No diabetes)")
        _render_figure(fig)


def _plot_feature_boxplots(data: pd.DataFrame) -> None:
    """Render one boxplot per selected feature, grouped by ``class``."""
    st.header("Feature boxplots by outcome")
    for col, label in _BOXPLOT_FEATURES.items():
        st.subheader(f"{label} vs outcome")
        fig, ax = plt.subplots()
        sns.boxplot(
            data=data,
            x="class",
            y=col,
            ax=ax,
        )
        ax.set_xlabel(_OUTCOME_AXIS_LABEL)
        ax.set_ylabel(label)
        ax.set_title(f"{label} vs diabetes outcome")
        _render_figure(fig)


def _plot_correlation_heatmap(data: pd.DataFrame) -> None:
    """Render an annotated correlation heatmap of the numeric columns."""
    st.header("Correlation heatmap (numeric features)")
    numeric_cols = data.select_dtypes(include=["number"]).columns
    if len(numeric_cols) <= 1:
        st.write("Not enough numeric columns to compute correlations.")
        return
    fig, ax = plt.subplots(figsize=(8, 6))
    corr = data[numeric_cols].corr()
    sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
    ax.set_title("Correlation heatmap")
    _render_figure(fig)


def app() -> None:
    """Build the "Diabetes Outcomes Dashboard" Streamlit page.

    Loads the dataset with ``load_data()``, checks that the expected
    columns are present, then renders the sidebar metrics, a data preview,
    the outcome bar chart, per-feature histograms and boxplots, and a
    correlation heatmap.

    If the CSV cannot be read, or expected columns are missing, an error
    is shown on the page with ``st.error`` and nothing else is rendered.
    """
    # Title for the app
    huggingface_page_title = "Diabetes Outcomes Dashboard"
    # Configure the page first, before anything is drawn.
    st.set_page_config(page_title=huggingface_page_title, layout="wide")

    _widen_sidebar()

    # --- Page content ---
    st.title(huggingface_page_title)

    # Report a missing or unreadable CSV on the page instead of a raw traceback.
    try:
        data = load_data()
    except (FileNotFoundError, pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        st.error(f"Could not load the dataset: {exc}")
        return

    # Every helper below indexes these columns directly, so stop here
    # when any of them is absent.
    if not _EXPECTED_COLUMNS.issubset(data.columns):
        st.error(f"Dataset is missing some expected columns. Found: {list(data.columns)}")
        return

    _show_sidebar_metrics(data)

    # --- Data preview ---
    st.markdown("### Data preview")
    st.dataframe(data.head())

    # Styling for seaborn plots
    sns.set_style("whitegrid", {'grid.color': 'lightgrey', 'grid.linestyle': '--'})

    _plot_outcome_counts(data)
    _plot_feature_histograms(data)
    _plot_feature_boxplots(data)
    _plot_correlation_heatmap(data)

# Build the dashboard only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    app()