File size: 6,242 Bytes
e2ae8b1
b7a0a90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e2ae8b1
b7a0a90
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import streamlit as st
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd

# Load data
def load_data():
    """Read the processed diabetes dataset from disk.

    The CSV is looked up relative to the current working directory. The
    file is expected to provide the columns preg, plas, pres, skin, test,
    mass, pedi, age and class.

    Returns:
        The parsed contents of the CSV as a pandas DataFrame.
    """
    csv_path = "./processed_data.csv"
    return pd.read_csv(csv_path)

# Create Streamlit app
# Columns every dashboard section relies on; validated once, right after loading.
_EXPECTED_COLUMNS = {"preg", "plas", "pres", "skin", "test", "mass", "pedi", "age", "class"}

# Shared x-axis label for the plots that put the outcome on the x axis.
_OUTCOME_AXIS_LABEL = "Outcome (0 = No diabetes, 1 = Diabetes)"

# Column name -> display label for the stacked histograms.
_HISTOGRAM_FEATURES = {
    "preg": "Pregnancies",
    "plas": "Plasma glucose concentration",
    "pres": "Diastolic blood pressure (mm Hg)",
    "mass": "Body mass index (BMI)",
    "age": "Age (years)",
}

# Column name -> display label for the boxplots.
_BOXPLOT_FEATURES = {
    "plas": "Plasma glucose concentration",
    "mass": "Body mass index (BMI)",
    "pedi": "Diabetes pedigree function",
}


def _render_sidebar_metrics(data):
    """Compute summary statistics from *data* and show them in the sidebar."""
    total_obs = len(data)
    n_diabetes = int(data["class"].sum())  # assuming 1 = diabetes, 0 = no diabetes
    # Guard against an empty dataset so the rate never divides by zero.
    diabetes_rate = (n_diabetes / total_obs * 100) if total_obs > 0 else 0

    avg_age = round(data["age"].mean(), 1)
    median_age = round(data["age"].median(), 1)

    avg_preg = round(data["preg"].mean(), 1)
    avg_bmi = round(data["mass"].mean(), 1)
    avg_glucose = round(data["plas"].mean(), 1)
    avg_bp = round(data["pres"].mean(), 1)
    avg_pedi = round(data["pedi"].mean(), 3)

    st.sidebar.header("Key Metrics")
    st.sidebar.metric("Total patients", total_obs)
    st.sidebar.metric("Patients with diabetes", n_diabetes)
    st.sidebar.metric("Diabetes prevalence (%)", f"{diabetes_rate:.1f}")

    st.sidebar.markdown("---")
    st.sidebar.metric("Avg age (years)", avg_age)
    st.sidebar.metric("Median age (years)", median_age)

    st.sidebar.markdown("---")
    st.sidebar.metric("Avg pregnancies", avg_preg)
    st.sidebar.metric("Avg BMI", avg_bmi)
    st.sidebar.metric("Avg plasma glucose", avg_glucose)
    st.sidebar.metric("Avg blood pressure (mm Hg)", avg_bp)
    st.sidebar.metric("Avg diabetes pedigree", avg_pedi)


def _render_outcome_distribution(data):
    """Draw a bar chart of how many rows fall into each outcome class."""
    st.header("Diabetes outcome distribution")
    outcome_counts = data["class"].value_counts().sort_index()
    fig, ax = plt.subplots()
    try:
        sns.barplot(x=outcome_counts.index, y=outcome_counts.values, ax=ax)
        ax.set_xlabel(_OUTCOME_AXIS_LABEL)
        ax.set_ylabel("Count")
        ax.set_title("Diabetes outcome count")
        # The label offset depends only on the tallest bar, so compute it once
        # rather than once per bar.
        label_offset = outcome_counts.max() * 0.01
        for position, count in enumerate(outcome_counts.values):
            ax.text(
                position,
                count + label_offset,
                str(count),
                ha="center",
                va="bottom",
                fontsize=9,
            )
        st.pyplot(fig)
    finally:
        # Close the figure even when plotting or rendering fails, so it does
        # not stay open in the server process.
        plt.close(fig)


def _render_feature_histograms(data):
    """Draw one stacked histogram per key feature, split by outcome class."""
    st.header("Key feature distributions by outcome")
    for col, label in _HISTOGRAM_FEATURES.items():
        st.subheader(f"{label} by outcome")
        fig, ax = plt.subplots()
        try:
            sns.histplot(
                data=data,
                x=col,
                hue="class",
                multiple="stack",
                bins=30,
                ax=ax,
            )
            ax.set_xlabel(label)
            ax.set_title(f"{label} distribution (Diabetes vs No diabetes)")
            st.pyplot(fig)
        finally:
            plt.close(fig)


def _render_feature_boxplots(data):
    """Draw one boxplot per selected feature, grouped by outcome class."""
    st.header("Feature boxplots by outcome")
    for col, label in _BOXPLOT_FEATURES.items():
        st.subheader(f"{label} vs outcome")
        fig, ax = plt.subplots()
        try:
            sns.boxplot(data=data, x="class", y=col, ax=ax)
            ax.set_xlabel(_OUTCOME_AXIS_LABEL)
            ax.set_ylabel(label)
            ax.set_title(f"{label} vs diabetes outcome")
            st.pyplot(fig)
        finally:
            plt.close(fig)


def _render_correlation_heatmap(data):
    """Draw an annotated correlation heatmap over the numeric columns."""
    st.header("Correlation heatmap (numeric features)")
    numeric_cols = data.select_dtypes(include=["number"]).columns
    if len(numeric_cols) <= 1:
        st.write("Not enough numeric columns to compute correlations.")
        return

    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        corr = data[numeric_cols].corr()
        sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
        ax.set_title("Correlation heatmap")
        st.pyplot(fig)
    finally:
        plt.close(fig)


def app():
    """Build and render the Diabetes Outcomes Dashboard.

    Configures the page, loads the dataset through ``load_data()``, checks
    that the expected columns are present, and then renders, in order: the
    sidebar metrics, a data preview, the outcome bar chart, per-feature
    histograms, per-feature boxplots and a correlation heatmap.

    If the dataset cannot be read or lacks expected columns, an error is
    shown on the page and the function returns without rendering any charts.
    """
    # Title for the app
    huggingface_page_title = "Diabetes Outcomes Dashboard"
    # Page config is issued before any other Streamlit call in this function.
    st.set_page_config(page_title=huggingface_page_title, layout="wide")

    # --- Custom CSS to adjust sidebar width ---
    st.markdown(
        """
        <style>
            /* Sidebar width */
            [data-testid="stSidebar"] {
                width: 600px;
                min-width: 600px;
            }
        </style>
        """,
        unsafe_allow_html=True
    )

    # --- Page content ---
    st.title(huggingface_page_title)

    # Load data; report an unreadable, empty or malformed file on the page
    # instead of letting the exception escape as a raw traceback.
    try:
        data = load_data()
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        st.error(f"Could not load the dataset: {exc}")
        return

    # Validate the schema once; every section below relies on these columns,
    # so no per-section column checks are needed afterwards.
    if not _EXPECTED_COLUMNS.issubset(set(data.columns)):
        st.error(f"Dataset is missing some expected columns. Found: {list(data.columns)}")
        return

    # --- Key metrics from the data ---
    _render_sidebar_metrics(data)

    # --- Data preview ---
    st.markdown("### Data preview")
    st.dataframe(data.head())

    # Styling for seaborn plots
    sns.set_style("whitegrid", {'grid.color': 'lightgrey', 'grid.linestyle': '--'})

    _render_outcome_distribution(data)
    _render_feature_histograms(data)
    _render_feature_boxplots(data)
    _render_correlation_heatmap(data)

# Render the dashboard only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    app()