"""Streamlit dashboard summarising a diabetes outcomes dataset.

The script reads ``./processed_data.csv``, shows headline metrics in the
sidebar and renders outcome, distribution, boxplot and correlation charts.
"""

import streamlit as st
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd

# Columns the dashboard reads; ``app`` refuses to render without all of them.
_EXPECTED_COLUMNS = frozenset(
    {"preg", "plas", "pres", "skin", "test", "mass", "pedi", "age", "class"}
)

# Shared x-axis label for every chart that is split by the outcome column.
_OUTCOME_AXIS_LABEL = "Outcome (0 = No diabetes, 1 = Diabetes)"

# Column -> display label for the stacked histograms (rendered in this order).
_HISTOGRAM_FEATURES = {
    "preg": "Pregnancies",
    "plas": "Plasma glucose concentration",
    "pres": "Diastolic blood pressure (mm Hg)",
    "mass": "Body mass index (BMI)",
    "age": "Age (years)",
}

# Column -> display label for the boxplots (rendered in this order).
_BOXPLOT_FEATURES = {
    "plas": "Plasma glucose concentration",
    "mass": "Body mass index (BMI)",
    "pedi": "Diabetes pedigree function",
}


# Load data
def load_data():
    """Read the dashboard dataset from ``./processed_data.csv``.

    Returns:
        The CSV contents as a pandas DataFrame.
    """
    # Make sure this file has the columns:
    # preg, plas, pres, skin, test, mass, pedi, age, class
    return pd.read_csv("./processed_data.csv")


def _render_figure(fig):
    """Send *fig* to the Streamlit page, then close it to release memory."""
    st.pyplot(fig)
    plt.close(fig)


def _show_sidebar_metrics(data):
    """Compute the headline statistics of *data* and list them in the sidebar."""
    total_obs = len(data)
    # Assuming 1 = diabetes, 0 = no diabetes, the column sum is the case count.
    n_diabetes = int(data["class"].sum())
    # Guard the division so an empty dataset reports 0 instead of raising.
    diabetes_rate = (n_diabetes / total_obs * 100) if total_obs > 0 else 0

    avg_age = round(data["age"].mean(), 1)
    median_age = round(data["age"].median(), 1)
    avg_preg = round(data["preg"].mean(), 1)
    avg_bmi = round(data["mass"].mean(), 1)
    avg_glucose = round(data["plas"].mean(), 1)
    avg_bp = round(data["pres"].mean(), 1)
    avg_pedi = round(data["pedi"].mean(), 3)

    st.sidebar.header("Key Metrics")
    st.sidebar.metric("Total patients", total_obs)
    st.sidebar.metric("Patients with diabetes", n_diabetes)
    st.sidebar.metric("Diabetes prevalence (%)", f"{diabetes_rate:.1f}")
    st.sidebar.markdown("---")
    st.sidebar.metric("Avg age (years)", avg_age)
    st.sidebar.metric("Median age (years)", median_age)
    st.sidebar.markdown("---")
    st.sidebar.metric("Avg pregnancies", avg_preg)
    st.sidebar.metric("Avg BMI", avg_bmi)
    st.sidebar.metric("Avg plasma glucose", avg_glucose)
    st.sidebar.metric("Avg blood pressure (mm Hg)", avg_bp)
    st.sidebar.metric("Avg diabetes pedigree", avg_pedi)


def _plot_outcome_distribution(data):
    """Draw a bar chart of how many rows fall in each ``class`` value."""
    st.header("Diabetes outcome distribution")
    fig, ax = plt.subplots()
    outcome_counts = data["class"].value_counts().sort_index()
    sns.barplot(x=outcome_counts.index, y=outcome_counts.values, ax=ax)
    ax.set_xlabel(_OUTCOME_AXIS_LABEL)
    ax.set_ylabel("Count")
    ax.set_title("Diabetes outcome count")

    # Lift each count label 1% of the tallest bar above its own bar. The
    # offset is loop-invariant, so it is computed once; Series.max() does not
    # raise on an empty series, unlike the builtin max().
    label_offset = outcome_counts.max() * 0.01
    for i, v in enumerate(outcome_counts.values):
        ax.text(
            i,
            v + label_offset,
            str(v),
            ha="center",
            va="bottom",
            fontsize=9,
        )
    _render_figure(fig)


def _plot_feature_histograms(data):
    """Draw one stacked histogram per key feature, coloured by ``class``."""
    st.header("Key feature distributions by outcome")
    for col, label in _HISTOGRAM_FEATURES.items():
        st.subheader(f"{label} by outcome")
        fig, ax = plt.subplots()
        sns.histplot(
            data=data,
            x=col,
            hue="class",
            multiple="stack",
            bins=30,
            ax=ax,
        )
        ax.set_xlabel(label)
        ax.set_title(f"{label} distribution (Diabetes vs No diabetes)")
        _render_figure(fig)


def _plot_feature_boxplots(data):
    """Draw one boxplot per selected feature, grouped by ``class``."""
    st.header("Feature boxplots by outcome")
    for col, label in _BOXPLOT_FEATURES.items():
        st.subheader(f"{label} vs outcome")
        fig, ax = plt.subplots()
        sns.boxplot(data=data, x="class", y=col, ax=ax)
        ax.set_xlabel(_OUTCOME_AXIS_LABEL)
        ax.set_ylabel(label)
        ax.set_title(f"{label} vs diabetes outcome")
        _render_figure(fig)


def _plot_correlation_heatmap(data):
    """Draw an annotated correlation heatmap of the numeric columns."""
    st.header("Correlation heatmap (numeric features)")
    numeric_cols = data.select_dtypes(include=["number"]).columns
    # A correlation matrix needs at least two numeric columns to be useful.
    if len(numeric_cols) <= 1:
        st.write("Not enough numeric columns to compute correlations.")
        return

    fig, ax = plt.subplots(figsize=(8, 6))
    corr = data[numeric_cols].corr()
    sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
    ax.set_title("Correlation heatmap")
    _render_figure(fig)


# Create Streamlit app
def app():
    """Render the whole dashboard page.

    Configures the page, loads the dataset and stops with an on-page error if
    any expected column is missing. Otherwise it shows the sidebar metrics, a
    data preview and the chart sections, in that order.
    """
    # Title for the app
    huggingface_page_title = "Diabetes Outcomes Dashboard"
    st.set_page_config(page_title=huggingface_page_title, layout="wide")

    # --- Custom CSS to adjust sidebar width ---
    # NOTE(review): the CSS payload is blank in this copy of the file, so this
    # call currently injects nothing; restore the intended rules if the
    # sidebar width customisation is still wanted.
    st.markdown(
        """ """,
        unsafe_allow_html=True,
    )

    # --- Page content ---
    st.title(huggingface_page_title)

    data = load_data()

    # Every section below indexes these columns directly, so bail out early
    # with a readable message instead of failing later with a KeyError.
    if not _EXPECTED_COLUMNS.issubset(set(data.columns)):
        st.error(
            "Dataset is missing some expected columns. "
            f"Found: {list(data.columns)}"
        )
        return

    # --- Key Metrics from the data ---
    _show_sidebar_metrics(data)

    # --- Data preview ---
    st.markdown("### Data preview")
    st.dataframe(data.head())

    # Styling for seaborn plots
    sns.set_style(
        "whitegrid",
        {"grid.color": "lightgrey", "grid.linestyle": "--"},
    )

    # The column validation above guarantees every column the plot helpers
    # use, so they need no per-plot presence checks.
    _plot_outcome_distribution(data)
    _plot_feature_histograms(data)
    _plot_feature_boxplots(data)
    _plot_correlation_heatmap(data)


if __name__ == "__main__":
    app()