# Pima_reporting/src/streamlit_app.py
# Uploaded by AbdramaneB: "Upload folder using huggingface_hub" (commit b7a0a90, verified)
import streamlit as st
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd
# Load data
def load_data(path="./processed_data.csv") -> pd.DataFrame:
    """Read the processed diabetes dataset from a CSV file.

    Args:
        path: Location of the CSV file. Defaults to ``./processed_data.csv``,
            which is resolved against the current working directory. The
            dashboard expects the columns preg, plas, pres, skin, test,
            mass, pedi, age and class.

    Returns:
        A DataFrame with the contents of the file.

    Raises:
        FileNotFoundError: If ``path`` does not point to an existing file
            (raised by ``pandas.read_csv``).
    """
    return pd.read_csv(path)
# Create Streamlit app
def app():
    """Render the diabetes outcomes dashboard.

    Configures the page, loads the dataset through ``load_data``, validates
    its columns, shows summary metrics in the sidebar and then draws the
    outcome bar chart, per-feature histograms, boxplots and a correlation
    heatmap in the main area.

    Returns:
        None. Rendering stops early, after an error or warning message, when
        required columns are missing or the dataset has no rows.
    """
    huggingface_page_title = "Diabetes Outcomes Dashboard"
    # Kept as the very first Streamlit call: set_page_config is documented
    # as needing to run before other Streamlit commands.
    st.set_page_config(page_title=huggingface_page_title, layout="wide")

    # --- Custom CSS to adjust sidebar width ---
    st.markdown(
        """
        <style>
            /* Sidebar width */
            [data-testid="stSidebar"] {
                width: 600px;
                min-width: 600px;
            }
        </style>
        """,
        unsafe_allow_html=True
    )

    # --- Page content ---
    st.title(huggingface_page_title)
    data = load_data()

    # Every helper below indexes these columns directly, so validate once
    # here instead of re-checking before each chart.
    expected_cols = {"preg", "plas", "pres", "skin", "test", "mass", "pedi", "age", "class"}
    if not expected_cols.issubset(set(data.columns)):
        st.error(f"Dataset is missing some expected columns. Found: {list(data.columns)}")
        return

    # With zero rows there is nothing meaningful to summarise or plot.
    if data.empty:
        st.warning("The dataset contains no rows; nothing to display.")
        return

    _render_sidebar_metrics(data)

    # --- Data preview ---
    st.markdown("### Data preview")
    st.dataframe(data.head())

    # Styling shared by all seaborn plots
    sns.set_style("whitegrid", {'grid.color': 'lightgrey', 'grid.linestyle': '--'})

    _plot_outcome_counts(data)
    _plot_feature_histograms(data)
    _plot_feature_boxplots(data)
    _plot_correlation_heatmap(data)


def _show_figure(fig):
    """Send *fig* to the Streamlit page, then close it so it is not kept open."""
    st.pyplot(fig)
    plt.close(fig)


def _render_sidebar_metrics(data):
    """Compute the headline statistics of *data* and show them in the sidebar."""
    total_obs = len(data)
    n_diabetes = int(data["class"].sum())  # assuming 1 = diabetes, 0 = no diabetes
    diabetes_rate = (n_diabetes / total_obs * 100) if total_obs > 0 else 0

    avg_age = round(data["age"].mean(), 1)
    median_age = round(data["age"].median(), 1)
    avg_preg = round(data["preg"].mean(), 1)
    avg_bmi = round(data["mass"].mean(), 1)
    avg_glucose = round(data["plas"].mean(), 1)
    avg_bp = round(data["pres"].mean(), 1)
    avg_pedi = round(data["pedi"].mean(), 3)

    sidebar = st.sidebar
    sidebar.header("Key Metrics")
    sidebar.metric("Total patients", total_obs)
    sidebar.metric("Patients with diabetes", n_diabetes)
    sidebar.metric("Diabetes prevalence (%)", f"{diabetes_rate:.1f}")
    sidebar.markdown("---")
    sidebar.metric("Avg age (years)", avg_age)
    sidebar.metric("Median age (years)", median_age)
    sidebar.markdown("---")
    sidebar.metric("Avg pregnancies", avg_preg)
    sidebar.metric("Avg BMI", avg_bmi)
    sidebar.metric("Avg plasma glucose", avg_glucose)
    sidebar.metric("Avg blood pressure (mm Hg)", avg_bp)
    sidebar.metric("Avg diabetes pedigree", avg_pedi)


def _plot_outcome_counts(data):
    """Draw a bar chart of how many rows fall into each ``class`` value."""
    st.header("Diabetes outcome distribution")
    fig, ax = plt.subplots()
    outcome_counts = data["class"].value_counts().sort_index()
    sns.barplot(x=outcome_counts.index, y=outcome_counts.values, ax=ax)
    ax.set_xlabel("Outcome (0 = No diabetes, 1 = Diabetes)")
    ax.set_ylabel("Count")
    ax.set_title("Diabetes outcome count")

    # Lift each count label 1% of the tallest bar above its own bar. The
    # offset is loop-invariant; default=0 keeps an empty series from raising.
    label_offset = max(outcome_counts.values, default=0) * 0.01
    for i, v in enumerate(outcome_counts.values):
        ax.text(i, v + label_offset, str(v), ha="center", va="bottom", fontsize=9)
    _show_figure(fig)


def _plot_feature_histograms(data):
    """Draw one stacked histogram per key feature, coloured by ``class``."""
    st.header("Key feature distributions by outcome")
    feature_display_names = {
        "preg": "Pregnancies",
        "plas": "Plasma glucose concentration",
        "pres": "Diastolic blood pressure (mm Hg)",
        "mass": "Body mass index (BMI)",
        "age": "Age (years)",
    }
    for col, label in feature_display_names.items():
        st.subheader(f"{label} by outcome")
        fig, ax = plt.subplots()
        sns.histplot(
            data=data,
            x=col,
            hue="class",
            multiple="stack",
            bins=30,
            ax=ax,
        )
        ax.set_xlabel(label)
        ax.set_title(f"{label} distribution (Diabetes vs No diabetes)")
        _show_figure(fig)


def _plot_feature_boxplots(data):
    """Draw one boxplot per selected feature, split by ``class``."""
    st.header("Feature boxplots by outcome")
    box_features = {
        "plas": "Plasma glucose concentration",
        "mass": "Body mass index (BMI)",
        "pedi": "Diabetes pedigree function",
    }
    for col, label in box_features.items():
        st.subheader(f"{label} vs outcome")
        fig, ax = plt.subplots()
        sns.boxplot(
            data=data,
            x="class",
            y=col,
            ax=ax,
        )
        ax.set_xlabel("Outcome (0 = No diabetes, 1 = Diabetes)")
        ax.set_ylabel(label)
        ax.set_title(f"{label} vs diabetes outcome")
        _show_figure(fig)


def _plot_correlation_heatmap(data):
    """Draw an annotated correlation heatmap over the numeric columns of *data*."""
    st.header("Correlation heatmap (numeric features)")
    numeric_cols = data.select_dtypes(include=["number"]).columns
    # A correlation matrix needs at least two numeric columns to be useful.
    if len(numeric_cols) <= 1:
        st.write("Not enough numeric columns to compute correlations.")
        return

    fig, ax = plt.subplots(figsize=(8, 6))
    corr = data[numeric_cols].corr()
    sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
    ax.set_title("Correlation heatmap")
    _show_figure(fig)
# Build the dashboard only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    app()