occupation-data / src /streamlit_app.py
iurbinah's picture
Update src/streamlit_app.py
c77093a verified
"""
Interactive CPS Visualiser (patched)
===================================
๐Ÿš€ Upload this to a Hugging Face *Streamlit* Space (or run `streamlit run app.py`)
to explore the wide CPS file produced by your pipeline.
Patch โ€“ Juneย 2025
-----------------
* Added robust guards so *every* pivot table can be displayed and charted without
Seriesโ€‘styler crashes or Altair keyโ€‘errors.
* New helpers: `make_pivot_safe`, `show_table`, `robust_chart`.
* Visualization now politely refuses >2โ€‘D pivots instead of crashing.
* **Compatibility Fix**: Changed type hints to support Python 3.9 and older.
"""
import os
from pathlib import Path
from typing import Union
import numpy as np
import altair as alt
import pandas as pd
import streamlit as st
# โ”€โ”€ put Streamlit in a writable place โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# These environment variables are set to ensure Streamlit, which may not have
# write permissions in certain cloud environments (like Hugging Face Spaces),
# can store its necessary files in a temporary, writable directory.
os.environ["STREAMLIT_ROOT"] = "/tmp/.streamlit"
os.environ["XDG_CONFIG_HOME"] = "/tmp"
os.environ["XDG_CACHE_HOME"] = "/tmp"
os.environ["STREAMLIT_BROWSER_GATHERUSAGESTATS"] = "false"
# ----------------------------------------------------------------------------
# --- Configuration ----------------------------------------------------------
# Set the default path to the data file.
DATA_PATH = "results/cps_state_occ_nat_emp_wide_full.csv"
# A mapping from FIPS state codes to full state names for prettier display.
FIPS_MAP = {
"01": "Alabama", "02": "Alaska", "04": "Arizona", "05": "Arkansas",
"06": "California", "08": "Colorado", "09": "Connecticut", "10": "Delaware",
"11": "District of Columbia", "12": "Florida", "13": "Georgia",
"15": "Hawaii", "16": "Idaho", "17": "Illinois", "18": "Indiana",
"19": "Iowa", "20": "Kansas", "21": "Kentucky", "22": "Louisiana",
"23": "Maine", "24": "Maryland", "25": "Massachusetts", "26": "Michigan",
"27": "Minnesota", "28": "Mississippi", "29": "Missouri", "30": "Montana",
"31": "Nebraska", "32": "Nevada", "33": "New Hampshire", "34": "New Jersey",
"35": "New Mexico", "36": "New York", "37": "North Carolina",
"38": "North Dakota", "39": "Ohio", "40": "Oklahoma", "41": "Oregon",
"42": "Pennsylvania", "44": "Rhode Island", "45": "South Carolina",
"46": "South Dakota", "47": "Tennessee", "48": "Texas", "49": "Utah",
"50": "Vermont", "51": "Virginia", "53": "Washington", "54": "West Virginia",
"55": "Wisconsin", "56": "Wyoming",
}
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ‚ NEW HELPERS โ‚โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ#
def make_pivot_safe(pivot: Union[pd.DataFrame, pd.Series], metric: str) -> pd.DataFrame:
"""Return a clean *DataFrame* ready for styling & charting."""
# Series โ†’ DataFrame so we always have .style in old pandas
if isinstance(pivot, pd.Series):
pivot = pivot.to_frame(name=metric)
# Ensure axis names are strings (Altair dislikes *None*)
if pivot.index.name is None:
pivot.index.name = "row"
if pivot.columns.name is None:
pivot.columns.name = "col"
# Replace NaN or inf that break colourโ€‘scales
pivot = pivot.replace([pd.NA, np.inf, -np.inf], 0)
return pivot
def show_table(pivot: pd.DataFrame, metric: str) -> None:
"""Display DataFrame safely and offer CSV download."""
fmt = "{:,.2f}" if "%" in metric or "Rate" in metric else "{:,.0f}"
st.dataframe(pivot.style.format(fmt), use_container_width=True)
st.download_button(
"๐Ÿ“ฅ Download as CSV", pivot.to_csv().encode(), "cps_pivot.csv", "text/csv"
)
def robust_chart(pivot: pd.DataFrame, metric: str) -> None:
"""Simple bar for 1โ€‘D, heatโ€‘map for 2โ€‘D, infoโ€‘box for anything else."""
if pivot.empty:
st.warning("Cannot generate a chart from empty data.")
return
# 1โ€‘D (one valueโ€‘column)
if pivot.shape[1] == 1:
df = (
pivot.reset_index()
.rename(columns={pivot.columns[0]: "value"})
)
category = df.columns[0]
bar = (
alt.Chart(df, title=f"{metric} by {category.title()}")
.mark_bar()
.encode(
x=f"{category}:O",
y="value:Q",
tooltip=[category, alt.Tooltip("value:Q", format=".2f")],
)
)
st.altair_chart(bar, use_container_width=True)
return
# 2โ€‘D plain โ†’ heatโ€‘map
if not isinstance(pivot.index, pd.MultiIndex) and not isinstance(
pivot.columns, pd.MultiIndex
):
tidy = (
pivot.stack()
.reset_index()
.rename(
columns={
pivot.columns.name: "x",
pivot.index.name: "y",
0: "value",
}
)
)
heat = (
alt.Chart(tidy, title=f"{metric} by {tidy['y'].name} and {tidy['x'].name}")
.mark_rect()
.encode(
x="x:O",
y="y:O",
color=alt.Color("value:Q", title=metric, scale=alt.Scale(scheme="viridis")),
tooltip=["y", "x", alt.Tooltip("value:Q", format=".2f")],
)
)
st.altair_chart(heat, use_container_width=True)
return
st.info(
"โ„น๏ธ The current pivot has more than two dimensions. Simplify your\n'Group By' choices to see a chart."
)
# โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ#
@st.cache_data(show_spinner="Loading and preparing CPS dataโ€ฆ")
def load_and_prepare_data(path: str = DATA_PATH) -> pd.DataFrame:
"""Load CSV, prettify, and compute derived metrics."""
try:
df = pd.read_csv(path)
df["occ2018"] = df["occ2018"].astype(str)
# Prettify state names using FIPS code mapping
df["state"] = (
df["state_fips"].astype(str).str.zfill(2).map(FIPS_MAP).fillna("Other/Unknown")
)
# Create readable labels for the nativity flag
df["nativity"] = df["nativity_flag"].map({0: "Native-born", 1: "Foreign-born"})
total_labor_force = df["EMPLOYED"] + df["UNEMPLOYED"]
df["Unemployment Rate"] = (
df["UNEMPLOYED"] / total_labor_force
).fillna(0) * 100
return df
except FileNotFoundError:
st.error(f"Data file not found at: {path}. Please ensure the file exists.")
return pd.DataFrame()
def render_sidebar(df: pd.DataFrame):
"""Render sidebar filters and return selections."""
st.sidebar.header("๐Ÿ“Š Data Filters")
# Geographic
with st.sidebar.expander("๐Ÿ“ Geographic Filter", expanded=True):
states = sorted(df["state"].unique())
if st.checkbox("Select all states", value=True, key="state_all"):
sel_states = st.multiselect("State(s)", states, default=states)
else:
sel_states = st.multiselect("State(s)", states, default=[states[0]] if states else [])
# Occupation
with st.sidebar.expander("๐Ÿง‘โ€๐Ÿ’ผ Occupation Filter", expanded=True):
occ_search = st.text_input("Search 4-digit SOC code", "")
occ_codes = sorted(df["occ2018"].unique())
occ_filtered = [c for c in occ_codes if occ_search.lower() in c.lower()]
if not occ_filtered:
st.warning("No occupation codes match your search.")
sel_occs = []
elif st.checkbox("Select all filtered occupations", value=True, key="occ_all"):
sel_occs = st.multiselect(
f"Occupation code(s) ({len(occ_filtered)} found)", occ_filtered, default=occ_filtered
)
else:
sel_occs = st.multiselect(
f"Occupation code(s) ({len(occ_filtered)} found)", occ_filtered, default=[]
)
# Nativity
with st.sidebar.expander("๐ŸŒ Nativity Filter", expanded=True):
nativity_options = list(df["nativity"].unique())
if st.checkbox("Select all nativities", value=True, key="nat_all"):
sel_nat = st.multiselect("Nativity", nativity_options, default=nativity_options)
else:
sel_nat = st.multiselect("Nativity", nativity_options, default=[])
return sel_states, sel_occs, sel_nat
def render_pivot_explorer(dff: pd.DataFrame):
"""Main panel for the pivotโ€‘table explorer mode."""
st.markdown("### โš™๏ธ Configure Your Pivot View")
all_dims = ["state", "occ2018", "nativity"]
c1, c2, c3 = st.columns([1.2, 1, 1])
with c1:
metric = st.radio(
"Select Metric",
[
"Unemployment Rate",
"Foreign-Born Labor Force %",
"EMPLOYED",
"UNEMPLOYED",
],
index=0,
key="metric_select",
)
with c2:
group_rows = st.multiselect("Group Rows By", all_dims, default=["state"], key="group_rows")
available_cols = [d for d in all_dims if d not in group_rows]
with c3:
group_cols = st.multiselect("Group Columns By", available_cols, default=[], key="group_cols")
# Build pivot table
try:
grouping_cols = [col for col in (group_rows + group_cols) if col]
if not grouping_cols:
st.info("Please select at least one 'Group By' option to build the table.")
return
if metric == "Unemployment Rate":
agg = (
dff.groupby(grouping_cols)
.agg({"EMPLOYED": "sum", "UNEMPLOYED": "sum"})
.reset_index()
)
total = agg["EMPLOYED"] + agg["UNEMPLOYED"]
agg[metric] = (agg["UNEMPLOYED"] / total).fillna(0) * 100
pivot = agg.pivot_table(index=group_rows, columns=group_cols, values=metric, fill_value=0)
elif metric == "Foreign-Born Labor Force %":
temp = dff.copy()
temp["total_labor_force"] = temp["EMPLOYED"] + temp["UNEMPLOYED"]
temp["foreign_labor_force"] = temp["total_labor_force"].where(
temp["nativity"] == "Foreign-born", 0
)
agg = (
temp.groupby(grouping_cols)
.agg({"total_labor_force": "sum", "foreign_labor_force": "sum"})
.reset_index()
)
agg[metric] = (
agg["foreign_labor_force"] / agg["total_labor_force"]
).fillna(0) * 100
pivot = agg.pivot_table(index=group_rows, columns=group_cols, values=metric, fill_value=0)
else: # EMPLOYED or UNEMPLOYED totals
pivot = dff.pivot_table(
index=group_rows, columns=group_cols, values=metric, aggfunc="sum", fill_value=0
)
except Exception as e:
st.error(f"Could not create pivot table. Check your selections. Error: {e}")
return
# Universal cleanup so downstream code never crashes
pivot = make_pivot_safe(pivot, metric)
tab1, tab2, tab3 = st.tabs(["๐Ÿ“Š Pivot Table", "๐Ÿ“ˆ Visualisation", "โ„น๏ธ Data Summary"])
with tab1:
st.subheader("Aggregated Data")
show_table(pivot, metric)
with tab2:
st.subheader(f"Visualisation of {metric}")
robust_chart(pivot, metric)
with tab3:
st.subheader("Summary of Filtered Data")
st.write(f"Displaying summary for **{len(dff):,}** raw data entries.")
st.dataframe(dff.describe(include="all").T)
def render_comparative_analysis(dff: pd.DataFrame):
"""Nativityโ€‘split bar charts for a single state."""
st.markdown("### โš–๏ธ Comparative Analysis: Employment by Nativity")
available_states = sorted(dff["state"].unique())
if not available_states:
st.warning("No states available in the filtered data to compare.")
return
selected_state = st.selectbox("Select a State to Analyze", available_states)
if selected_state:
sdf = dff[dff["state"] == selected_state]
long = pd.melt(
sdf,
id_vars=["nativity"],
value_vars=["EMPLOYED", "UNEMPLOYED"],
var_name="Employment Status",
value_name="Count",
)
chart = (
alt.Chart(long)
.mark_bar()
.encode(
x=alt.X("Employment Status:N", axis=alt.Axis(labels=False, ticks=False)),
y=alt.Y("sum(Count):Q", title="Total Count"),
color=alt.Color(
"Employment Status:N",
scale=alt.Scale(domain=["EMPLOYED", "UNEMPLOYED"], range=["#1f77b4", "#d62728"]),
legend=alt.Legend(title="Status"),
),
tooltip=["nativity", "Employment Status", alt.Tooltip("sum(Count):Q", format=",.0f")],
)
.properties(title=f"Employment Status in {selected_state}")
.facet(column=alt.Column("nativity:N", title="Nativity Group"))
)
st.altair_chart(chart, use_container_width=True)
def main():
st.set_page_config(
page_title="CPS Explorer", layout="wide", initial_sidebar_state="expanded"
)
st.title("๐Ÿ“Š Interactive CPS Data Explorer")
st.markdown(
"Use the filters and controls to slice, pivot, and visualize Current Population Survey (CPS) employment data."
)
df = load_and_prepare_data(DATA_PATH)
if df.empty:
st.stop()
# Sidebar controls
st.sidebar.header("๐Ÿ•น๏ธ Controls")
analysis_mode = st.sidebar.radio(
"Analysis Mode",
["Pivot Table Explorer", "Comparative Analysis"],
)
sel_states, sel_occs, sel_nat = render_sidebar(df)
query_parts = []
if sel_states:
query_parts.append("state in @sel_states")
if sel_occs:
query_parts.append("occ2018 in @sel_occs")
if sel_nat:
query_parts.append("nativity in @sel_nat")
if not query_parts:
st.warning("Please make a selection in the sidebar to see the data.")
st.stop()
dff = df.query(" & ".join(query_parts), engine="python").copy()
if dff.empty:
st.warning("No data matches your selection. Try widening the filters.")
st.stop()
if analysis_mode == "Pivot Table Explorer":
render_pivot_explorer(dff)
else:
render_comparative_analysis(dff)
st.markdown("---")
st.caption(
"Source: Basic Monthly Current Population Survey (CPS). Weight: PWCMPWT. May 2025 Report."
)
if __name__ == "__main__":
main()