Spaces:
Paused
Paused
| """ | |
| Interactive CPS Visualiser (patched) | |
| =================================== | |
| ๐ Upload this to a Hugging Face *Streamlit* Space (or run `streamlit run app.py`) | |
| to explore the wide CPS file produced by your pipeline. | |
| Patch โ Juneย 2025 | |
| ----------------- | |
| * Added robust guards so *every* pivot table can be displayed and charted without | |
| Seriesโstyler crashes or Altair keyโerrors. | |
| * New helpers: `make_pivot_safe`, `show_table`, `robust_chart`. | |
| * Visualization now politely refuses >2โD pivots instead of crashing. | |
| * **Compatibility Fix**: Changed type hints to support Python 3.9 and older. | |
| """ | |
| import os | |
| from pathlib import Path | |
| from typing import Union | |
| import numpy as np | |
| import altair as alt | |
| import pandas as pd | |
| import streamlit as st | |
| # โโ put Streamlit in a writable place โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
| # These environment variables are set to ensure Streamlit, which may not have | |
| # write permissions in certain cloud environments (like Hugging Face Spaces), | |
| # can store its necessary files in a temporary, writable directory. | |
| os.environ["STREAMLIT_ROOT"] = "/tmp/.streamlit" | |
| os.environ["XDG_CONFIG_HOME"] = "/tmp" | |
| os.environ["XDG_CACHE_HOME"] = "/tmp" | |
| os.environ["STREAMLIT_BROWSER_GATHERUSAGESTATS"] = "false" | |
| # ---------------------------------------------------------------------------- | |
| # --- Configuration ---------------------------------------------------------- | |
| # Set the default path to the data file. | |
| DATA_PATH = "results/cps_state_occ_nat_emp_wide_full.csv" | |
| # A mapping from FIPS state codes to full state names for prettier display. | |
| FIPS_MAP = { | |
| "01": "Alabama", "02": "Alaska", "04": "Arizona", "05": "Arkansas", | |
| "06": "California", "08": "Colorado", "09": "Connecticut", "10": "Delaware", | |
| "11": "District of Columbia", "12": "Florida", "13": "Georgia", | |
| "15": "Hawaii", "16": "Idaho", "17": "Illinois", "18": "Indiana", | |
| "19": "Iowa", "20": "Kansas", "21": "Kentucky", "22": "Louisiana", | |
| "23": "Maine", "24": "Maryland", "25": "Massachusetts", "26": "Michigan", | |
| "27": "Minnesota", "28": "Mississippi", "29": "Missouri", "30": "Montana", | |
| "31": "Nebraska", "32": "Nevada", "33": "New Hampshire", "34": "New Jersey", | |
| "35": "New Mexico", "36": "New York", "37": "North Carolina", | |
| "38": "North Dakota", "39": "Ohio", "40": "Oklahoma", "41": "Oregon", | |
| "42": "Pennsylvania", "44": "Rhode Island", "45": "South Carolina", | |
| "46": "South Dakota", "47": "Tennessee", "48": "Texas", "49": "Utah", | |
| "50": "Vermont", "51": "Virginia", "53": "Washington", "54": "West Virginia", | |
| "55": "Wisconsin", "56": "Wyoming", | |
| } | |
| # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ | |
| # โญโโโโโโโโโโโโโโโโโโโโโโโโโโโโ NEW HELPERS โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฎ# | |
| def make_pivot_safe(pivot: Union[pd.DataFrame, pd.Series], metric: str) -> pd.DataFrame: | |
| """Return a clean *DataFrame* ready for styling & charting.""" | |
| # Series โ DataFrame so we always have .style in old pandas | |
| if isinstance(pivot, pd.Series): | |
| pivot = pivot.to_frame(name=metric) | |
| # Ensure axis names are strings (Altair dislikes *None*) | |
| if pivot.index.name is None: | |
| pivot.index.name = "row" | |
| if pivot.columns.name is None: | |
| pivot.columns.name = "col" | |
| # Replace NaN or inf that break colourโscales | |
| pivot = pivot.replace([pd.NA, np.inf, -np.inf], 0) | |
| return pivot | |
| def show_table(pivot: pd.DataFrame, metric: str) -> None: | |
| """Display DataFrame safely and offer CSV download.""" | |
| fmt = "{:,.2f}" if "%" in metric or "Rate" in metric else "{:,.0f}" | |
| st.dataframe(pivot.style.format(fmt), use_container_width=True) | |
| st.download_button( | |
| "๐ฅ Download as CSV", pivot.to_csv().encode(), "cps_pivot.csv", "text/csv" | |
| ) | |
| def robust_chart(pivot: pd.DataFrame, metric: str) -> None: | |
| """Simple bar for 1โD, heatโmap for 2โD, infoโbox for anything else.""" | |
| if pivot.empty: | |
| st.warning("Cannot generate a chart from empty data.") | |
| return | |
| # 1โD (one valueโcolumn) | |
| if pivot.shape[1] == 1: | |
| df = ( | |
| pivot.reset_index() | |
| .rename(columns={pivot.columns[0]: "value"}) | |
| ) | |
| category = df.columns[0] | |
| bar = ( | |
| alt.Chart(df, title=f"{metric} by {category.title()}") | |
| .mark_bar() | |
| .encode( | |
| x=f"{category}:O", | |
| y="value:Q", | |
| tooltip=[category, alt.Tooltip("value:Q", format=".2f")], | |
| ) | |
| ) | |
| st.altair_chart(bar, use_container_width=True) | |
| return | |
| # 2โD plain โ heatโmap | |
| if not isinstance(pivot.index, pd.MultiIndex) and not isinstance( | |
| pivot.columns, pd.MultiIndex | |
| ): | |
| tidy = ( | |
| pivot.stack() | |
| .reset_index() | |
| .rename( | |
| columns={ | |
| pivot.columns.name: "x", | |
| pivot.index.name: "y", | |
| 0: "value", | |
| } | |
| ) | |
| ) | |
| heat = ( | |
| alt.Chart(tidy, title=f"{metric} by {tidy['y'].name} and {tidy['x'].name}") | |
| .mark_rect() | |
| .encode( | |
| x="x:O", | |
| y="y:O", | |
| color=alt.Color("value:Q", title=metric, scale=alt.Scale(scheme="viridis")), | |
| tooltip=["y", "x", alt.Tooltip("value:Q", format=".2f")], | |
| ) | |
| ) | |
| st.altair_chart(heat, use_container_width=True) | |
| return | |
| st.info( | |
| "โน๏ธ The current pivot has more than two dimensions. Simplify your\n'Group By' choices to see a chart." | |
| ) | |
| # โฐโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโฏ# | |
| def load_and_prepare_data(path: str = DATA_PATH) -> pd.DataFrame: | |
| """Load CSV, prettify, and compute derived metrics.""" | |
| try: | |
| df = pd.read_csv(path) | |
| df["occ2018"] = df["occ2018"].astype(str) | |
| # Prettify state names using FIPS code mapping | |
| df["state"] = ( | |
| df["state_fips"].astype(str).str.zfill(2).map(FIPS_MAP).fillna("Other/Unknown") | |
| ) | |
| # Create readable labels for the nativity flag | |
| df["nativity"] = df["nativity_flag"].map({0: "Native-born", 1: "Foreign-born"}) | |
| total_labor_force = df["EMPLOYED"] + df["UNEMPLOYED"] | |
| df["Unemployment Rate"] = ( | |
| df["UNEMPLOYED"] / total_labor_force | |
| ).fillna(0) * 100 | |
| return df | |
| except FileNotFoundError: | |
| st.error(f"Data file not found at: {path}. Please ensure the file exists.") | |
| return pd.DataFrame() | |
| def render_sidebar(df: pd.DataFrame): | |
| """Render sidebar filters and return selections.""" | |
| st.sidebar.header("๐ Data Filters") | |
| # Geographic | |
| with st.sidebar.expander("๐ Geographic Filter", expanded=True): | |
| states = sorted(df["state"].unique()) | |
| if st.checkbox("Select all states", value=True, key="state_all"): | |
| sel_states = st.multiselect("State(s)", states, default=states) | |
| else: | |
| sel_states = st.multiselect("State(s)", states, default=[states[0]] if states else []) | |
| # Occupation | |
| with st.sidebar.expander("๐งโ๐ผ Occupation Filter", expanded=True): | |
| occ_search = st.text_input("Search 4-digit SOC code", "") | |
| occ_codes = sorted(df["occ2018"].unique()) | |
| occ_filtered = [c for c in occ_codes if occ_search.lower() in c.lower()] | |
| if not occ_filtered: | |
| st.warning("No occupation codes match your search.") | |
| sel_occs = [] | |
| elif st.checkbox("Select all filtered occupations", value=True, key="occ_all"): | |
| sel_occs = st.multiselect( | |
| f"Occupation code(s) ({len(occ_filtered)} found)", occ_filtered, default=occ_filtered | |
| ) | |
| else: | |
| sel_occs = st.multiselect( | |
| f"Occupation code(s) ({len(occ_filtered)} found)", occ_filtered, default=[] | |
| ) | |
| # Nativity | |
| with st.sidebar.expander("๐ Nativity Filter", expanded=True): | |
| nativity_options = list(df["nativity"].unique()) | |
| if st.checkbox("Select all nativities", value=True, key="nat_all"): | |
| sel_nat = st.multiselect("Nativity", nativity_options, default=nativity_options) | |
| else: | |
| sel_nat = st.multiselect("Nativity", nativity_options, default=[]) | |
| return sel_states, sel_occs, sel_nat | |
| def render_pivot_explorer(dff: pd.DataFrame): | |
| """Main panel for the pivotโtable explorer mode.""" | |
| st.markdown("### โ๏ธ Configure Your Pivot View") | |
| all_dims = ["state", "occ2018", "nativity"] | |
| c1, c2, c3 = st.columns([1.2, 1, 1]) | |
| with c1: | |
| metric = st.radio( | |
| "Select Metric", | |
| [ | |
| "Unemployment Rate", | |
| "Foreign-Born Labor Force %", | |
| "EMPLOYED", | |
| "UNEMPLOYED", | |
| ], | |
| index=0, | |
| key="metric_select", | |
| ) | |
| with c2: | |
| group_rows = st.multiselect("Group Rows By", all_dims, default=["state"], key="group_rows") | |
| available_cols = [d for d in all_dims if d not in group_rows] | |
| with c3: | |
| group_cols = st.multiselect("Group Columns By", available_cols, default=[], key="group_cols") | |
| # Build pivot table | |
| try: | |
| grouping_cols = [col for col in (group_rows + group_cols) if col] | |
| if not grouping_cols: | |
| st.info("Please select at least one 'Group By' option to build the table.") | |
| return | |
| if metric == "Unemployment Rate": | |
| agg = ( | |
| dff.groupby(grouping_cols) | |
| .agg({"EMPLOYED": "sum", "UNEMPLOYED": "sum"}) | |
| .reset_index() | |
| ) | |
| total = agg["EMPLOYED"] + agg["UNEMPLOYED"] | |
| agg[metric] = (agg["UNEMPLOYED"] / total).fillna(0) * 100 | |
| pivot = agg.pivot_table(index=group_rows, columns=group_cols, values=metric, fill_value=0) | |
| elif metric == "Foreign-Born Labor Force %": | |
| temp = dff.copy() | |
| temp["total_labor_force"] = temp["EMPLOYED"] + temp["UNEMPLOYED"] | |
| temp["foreign_labor_force"] = temp["total_labor_force"].where( | |
| temp["nativity"] == "Foreign-born", 0 | |
| ) | |
| agg = ( | |
| temp.groupby(grouping_cols) | |
| .agg({"total_labor_force": "sum", "foreign_labor_force": "sum"}) | |
| .reset_index() | |
| ) | |
| agg[metric] = ( | |
| agg["foreign_labor_force"] / agg["total_labor_force"] | |
| ).fillna(0) * 100 | |
| pivot = agg.pivot_table(index=group_rows, columns=group_cols, values=metric, fill_value=0) | |
| else: # EMPLOYED or UNEMPLOYED totals | |
| pivot = dff.pivot_table( | |
| index=group_rows, columns=group_cols, values=metric, aggfunc="sum", fill_value=0 | |
| ) | |
| except Exception as e: | |
| st.error(f"Could not create pivot table. Check your selections. Error: {e}") | |
| return | |
| # Universal cleanup so downstream code never crashes | |
| pivot = make_pivot_safe(pivot, metric) | |
| tab1, tab2, tab3 = st.tabs(["๐ Pivot Table", "๐ Visualisation", "โน๏ธ Data Summary"]) | |
| with tab1: | |
| st.subheader("Aggregated Data") | |
| show_table(pivot, metric) | |
| with tab2: | |
| st.subheader(f"Visualisation of {metric}") | |
| robust_chart(pivot, metric) | |
| with tab3: | |
| st.subheader("Summary of Filtered Data") | |
| st.write(f"Displaying summary for **{len(dff):,}** raw data entries.") | |
| st.dataframe(dff.describe(include="all").T) | |
| def render_comparative_analysis(dff: pd.DataFrame): | |
| """Nativityโsplit bar charts for a single state.""" | |
| st.markdown("### โ๏ธ Comparative Analysis: Employment by Nativity") | |
| available_states = sorted(dff["state"].unique()) | |
| if not available_states: | |
| st.warning("No states available in the filtered data to compare.") | |
| return | |
| selected_state = st.selectbox("Select a State to Analyze", available_states) | |
| if selected_state: | |
| sdf = dff[dff["state"] == selected_state] | |
| long = pd.melt( | |
| sdf, | |
| id_vars=["nativity"], | |
| value_vars=["EMPLOYED", "UNEMPLOYED"], | |
| var_name="Employment Status", | |
| value_name="Count", | |
| ) | |
| chart = ( | |
| alt.Chart(long) | |
| .mark_bar() | |
| .encode( | |
| x=alt.X("Employment Status:N", axis=alt.Axis(labels=False, ticks=False)), | |
| y=alt.Y("sum(Count):Q", title="Total Count"), | |
| color=alt.Color( | |
| "Employment Status:N", | |
| scale=alt.Scale(domain=["EMPLOYED", "UNEMPLOYED"], range=["#1f77b4", "#d62728"]), | |
| legend=alt.Legend(title="Status"), | |
| ), | |
| tooltip=["nativity", "Employment Status", alt.Tooltip("sum(Count):Q", format=",.0f")], | |
| ) | |
| .properties(title=f"Employment Status in {selected_state}") | |
| .facet(column=alt.Column("nativity:N", title="Nativity Group")) | |
| ) | |
| st.altair_chart(chart, use_container_width=True) | |
| def main(): | |
| st.set_page_config( | |
| page_title="CPS Explorer", layout="wide", initial_sidebar_state="expanded" | |
| ) | |
| st.title("๐ Interactive CPS Data Explorer") | |
| st.markdown( | |
| "Use the filters and controls to slice, pivot, and visualize Current Population Survey (CPS) employment data." | |
| ) | |
| df = load_and_prepare_data(DATA_PATH) | |
| if df.empty: | |
| st.stop() | |
| # Sidebar controls | |
| st.sidebar.header("๐น๏ธ Controls") | |
| analysis_mode = st.sidebar.radio( | |
| "Analysis Mode", | |
| ["Pivot Table Explorer", "Comparative Analysis"], | |
| ) | |
| sel_states, sel_occs, sel_nat = render_sidebar(df) | |
| query_parts = [] | |
| if sel_states: | |
| query_parts.append("state in @sel_states") | |
| if sel_occs: | |
| query_parts.append("occ2018 in @sel_occs") | |
| if sel_nat: | |
| query_parts.append("nativity in @sel_nat") | |
| if not query_parts: | |
| st.warning("Please make a selection in the sidebar to see the data.") | |
| st.stop() | |
| dff = df.query(" & ".join(query_parts), engine="python").copy() | |
| if dff.empty: | |
| st.warning("No data matches your selection. Try widening the filters.") | |
| st.stop() | |
| if analysis_mode == "Pivot Table Explorer": | |
| render_pivot_explorer(dff) | |
| else: | |
| render_comparative_analysis(dff) | |
| st.markdown("---") | |
| st.caption( | |
| "Source: Basic Monthly Current Population Survey (CPS). Weight: PWCMPWT. May 2025 Report." | |
| ) | |
| if __name__ == "__main__": | |
| main() | |