File size: 15,143 Bytes
d07a426
7ca9258
 
d07a426
ad178b5
d07a426
 
7ca9258
 
 
 
 
 
39dfffb
d07a426
 
883dedf
7ca9258
39dfffb
7ca9258
 
 
 
 
 
883dedf
ad178b5
 
 
 
 
 
 
883dedf
 
ad178b5
 
4b0bdb0
ad178b5
 
d07a426
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f8a915d
7ca9258
 
39dfffb
7ca9258
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ad178b5
7ca9258
ad178b5
 
7ca9258
 
ad178b5
 
 
 
 
 
f8a915d
ad178b5
7ca9258
 
 
ad178b5
 
 
7ca9258
 
d07a426
ad178b5
7ca9258
ad178b5
7ca9258
ad178b5
 
7ca9258
ad178b5
 
bc4e2eb
7ca9258
ad178b5
 
8124cf8
f964e21
ad178b5
 
 
7ca9258
ad178b5
7ca9258
ad178b5
 
 
7ca9258
ad178b5
7ca9258
ad178b5
 
7ca9258
ad178b5
 
 
 
d07a426
7ca9258
bc4e2eb
7ca9258
bc4e2eb
f964e21
bc4e2eb
0e6cc95
bc4e2eb
 
 
7ca9258
 
 
 
 
 
bc4e2eb
7ca9258
ad178b5
bc4e2eb
7ca9258
f964e21
bc4e2eb
7ca9258
d07a426
7ca9258
ad178b5
bc4e2eb
0e6cc95
7ca9258
 
0e6cc95
ad178b5
7ca9258
 
 
 
 
 
 
 
bc4e2eb
7ca9258
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ad178b5
 
 
d07a426
7ca9258
 
 
ad178b5
7ca9258
ad178b5
 
7ca9258
 
ad178b5
 
7ca9258
 
ad178b5
 
 
7ca9258
 
d07a426
bc4e2eb
7ca9258
bc4e2eb
7ca9258
bc4e2eb
 
 
 
 
 
7ca9258
 
 
 
 
 
 
bc4e2eb
7ca9258
 
 
 
 
 
 
 
 
 
 
 
d07a426
7ca9258
 
 
 
f964e21
ad178b5
 
7ca9258
 
 
ad178b5
7ca9258
 
 
ad178b5
 
 
bc4e2eb
ad178b5
7ca9258
bc4e2eb
 
 
 
ad178b5
bc4e2eb
7ca9258
ad178b5
7ca9258
 
 
 
 
 
 
ad178b5
 
 
 
7ca9258
ad178b5
 
 
 
bc4e2eb
 
7ca9258
bc4e2eb
7ca9258
ad178b5
7ca9258
c77093a
7ca9258
 
f8a915d
d07a426
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
"""
Interactive CPS Visualiser (patched)
===================================

๐Ÿš€ Upload this to a Hugging Face *Streamlit* Space (or run `streamlit run app.py`)
to explore the wide CPS file produced by your pipeline.

Patch โ€“ Juneย 2025
-----------------
* Added robust guards so *every* pivot table can be displayed and charted without
  Seriesโ€‘styler crashes or Altair keyโ€‘errors.
* New helpers: `make_pivot_safe`, `show_table`, `robust_chart`.
* Visualization now politely refuses >2โ€‘D pivots instead of crashing.
* **Compatibility Fix**: Changed type hints to support Python 3.9 and older.
"""

import os
from pathlib import Path
from typing import Union

import numpy as np
import altair as alt
import pandas as pd
import streamlit as st

# โ”€โ”€ put Streamlit in a writable place โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# These environment variables are set to ensure Streamlit, which may not have
# write permissions in certain cloud environments (like Hugging Face Spaces),
# can store its necessary files in a temporary, writable directory.
os.environ["STREAMLIT_ROOT"] = "/tmp/.streamlit"
os.environ["XDG_CONFIG_HOME"] = "/tmp"
os.environ["XDG_CACHE_HOME"] = "/tmp"
os.environ["STREAMLIT_BROWSER_GATHERUSAGESTATS"] = "false"
# ----------------------------------------------------------------------------

# --- Configuration ----------------------------------------------------------
# Set the default path to the data file.
DATA_PATH = "results/cps_state_occ_nat_emp_wide_full.csv"

# A mapping from FIPS state codes to full state names for prettier display.
FIPS_MAP = {
    "01": "Alabama", "02": "Alaska", "04": "Arizona", "05": "Arkansas",
    "06": "California", "08": "Colorado", "09": "Connecticut", "10": "Delaware",
    "11": "District of Columbia", "12": "Florida", "13": "Georgia",
    "15": "Hawaii", "16": "Idaho", "17": "Illinois", "18": "Indiana",
    "19": "Iowa", "20": "Kansas", "21": "Kentucky", "22": "Louisiana",
    "23": "Maine", "24": "Maryland", "25": "Massachusetts", "26": "Michigan",
    "27": "Minnesota", "28": "Mississippi", "29": "Missouri", "30": "Montana",
    "31": "Nebraska", "32": "Nevada", "33": "New Hampshire", "34": "New Jersey",
    "35": "New Mexico", "36": "New York", "37": "North Carolina",
    "38": "North Dakota", "39": "Ohio", "40": "Oklahoma", "41": "Oregon",
    "42": "Pennsylvania", "44": "Rhode Island", "45": "South Carolina",
    "46": "South Dakota", "47": "Tennessee", "48": "Texas", "49": "Utah",
    "50": "Vermont", "51": "Virginia", "53": "Washington", "54": "West Virginia",
    "55": "Wisconsin", "56": "Wyoming",
}
# โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€

# โ•ญโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ‚  NEW HELPERS  โ‚โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฎ#

def make_pivot_safe(pivot: Union[pd.DataFrame, pd.Series], metric: str) -> pd.DataFrame:
    """Return a clean *DataFrame* ready for styling & charting."""
    # Series โ†’ DataFrame so we always have .style in old pandas
    if isinstance(pivot, pd.Series):
        pivot = pivot.to_frame(name=metric)

    # Ensure axis names are strings (Altair dislikes *None*)
    if pivot.index.name is None:
        pivot.index.name = "row"
    if pivot.columns.name is None:
        pivot.columns.name = "col"

    # Replace NaN or inf that break colourโ€‘scales
    pivot = pivot.replace([pd.NA, np.inf, -np.inf], 0)
    return pivot


def show_table(pivot: pd.DataFrame, metric: str) -> None:
    """Display DataFrame safely and offer CSV download."""
    fmt = "{:,.2f}" if "%" in metric or "Rate" in metric else "{:,.0f}"
    st.dataframe(pivot.style.format(fmt), use_container_width=True)
    st.download_button(
        "๐Ÿ“ฅ Download as CSV", pivot.to_csv().encode(), "cps_pivot.csv", "text/csv"
    )


def robust_chart(pivot: pd.DataFrame, metric: str) -> None:
    """Simple bar for 1โ€‘D, heatโ€‘map for 2โ€‘D, infoโ€‘box for anything else."""
    if pivot.empty:
        st.warning("Cannot generate a chart from empty data.")
        return

    # 1โ€‘D (one valueโ€‘column)
    if pivot.shape[1] == 1:
        df = (
            pivot.reset_index()
            .rename(columns={pivot.columns[0]: "value"})
        )
        category = df.columns[0]
        bar = (
            alt.Chart(df, title=f"{metric} by {category.title()}")
            .mark_bar()
            .encode(
                x=f"{category}:O",
                y="value:Q",
                tooltip=[category, alt.Tooltip("value:Q", format=".2f")],
            )
        )
        st.altair_chart(bar, use_container_width=True)
        return

    # 2โ€‘D plain โ†’ heatโ€‘map
    if not isinstance(pivot.index, pd.MultiIndex) and not isinstance(
        pivot.columns, pd.MultiIndex
    ):
        tidy = (
            pivot.stack()
            .reset_index()
            .rename(
                columns={
                    pivot.columns.name: "x",
                    pivot.index.name: "y",
                    0: "value",
                }
            )
        )
        heat = (
            alt.Chart(tidy, title=f"{metric} by {tidy['y'].name} and {tidy['x'].name}")
            .mark_rect()
            .encode(
                x="x:O",
                y="y:O",
                color=alt.Color("value:Q", title=metric, scale=alt.Scale(scheme="viridis")),
                tooltip=["y", "x", alt.Tooltip("value:Q", format=".2f")],
            )
        )
        st.altair_chart(heat, use_container_width=True)
        return

    st.info(
        "โ„น๏ธ The current pivot has more than two dimensions. Simplify your\n'Group By' choices to see a chart."
    )

# โ•ฐโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ•ฏ#

@st.cache_data(show_spinner="Loading and preparing CPS dataโ€ฆ")
def load_and_prepare_data(path: str = DATA_PATH) -> pd.DataFrame:
    """Load CSV, prettify, and compute derived metrics."""
    try:
        df = pd.read_csv(path)
        df["occ2018"] = df["occ2018"].astype(str)

        # Prettify state names using FIPS code mapping
        df["state"] = (
            df["state_fips"].astype(str).str.zfill(2).map(FIPS_MAP).fillna("Other/Unknown")
        )
        # Create readable labels for the nativity flag
        df["nativity"] = df["nativity_flag"].map({0: "Native-born", 1: "Foreign-born"})

        total_labor_force = df["EMPLOYED"] + df["UNEMPLOYED"]
        df["Unemployment Rate"] = (
            df["UNEMPLOYED"] / total_labor_force
        ).fillna(0) * 100
        return df
    except FileNotFoundError:
        st.error(f"Data file not found at: {path}. Please ensure the file exists.")
        return pd.DataFrame()


def render_sidebar(df: pd.DataFrame):
    """Render sidebar filters and return selections."""
    st.sidebar.header("๐Ÿ“Š Data Filters")
    # Geographic
    with st.sidebar.expander("๐Ÿ“ Geographic Filter", expanded=True):
        states = sorted(df["state"].unique())
        if st.checkbox("Select all states", value=True, key="state_all"):
            sel_states = st.multiselect("State(s)", states, default=states)
        else:
            sel_states = st.multiselect("State(s)", states, default=[states[0]] if states else [])
    # Occupation
    with st.sidebar.expander("๐Ÿง‘โ€๐Ÿ’ผ Occupation Filter", expanded=True):
        occ_search = st.text_input("Search 4-digit SOC code", "")
        occ_codes = sorted(df["occ2018"].unique())
        occ_filtered = [c for c in occ_codes if occ_search.lower() in c.lower()]
        if not occ_filtered:
            st.warning("No occupation codes match your search.")
            sel_occs = []
        elif st.checkbox("Select all filtered occupations", value=True, key="occ_all"):
            sel_occs = st.multiselect(
                f"Occupation code(s) ({len(occ_filtered)} found)", occ_filtered, default=occ_filtered
            )
        else:
            sel_occs = st.multiselect(
                f"Occupation code(s) ({len(occ_filtered)} found)", occ_filtered, default=[]
            )
    # Nativity
    with st.sidebar.expander("๐ŸŒ Nativity Filter", expanded=True):
        nativity_options = list(df["nativity"].unique())
        if st.checkbox("Select all nativities", value=True, key="nat_all"):
            sel_nat = st.multiselect("Nativity", nativity_options, default=nativity_options)
        else:
            sel_nat = st.multiselect("Nativity", nativity_options, default=[])
    return sel_states, sel_occs, sel_nat


def render_pivot_explorer(dff: pd.DataFrame):
    """Main panel for the pivotโ€‘table explorer mode."""
    st.markdown("### โš™๏ธ Configure Your Pivot View")
    all_dims = ["state", "occ2018", "nativity"]

    c1, c2, c3 = st.columns([1.2, 1, 1])
    with c1:
        metric = st.radio(
            "Select Metric",
            [
                "Unemployment Rate",
                "Foreign-Born Labor Force %",
                "EMPLOYED",
                "UNEMPLOYED",
            ],
            index=0,
            key="metric_select",
        )
    with c2:
        group_rows = st.multiselect("Group Rows By", all_dims, default=["state"], key="group_rows")
    available_cols = [d for d in all_dims if d not in group_rows]
    with c3:
        group_cols = st.multiselect("Group Columns By", available_cols, default=[], key="group_cols")

    # Build pivot table
    try:
        grouping_cols = [col for col in (group_rows + group_cols) if col]
        if not grouping_cols:
            st.info("Please select at least one 'Group By' option to build the table.")
            return

        if metric == "Unemployment Rate":
            agg = (
                dff.groupby(grouping_cols)
                .agg({"EMPLOYED": "sum", "UNEMPLOYED": "sum"})
                .reset_index()
            )
            total = agg["EMPLOYED"] + agg["UNEMPLOYED"]
            agg[metric] = (agg["UNEMPLOYED"] / total).fillna(0) * 100
            pivot = agg.pivot_table(index=group_rows, columns=group_cols, values=metric, fill_value=0)
        elif metric == "Foreign-Born Labor Force %":
            temp = dff.copy()
            temp["total_labor_force"] = temp["EMPLOYED"] + temp["UNEMPLOYED"]
            temp["foreign_labor_force"] = temp["total_labor_force"].where(
                temp["nativity"] == "Foreign-born", 0
            )
            agg = (
                temp.groupby(grouping_cols)
                .agg({"total_labor_force": "sum", "foreign_labor_force": "sum"})
                .reset_index()
            )
            agg[metric] = (
                agg["foreign_labor_force"] / agg["total_labor_force"]
            ).fillna(0) * 100
            pivot = agg.pivot_table(index=group_rows, columns=group_cols, values=metric, fill_value=0)
        else:  # EMPLOYED or UNEMPLOYED totals
            pivot = dff.pivot_table(
                index=group_rows, columns=group_cols, values=metric, aggfunc="sum", fill_value=0
            )

    except Exception as e:
        st.error(f"Could not create pivot table. Check your selections. Error: {e}")
        return

    # Universal cleanup so downstream code never crashes
    pivot = make_pivot_safe(pivot, metric)

    tab1, tab2, tab3 = st.tabs(["๐Ÿ“Š Pivot Table", "๐Ÿ“ˆ Visualisation", "โ„น๏ธ Data Summary"])

    with tab1:
        st.subheader("Aggregated Data")
        show_table(pivot, metric)

    with tab2:
        st.subheader(f"Visualisation of {metric}")
        robust_chart(pivot, metric)

    with tab3:
        st.subheader("Summary of Filtered Data")
        st.write(f"Displaying summary for **{len(dff):,}** raw data entries.")
        st.dataframe(dff.describe(include="all").T)


def render_comparative_analysis(dff: pd.DataFrame):
    """Nativityโ€‘split bar charts for a single state."""
    st.markdown("### โš–๏ธ Comparative Analysis: Employment by Nativity")
    available_states = sorted(dff["state"].unique())
    if not available_states:
        st.warning("No states available in the filtered data to compare.")
        return

    selected_state = st.selectbox("Select a State to Analyze", available_states)
    if selected_state:
        sdf = dff[dff["state"] == selected_state]
        long = pd.melt(
            sdf,
            id_vars=["nativity"],
            value_vars=["EMPLOYED", "UNEMPLOYED"],
            var_name="Employment Status",
            value_name="Count",
        )
        chart = (
            alt.Chart(long)
            .mark_bar()
            .encode(
                x=alt.X("Employment Status:N", axis=alt.Axis(labels=False, ticks=False)),
                y=alt.Y("sum(Count):Q", title="Total Count"),
                color=alt.Color(
                    "Employment Status:N",
                    scale=alt.Scale(domain=["EMPLOYED", "UNEMPLOYED"], range=["#1f77b4", "#d62728"]),
                    legend=alt.Legend(title="Status"),
                ),
                tooltip=["nativity", "Employment Status", alt.Tooltip("sum(Count):Q", format=",.0f")],
            )
            .properties(title=f"Employment Status in {selected_state}")
            .facet(column=alt.Column("nativity:N", title="Nativity Group"))
        )
        st.altair_chart(chart, use_container_width=True)


def main():
    st.set_page_config(
        page_title="CPS Explorer", layout="wide", initial_sidebar_state="expanded"
    )
    st.title("๐Ÿ“Š Interactive CPS Data Explorer")
    st.markdown(
        "Use the filters and controls to slice, pivot, and visualize Current Population Survey (CPS) employment data."
    )

    df = load_and_prepare_data(DATA_PATH)
    if df.empty:
        st.stop()

    # Sidebar controls
    st.sidebar.header("๐Ÿ•น๏ธ Controls")
    analysis_mode = st.sidebar.radio(
        "Analysis Mode",
        ["Pivot Table Explorer", "Comparative Analysis"],
    )
    sel_states, sel_occs, sel_nat = render_sidebar(df)

    query_parts = []
    if sel_states:
        query_parts.append("state in @sel_states")
    if sel_occs:
        query_parts.append("occ2018 in @sel_occs")
    if sel_nat:
        query_parts.append("nativity in @sel_nat")

    if not query_parts:
        st.warning("Please make a selection in the sidebar to see the data.")
        st.stop()

    dff = df.query(" & ".join(query_parts), engine="python").copy()
    if dff.empty:
        st.warning("No data matches your selection. Try widening the filters.")
        st.stop()

    if analysis_mode == "Pivot Table Explorer":
        render_pivot_explorer(dff)
    else:
        render_comparative_analysis(dff)

    st.markdown("---")
    st.caption(
        "Source: Basic Monthly Current Population Survey (CPS). Weight: PWCMPWT. May 2025 Report."
    )


if __name__ == "__main__":
    main()