Files changed (1) hide show
  1. app4.py +396 -396
app4.py CHANGED
@@ -1,396 +1,396 @@
1
- # streamlit_app.py
2
- import streamlit as st
3
- import pandas as pd
4
- import numpy as np
5
- import joblib
6
- from typing import Tuple, Dict, Any, List
7
-
8
- st.set_page_config(page_title="USA Salary — Synthetic/Hybrid Prediction", layout="wide")
9
-
10
- # --- Top nav to teammates' apps ---
11
- c1, c2, c3 = st.columns(3)
12
- with c1:
13
- st.link_button("Hamna", "https://example.com/hamna") # TODO: replace with real link
14
- with c2:
15
- st.link_button("Mahesh", "https://example.com/mahesh") # TODO: replace with real link
16
- with c3:
17
- st.link_button("Tian", "https://example.com/tian") # TODO: replace with real link
18
-
19
- st.title("USA Salary — Predict with Hybrid/Synthetic Inputs")
20
-
21
- # =================== Load assets ===================
22
- @st.cache_resource(show_spinner=False)
23
- def _load_assets():
24
- pipe = joblib.load("final_xgbr_usa_model.pkl")
25
- usa_2024 = pd.read_csv("usa_salary_data.csv")
26
- usa_2025 = pd.read_csv("2025_survey.csv")
27
- return pipe, usa_2024, usa_2025
28
-
29
- pipe, usa_data, usa_25 = _load_assets()
30
- LABEL = "CompTotal"
31
-
32
- # =================== RNG utilities ===================
33
- def _ensure_rng():
34
- if "rng_seedseq" not in st.session_state:
35
- st.session_state.rng_seedseq = np.random.SeedSequence()
36
- child = st.session_state.rng_seedseq.spawn(1)[0]
37
- return np.random.default_rng(child)
38
-
39
- def new_rng():
40
- st.session_state.rng_seedseq = st.session_state.rng_seedseq.spawn(1)[0]
41
- return np.random.default_rng(st.session_state.rng_seedseq)
42
-
43
- # =================== Precompute dropdown choices (ONCE) ===================
44
- @st.cache_resource(show_spinner=False)
45
- def _precompute_choices(usa_2024: pd.DataFrame, usa_2025: pd.DataFrame, label: str) -> Dict[str, List]:
46
- """
47
- Build a dict of column -> list of allowed choices.
48
- - Categorical: top-k frequent from union(2024, 2025)
49
- - Numeric: percentile grid + most frequent rounded values
50
- """
51
- CHOICES: Dict[str, List] = {}
52
- both = pd.concat([usa_2024, usa_2025.reindex(columns=usa_2024.columns, fill_value=np.nan)], axis=0, ignore_index=True)
53
- for col in usa_2024.columns:
54
- if col == label: # never expose label to edit
55
- continue
56
-
57
- s = both[col]
58
- if pd.api.types.is_numeric_dtype(usa_2024[col]):
59
- s_num = pd.to_numeric(s, errors="coerce").dropna()
60
- if len(s_num) == 0:
61
- CHOICES[col] = [0]
62
- continue
63
-
64
- # Percentile grid (5th..95th) + median
65
- q_list = [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95]
66
- qs = np.percentile(s_num, q_list).tolist()
67
-
68
- # Common rounded values (to nearest 1000) by frequency
69
- rounded = (np.round(s_num / 1000) * 1000).astype(int)
70
- top_round = rounded.value_counts().head(10).index.astype(int).tolist()
71
-
72
- # Merge, deduplicate, sort, and keep as Python numbers (not numpy types)
73
- merged = sorted(set(int(v) for v in qs + top_round))
74
- CHOICES[col] = merged[:50] # cap length
75
- else:
76
- s_cat = s.astype(str).replace({"nan": None})
77
- s_cat = s_cat.dropna()
78
- if len(s_cat) == 0:
79
- CHOICES[col] = [""]
80
- continue
81
- topk = s_cat.value_counts().head(40).index.tolist()
82
- CHOICES[col] = topk
83
- return CHOICES
84
-
85
- CHOICES_DICT = _precompute_choices(usa_data, usa_25, LABEL)
86
-
87
- # =================== Core sampling funcs (your logic) ===================
88
- def _sample_from_2024(colname: str, df_2024: pd.DataFrame, rng: np.random.Generator):
89
- series = df_2024[colname].dropna()
90
- if series.empty:
91
- if pd.api.types.is_numeric_dtype(df_2024[colname]):
92
- return 0
93
- return ""
94
- return series.sample(1, random_state=rng.integers(0, 10_000)).iloc[0]
95
-
96
- def build_synthetic_row_with_trace(
97
- usa_25: pd.DataFrame,
98
- usa_2024: pd.DataFrame,
99
- label: str = "CompTotal",
100
- rng: np.random.Generator | None = None
101
- ) -> Tuple[pd.DataFrame, float | None, Dict[str, str], Dict[str, Any]]:
102
- if rng is None:
103
- rng = _ensure_rng()
104
-
105
- expected_features = [c for c in usa_2024.columns if c != label]
106
- row25 = usa_25.sample(1, random_state=rng.integers(0, 10_000)).iloc[0]
107
-
108
- synthetic = {}
109
- source_info: Dict[str, str] = {}
110
-
111
- for col in expected_features:
112
- use_25_val = False
113
- val = None
114
-
115
- if col in row25.index:
116
- val = row25[col]
117
- if pd.api.types.is_numeric_dtype(usa_2024[col]):
118
- val = pd.to_numeric(val, errors="coerce")
119
- if not pd.isna(val):
120
- use_25_val = True
121
-
122
- if not use_25_val:
123
- val = _sample_from_2024(col, usa_2024, rng)
124
- source_info[col] = "2024"
125
- else:
126
- source_info[col] = "2025"
127
-
128
- if pd.api.types.is_numeric_dtype(usa_2024[col]):
129
- val = pd.to_numeric(val, errors="coerce")
130
- if pd.isna(val):
131
- val = _sample_from_2024(col, usa_2024, rng)
132
- source_info[col] = "2024"
133
- else:
134
- if pd.isna(val):
135
- val = _sample_from_2024(col, usa_2024, rng)
136
- source_info[col] = "2024"
137
- if not isinstance(val, str):
138
- val = str(val)
139
-
140
- synthetic[col] = val
141
-
142
- X_one = pd.DataFrame([synthetic], columns=expected_features)
143
-
144
- y_true = None
145
- if label in row25.index:
146
- y_true = pd.to_numeric(row25[label], errors="coerce")
147
- if pd.isna(y_true):
148
- y_true = None
149
-
150
- used_from_2025 = sum(v == "2025" for v in source_info.values())
151
- used_from_2024 = sum(v == "2024" for v in source_info.values())
152
- total = len(source_info)
153
- report = dict(
154
- total_features=total,
155
- n_2025=used_from_2025,
156
- n_2024=used_from_2024,
157
- pct_2025=0.0 if total == 0 else used_from_2025 / total * 100,
158
- pct_2024=0.0 if total == 0 else used_from_2024 / total * 100,
159
- filled_cols=[c for c, s in source_info.items() if s == "2024"]
160
- )
161
- return X_one, y_true, source_info, report
162
-
163
- def random_sample_row_2024(
164
- df_2024: pd.DataFrame,
165
- label: str,
166
- rng: np.random.Generator
167
- ) -> pd.DataFrame:
168
- expected_features = [c for c in df_2024.columns if c != label]
169
- sampled = {}
170
- for col in expected_features:
171
- sampled[col] = _sample_from_2024(col, df_2024, rng)
172
- return pd.DataFrame([sampled], columns=expected_features)
173
-
174
- # =================== Session state ===================
175
- if "history" not in st.session_state:
176
- st.session_state.history = [] # list of dicts: pred, truth, abs_err, pct_err
177
-
178
- if "prepared" not in st.session_state:
179
- st.session_state.prepared = None # dict: X, y_true, info, report, mode
180
-
181
- # =================== Sidebar ===================
182
- st.sidebar.header("Controls")
183
- mode = st.sidebar.radio(
184
- "Choose input mode:",
185
- ["Hybrid (Random 2025 + fill from 2024)", "Pure 2024 synthetic"],
186
- index=0
187
- )
188
-
189
- if st.sidebar.button("Reload new random data"):
190
- rng = new_rng()
191
- if mode.startswith("Hybrid"):
192
- X_one, y_true, src_info, rep = build_synthetic_row_with_trace(usa_25, usa_data, label=LABEL, rng=rng)
193
- st.session_state.prepared = dict(X=X_one, y_true=y_true, info=src_info, report=rep, mode="hybrid")
194
- else:
195
- X_2024 = random_sample_row_2024(usa_data, LABEL, rng)
196
- st.session_state.prepared = dict(X=X_2024, y_true=None, info={}, report={}, mode="pure2024")
197
- st.toast("New random input prepared.", icon="✅")
198
-
199
- # =================== Explanation ===================
200
- with st.expander("What is happening here? "):
201
- st.markdown(
202
- """
203
- **Goal:** Create one model-ready row and predict **CompTotal(Annual Income)**.
204
- **dataset** trained on: USA data from 2024 stackoverflow survey.
205
- predict on: synthetic/hybrid data mixing 2025 & 2024.
206
- **Modes:**
207
- 1) **Hybrid** — Pick a random 2025 respondent; for any missing required feature, fill using a value sampled from the 2024 distribution for that column.
208
- 2) **Pure 2024** — Build an entirely synthetic row, sampling every feature from the 2024 distribution.
209
-
210
- **Editing:**
211
- Below, you can adjust the *current* row via dropdowns. All dropdown choices were **precomputed once at startup** from the union of 2024 & 2025 data (to keep the app fast).
212
- - **Categorical**: most frequent categories (+ always includes the current value).
213
- - **Numeric**: percentile grid & common rounded values (+ current value).
214
-
215
- Click **Submit & Predict** to see predicted vs. true (if available), absolute and percentage errors, and running averages.
216
- """
217
- )
218
-
219
- st.markdown("---")
220
-
221
- # =================== Prepare first candidate if needed ===================
222
- def _prepare_candidate_if_needed():
223
- if st.session_state.prepared is None:
224
- rng = _ensure_rng()
225
- if mode.startswith("Hybrid"):
226
- X_one, y_true, src_info, rep = build_synthetic_row_with_trace(usa_25, usa_data, label=LABEL, rng=rng)
227
- st.session_state.prepared = dict(X=X_one, y_true=y_true, info=src_info, report=rep, mode="hybrid")
228
- else:
229
- X_2024 = random_sample_row_2024(usa_data, LABEL, rng)
230
- st.session_state.prepared = dict(X=X_2024, y_true=None, info={}, report={}, mode="pure2024")
231
-
232
- _prepare_candidate_if_needed()
233
-
234
- # =================== Helper: coerce edited values to correct dtype ===================
235
- def _coerce_value(col: str, val):
236
- if pd.api.types.is_numeric_dtype(usa_data[col]):
237
- # Accept numbers that might come from selectbox as str/float
238
- try:
239
- return pd.to_numeric(val)
240
- except Exception:
241
- return np.nan
242
- else:
243
- return "" if val is None else str(val)
244
-
245
-
246
- # =================== Predict & History ===================
247
- left, right = st.columns([1.1, 0.9], gap="large")
248
-
249
- with left:
250
- st.subheader("Submit a prediction")
251
-
252
- # Preview current candidate
253
- if st.session_state.prepared is not None:
254
- curr = st.session_state.prepared
255
- mode_tag = "Hybrid (2025+2024)" if curr["mode"] == "hybrid" else "Pure 2024"
256
- st.caption(f"Next input mode: **{mode_tag}**")
257
-
258
- with st.expander("Show current input row (after your edits)"):
259
- st.dataframe(curr["X"].T.rename(columns={0: "value"}))
260
-
261
- if curr["mode"] == "hybrid":
262
- rep = curr["report"]
263
- st.caption(
264
- f"Data completion: **{rep['n_2025']}** from 2025 "
265
- f"({rep['pct_2025']:.1f}%); **{rep['n_2024']}** from 2024 "
266
- f"({rep['pct_2024']:.1f}%)"
267
- )
268
-
269
- submitted = st.button("Submit & Predict", type="primary", use_container_width=True)
270
-
271
- if submitted and st.session_state.prepared is not None:
272
- curr = st.session_state.prepared
273
- X_one = curr["X"]
274
- y_true = curr["y_true"]
275
-
276
- # (Optional) Final dtype alignment just before predict
277
- try:
278
- # Align numeric dtypes to training schema to be safe
279
- for col in X_one.columns:
280
- if pd.api.types.is_numeric_dtype(usa_data[col]):
281
- X_one[col] = pd.to_numeric(X_one[col], errors="coerce")
282
- else:
283
- X_one[col] = X_one[col].astype(str).fillna("")
284
- except Exception:
285
- pass
286
-
287
- try:
288
- y_pred = float(pipe.predict(X_one)[0])
289
- except Exception as e:
290
- st.error(f"Prediction failed: {e}")
291
- y_pred = None
292
-
293
- if y_pred is not None:
294
- st.success(f"**Predicted CompTotal:** {y_pred:,.0f} USD")
295
-
296
- if y_true is not None:
297
- st.info(f"**2025 true:** {y_true:,.0f} USD")
298
-
299
- abs_err = abs(y_pred - y_true)
300
- pct_err = abs_err / y_true * 100 if y_true != 0 else np.nan
301
-
302
- st.write(f"**Absolute error:** {abs_err:,.0f} USD")
303
- st.write(f"**Percentage error:** {pct_err:.2f}%")
304
-
305
- st.session_state.history.append(
306
- dict(pred=y_pred, truth=y_true, abs_err=abs_err, pct_err=pct_err)
307
- )
308
- else:
309
- st.warning("No ground-truth value available for this input (pure 2024 synthetic).")
310
-
311
- # Prepare a new random candidate (and your edits will apply to the new one next round)
312
- rng = new_rng()
313
- if mode.startswith("Hybrid"):
314
- X_one2, y_true2, src_info2, rep2 = build_synthetic_row_with_trace(usa_25, usa_data, label=LABEL, rng=rng)
315
- st.session_state.prepared = dict(X=X_one2, y_true=y_true2, info=src_info2, report=rep2, mode="hybrid")
316
- else:
317
- X_2024b = random_sample_row_2024(usa_data, LABEL, rng)
318
- st.session_state.prepared = dict(X=X_2024b, y_true=None, info={}, report={}, mode="pure2024")
319
- st.toast("New random input prepared.", icon="✨")
320
-
321
- with right:
322
- st.subheader("Results history")
323
- if len(st.session_state.history) == 0:
324
- st.write("No submissions yet.")
325
- else:
326
- hist_df = pd.DataFrame(st.session_state.history)
327
- st.dataframe(
328
- hist_df.style.format({"pred": "{:,.0f}", "truth": "{:,.0f}", "abs_err": "{:,.0f}", "pct_err": "{:.2f}"}),
329
- use_container_width=True
330
- )
331
-
332
- valid = hist_df.dropna(subset=["truth"])
333
- if len(valid) > 0:
334
- mae = valid["abs_err"].mean()
335
- mape = valid["pct_err"].mean()
336
- st.metric(label="Mean Absolute Error (USD)", value=f"{mae:,.0f}")
337
- st.metric(label="Mean Absolute Percentage Error", value=f"{mape:.2f}%")
338
- else:
339
- st.write("No entries with ground truth yet.")
340
-
341
- st.markdown("---")
342
- st.caption("Tip: Use the sidebar **Reload new random data** to resample without submitting.")
343
- # =================== Editable input row UI ===================
344
- st.subheader("Edit current input (optional)")
345
- if st.session_state.prepared is not None:
346
- curr = st.session_state.prepared
347
- X_row = curr["X"].iloc[0].copy()
348
-
349
- # Two tabs for readability
350
- tab_cat, tab_num = st.tabs(["Categorical features", "Numeric features"])
351
-
352
- # Build lists
353
- cat_cols = [c for c in X_row.index if not pd.api.types.is_numeric_dtype(usa_data[c])]
354
- num_cols = [c for c in X_row.index if pd.api.types.is_numeric_dtype(usa_data[c])]
355
-
356
- with tab_cat:
357
- st.caption("Pick from common categories (precomputed). Your current value is preselected.")
358
- for col in cat_cols:
359
- choices = CHOICES_DICT.get(col, [])
360
- # ensure current value is present
361
- curr_val = "" if pd.isna(X_row[col]) else str(X_row[col])
362
- if curr_val not in choices and curr_val != "":
363
- choices = [curr_val] + choices
364
- sel = st.selectbox(
365
- label=col,
366
- options=choices if len(choices) > 0 else [""],
367
- index=0 if len(choices) == 0 else (choices.index(curr_val) if curr_val in choices else 0),
368
- key=f"edit_cat_{col}",
369
- )
370
- X_row[col] = _coerce_value(col, sel)
371
-
372
- with tab_num:
373
- st.caption("Pick typical numeric values (percentiles/rounded) or keep current.")
374
- for col in num_cols:
375
- choices = CHOICES_DICT.get(col, [])
376
- curr_val = X_row[col]
377
- # ensure current value is present and cast to int for display if close to int
378
- if pd.isna(curr_val):
379
- curr_val = choices[0] if len(choices) else 0
380
- # make sure current is in choices
381
- if len(choices) == 0:
382
- choices = [curr_val]
383
- elif curr_val not in choices:
384
- choices = [curr_val] + choices
385
- sel = st.selectbox(
386
- label=col,
387
- options=choices,
388
- index=choices.index(curr_val) if curr_val in choices else 0,
389
- key=f"edit_num_{col}",
390
- )
391
- X_row[col] = _coerce_value(col, sel)
392
-
393
- # Save edits back
394
- st.session_state.prepared["X"].iloc[0] = X_row
395
-
396
-
 
1
+ # streamlit_app.py
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import numpy as np
5
+ import joblib
6
+ from typing import Tuple, Dict, Any, List
7
+
8
# Page-level settings; kept ahead of every other Streamlit call in this script.
st.set_page_config(
    page_title="USA Salary — Synthetic/Hybrid Prediction",
    layout="wide",
)

# --- Top nav: one link button per teammate, each in its own column ---
c1, c2, c3 = st.columns(3)
_nav_targets = (
    (c1, "Hamna", "https://project-ytgzknejj7pcncnqxbcwua.streamlit.app/"),
    (c2, "Mahesh", "https://example.com/mahesh"),  # TODO: replace with real link
    (c3, "Tian", "https://www.canva.com/design/DAG1MBF-wU0/8rWZE_GCqcqBnNCoEECpGw/edit?utm_content=DAG1MBF-wU0&utm_campaign=designshare&utm_medium=link2&utm_source=sharebutton"),
)
for _slot, _who, _url in _nav_targets:
    with _slot:
        st.link_button(_who, _url)

st.title("USA Salary — Predict with Hybrid/Synthetic Inputs")
20
+
21
+ # =================== Load assets ===================
22
@st.cache_resource(show_spinner=False)
def _load_assets():
    """Read the fitted model pipeline and the two survey tables from disk.

    Decorated with ``st.cache_resource`` (spinner disabled).

    Returns:
        A ``(pipe, usa_2024, usa_2025)`` tuple: the object loaded from
        ``final_xgbr_usa_model.pkl`` followed by the data frames parsed from
        ``usa_salary_data.csv`` and ``2025_survey.csv``.
    """
    model_path = "final_xgbr_usa_model.pkl"
    csv_paths = ("usa_salary_data.csv", "2025_survey.csv")

    # NOTE: joblib.load unpickles the file, so only ship a trusted model artifact.
    pipe = joblib.load(model_path)
    usa_2024, usa_2025 = (pd.read_csv(path) for path in csv_paths)
    return pipe, usa_2024, usa_2025


# Module-level handles used throughout the rest of the script.
pipe, usa_data, usa_25 = _load_assets()
LABEL = "CompTotal"  # target column; never offered as an editable feature
31
+
32
+ # =================== RNG utilities ===================
33
def _ensure_rng():
    """Return a Generator seeded from a child of the session's SeedSequence.

    The root ``SeedSequence`` is created and stored under
    ``st.session_state.rng_seedseq`` the first time this runs in a session;
    the stored sequence itself is not replaced here, only spawned from.
    """
    state = st.session_state
    if "rng_seedseq" not in state:
        state.rng_seedseq = np.random.SeedSequence()
    (child_seq,) = state.rng_seedseq.spawn(1)
    return np.random.default_rng(child_seq)
38
+
39
def new_rng():
    """Advance the session seed sequence and return a Generator built on it.

    The sequence stored in ``st.session_state.rng_seedseq`` is replaced by
    one of its own spawned children, and the returned generator is seeded
    from that new sequence.

    Returns:
        A ``numpy.random.Generator``.
    """
    if "rng_seedseq" not in st.session_state:
        # Without this guard the attribute read below fails whenever
        # new_rng() is the first RNG helper called in a session.
        st.session_state.rng_seedseq = np.random.SeedSequence()
    st.session_state.rng_seedseq = st.session_state.rng_seedseq.spawn(1)[0]
    return np.random.default_rng(st.session_state.rng_seedseq)
42
+
43
+ # =================== Precompute dropdown choices (ONCE) ===================
44
@st.cache_resource(show_spinner=False)
def _precompute_choices(usa_2024: pd.DataFrame, usa_2025: pd.DataFrame, label: str) -> Dict[str, List]:
    """Map every non-label 2024 column to the list of values offered for editing.

    The 2025 frame is reindexed onto the 2024 columns and stacked under the
    2024 rows, then each column is summarised:

    - Columns that are non-numeric in 2024: the 40 most frequent string
      values (``[""]`` when nothing is left after dropping "nan").
    - Columns that are numeric in 2024: a fixed percentile grid merged with
      the 10 most frequent values after rounding to the nearest 1000, all
      truncated to ``int``, de-duplicated, sorted and capped at 50 entries
      (``[0]`` when the column has no parsable number).
    """
    percentile_grid = [5, 10, 20, 30, 40, 50, 60, 70, 80, 90, 95]

    aligned_2025 = usa_2025.reindex(columns=usa_2024.columns, fill_value=np.nan)
    both = pd.concat([usa_2024, aligned_2025], axis=0, ignore_index=True)

    CHOICES: Dict[str, List] = {}
    # The label column is skipped so it is never exposed for editing.
    for col in (name for name in usa_2024.columns if name != label):
        s = both[col]

        if not pd.api.types.is_numeric_dtype(usa_2024[col]):
            # Stringify first, then treat the literal "nan" as missing.
            labels = s.astype(str).replace({"nan": None}).dropna()
            if len(labels) == 0:
                CHOICES[col] = [""]
            else:
                CHOICES[col] = labels.value_counts().head(40).index.tolist()
            continue

        numbers = pd.to_numeric(s, errors="coerce").dropna()
        if len(numbers) == 0:
            CHOICES[col] = [0]
            continue

        quantiles = np.percentile(numbers, percentile_grid).tolist()

        # NOTE(review): the nearest-1000 rounding looks tuned for
        # salary-sized values — confirm it suits small-valued columns.
        to_thousand = (np.round(numbers / 1000) * 1000).astype(int)
        frequent = to_thousand.value_counts().head(10).index.astype(int).tolist()

        # Plain Python ints (not numpy scalars), unique and ascending.
        candidates = {int(v) for v in quantiles + frequent}
        CHOICES[col] = sorted(candidates)[:50]
    return CHOICES


CHOICES_DICT = _precompute_choices(usa_data, usa_25, LABEL)
86
+
87
+ # =================== Core sampling funcs (your logic) ===================
88
+ def _sample_from_2024(colname: str, df_2024: pd.DataFrame, rng: np.random.Generator):
89
+ series = df_2024[colname].dropna()
90
+ if series.empty:
91
+ if pd.api.types.is_numeric_dtype(df_2024[colname]):
92
+ return 0
93
+ return ""
94
+ return series.sample(1, random_state=rng.integers(0, 10_000)).iloc[0]
95
+
96
+ def build_synthetic_row_with_trace(
97
+ usa_25: pd.DataFrame,
98
+ usa_2024: pd.DataFrame,
99
+ label: str = "CompTotal",
100
+ rng: np.random.Generator | None = None
101
+ ) -> Tuple[pd.DataFrame, float | None, Dict[str, str], Dict[str, Any]]:
102
+ if rng is None:
103
+ rng = _ensure_rng()
104
+
105
+ expected_features = [c for c in usa_2024.columns if c != label]
106
+ row25 = usa_25.sample(1, random_state=rng.integers(0, 10_000)).iloc[0]
107
+
108
+ synthetic = {}
109
+ source_info: Dict[str, str] = {}
110
+
111
+ for col in expected_features:
112
+ use_25_val = False
113
+ val = None
114
+
115
+ if col in row25.index:
116
+ val = row25[col]
117
+ if pd.api.types.is_numeric_dtype(usa_2024[col]):
118
+ val = pd.to_numeric(val, errors="coerce")
119
+ if not pd.isna(val):
120
+ use_25_val = True
121
+
122
+ if not use_25_val:
123
+ val = _sample_from_2024(col, usa_2024, rng)
124
+ source_info[col] = "2024"
125
+ else:
126
+ source_info[col] = "2025"
127
+
128
+ if pd.api.types.is_numeric_dtype(usa_2024[col]):
129
+ val = pd.to_numeric(val, errors="coerce")
130
+ if pd.isna(val):
131
+ val = _sample_from_2024(col, usa_2024, rng)
132
+ source_info[col] = "2024"
133
+ else:
134
+ if pd.isna(val):
135
+ val = _sample_from_2024(col, usa_2024, rng)
136
+ source_info[col] = "2024"
137
+ if not isinstance(val, str):
138
+ val = str(val)
139
+
140
+ synthetic[col] = val
141
+
142
+ X_one = pd.DataFrame([synthetic], columns=expected_features)
143
+
144
+ y_true = None
145
+ if label in row25.index:
146
+ y_true = pd.to_numeric(row25[label], errors="coerce")
147
+ if pd.isna(y_true):
148
+ y_true = None
149
+
150
+ used_from_2025 = sum(v == "2025" for v in source_info.values())
151
+ used_from_2024 = sum(v == "2024" for v in source_info.values())
152
+ total = len(source_info)
153
+ report = dict(
154
+ total_features=total,
155
+ n_2025=used_from_2025,
156
+ n_2024=used_from_2024,
157
+ pct_2025=0.0 if total == 0 else used_from_2025 / total * 100,
158
+ pct_2024=0.0 if total == 0 else used_from_2024 / total * 100,
159
+ filled_cols=[c for c, s in source_info.items() if s == "2024"]
160
+ )
161
+ return X_one, y_true, source_info, report
162
+
163
def random_sample_row_2024(
    df_2024: pd.DataFrame,
    label: str,
    rng: np.random.Generator
) -> pd.DataFrame:
    """Assemble a one-row frame by sampling every feature column independently.

    Each column of ``df_2024`` except ``label`` gets its own draw from
    ``_sample_from_2024``, in column order, so the row mixes values from
    different respondents.
    """
    feature_names = [name for name in df_2024.columns if name != label]
    drawn = {name: _sample_from_2024(name, df_2024, rng) for name in feature_names}
    return pd.DataFrame([drawn], columns=feature_names)
173
+
174
+ # =================== Session state ===================
175
# Create the per-session slots on the first run only; existing values are kept.
#   history  -> list of dicts: pred, truth, abs_err, pct_err
#   prepared -> dict: X, y_true, info, report, mode (None until a row is built)
for _slot_name, _initial in (("history", []), ("prepared", None)):
    if _slot_name not in st.session_state:
        st.session_state[_slot_name] = _initial
180
+
181
+ # =================== Sidebar ===================
182
st.sidebar.header("Controls")
_mode_options = ["Hybrid (Random 2025 + fill from 2024)", "Pure 2024 synthetic"]
mode = st.sidebar.radio("Choose input mode:", _mode_options, index=0)

# Discard the pending candidate and build a fresh one in the selected mode.
reload_clicked = st.sidebar.button("Reload new random data")
if reload_clicked:
    rng = new_rng()
    if not mode.startswith("Hybrid"):
        X_2024 = random_sample_row_2024(usa_data, LABEL, rng)
        # Pure-2024 rows carry no ground truth and no provenance report.
        st.session_state.prepared = {
            "X": X_2024, "y_true": None, "info": {}, "report": {}, "mode": "pure2024",
        }
    else:
        X_one, y_true, src_info, rep = build_synthetic_row_with_trace(
            usa_25, usa_data, label=LABEL, rng=rng
        )
        st.session_state.prepared = {
            "X": X_one, "y_true": y_true, "info": src_info, "report": rep, "mode": "hybrid",
        }
    st.toast("New random input prepared.", icon="✅")
198
+
199
+ # =================== Explanation ===================
200
# Help text shown in a collapsible panel, followed by a horizontal rule.
_EXPLANATION_MD = """
**Goal:** Create one model-ready row and predict **CompTotal(Annual Income)**.
**dataset** trained on: USA data from 2024 stackoverflow survey.
predict on: synthetic/hybrid data mixing 2025 & 2024.
**Modes:**
1) **Hybrid** — Pick a random 2025 respondent; for any missing required feature, fill using a value sampled from the 2024 distribution for that column.
2) **Pure 2024** — Build an entirely synthetic row, sampling every feature from the 2024 distribution.

**Editing:**
Below, you can adjust the *current* row via dropdowns. All dropdown choices were **precomputed once at startup** from the union of 2024 & 2025 data (to keep the app fast).
- **Categorical**: most frequent categories (+ always includes the current value).
- **Numeric**: percentile grid & common rounded values (+ current value).

Click **Submit & Predict** to see predicted vs. true (if available), absolute and percentage errors, and running averages.
"""

with st.expander("What is happening here? "):
    st.markdown(_EXPLANATION_MD)

st.markdown("---")
220
+
221
+ # =================== Prepare first candidate if needed ===================
222
def _prepare_candidate_if_needed():
    """Build the first candidate row when the session has none pending.

    Does nothing if ``st.session_state.prepared`` is already set. Otherwise
    the row is built according to the sidebar ``mode`` and stored under
    ``st.session_state.prepared``.
    """
    if st.session_state.prepared is not None:
        return

    rng = _ensure_rng()
    if not mode.startswith("Hybrid"):
        X_2024 = random_sample_row_2024(usa_data, LABEL, rng)
        st.session_state.prepared = {
            "X": X_2024, "y_true": None, "info": {}, "report": {}, "mode": "pure2024",
        }
        return

    X_one, y_true, src_info, rep = build_synthetic_row_with_trace(
        usa_25, usa_data, label=LABEL, rng=rng
    )
    st.session_state.prepared = {
        "X": X_one, "y_true": y_true, "info": src_info, "report": rep, "mode": "hybrid",
    }


_prepare_candidate_if_needed()
233
+
234
+ # =================== Helper: coerce edited values to correct dtype ===================
235
def _coerce_value(col: str, val):
    """Convert an edited widget value to the kind of value ``col`` holds in 2024.

    Columns that are numeric in ``usa_data`` go through ``pd.to_numeric`` and
    fall back to NaN on any failure; every other column yields ``str(val)``,
    with ``None`` mapped to the empty string.
    """
    if not pd.api.types.is_numeric_dtype(usa_data[col]):
        return str(val) if val is not None else ""

    # The select box may hand back a str or float; anything unparsable -> NaN.
    try:
        number = pd.to_numeric(val)
    except Exception:
        number = np.nan
    return number
244
+
245
+
246
+ # =================== Predict & History ===================
247
left, right = st.columns([1.1, 0.9], gap="large")

with left:
    st.subheader("Submit a prediction")

    # Preview of the candidate row that the next submit will score.
    if st.session_state.prepared is not None:
        curr = st.session_state.prepared
        mode_tag = "Hybrid (2025+2024)" if curr["mode"] == "hybrid" else "Pure 2024"
        st.caption(f"Next input mode: **{mode_tag}**")

        with st.expander("Show current input row (after your edits)"):
            st.dataframe(curr["X"].T.rename(columns={0: "value"}))

        if curr["mode"] == "hybrid":
            rep = curr["report"]
            st.caption(
                f"Data completion: **{rep['n_2025']}** from 2025 "
                f"({rep['pct_2025']:.1f}%); **{rep['n_2024']}** from 2024 "
                f"({rep['pct_2024']:.1f}%)"
            )

    submitted = st.button("Submit & Predict", type="primary", use_container_width=True)

    if submitted and st.session_state.prepared is not None:
        curr = st.session_state.prepared
        # Work on a copy so dtype alignment never rewrites the stored row.
        X_one = curr["X"].copy()
        y_true = curr["y_true"]

        # Align each column with the 2024 schema just before predicting.
        # Failures are reported per column instead of being swallowed.
        for col in X_one.columns:
            try:
                if pd.api.types.is_numeric_dtype(usa_data[col]):
                    X_one[col] = pd.to_numeric(X_one[col], errors="coerce")
                else:
                    # Fill before casting: astype(str) would otherwise turn a
                    # missing value into the literal string "nan".
                    X_one[col] = X_one[col].fillna("").astype(str)
            except (TypeError, ValueError) as exc:
                st.warning(f"Could not align dtype for column '{col}': {exc}")

        try:
            y_pred = float(pipe.predict(X_one)[0])
        except Exception as e:
            # UI boundary: show the failure rather than crash the app.
            st.error(f"Prediction failed: {e}")
            y_pred = None

        if y_pred is not None:
            st.success(f"**Predicted CompTotal:** {y_pred:,.0f} USD")

            if y_true is not None:
                st.info(f"**2025 true:** {y_true:,.0f} USD")

                abs_err = abs(y_pred - y_true)
                # Divide by the magnitude so the percentage stays non-negative.
                pct_err = abs_err / abs(y_true) * 100 if y_true != 0 else np.nan

                st.write(f"**Absolute error:** {abs_err:,.0f} USD")
                st.write(f"**Percentage error:** {pct_err:.2f}%")

                st.session_state.history.append(
                    dict(pred=y_pred, truth=y_true, abs_err=abs_err, pct_err=pct_err)
                )
            elif curr["mode"] == "hybrid":
                # A hybrid row has no truth when the sampled 2025 row lacks a usable label.
                st.warning("No ground-truth value available for this input (sampled 2025 row has no usable CompTotal).")
            else:
                st.warning("No ground-truth value available for this input (pure 2024 synthetic).")

        # Queue a fresh random candidate for the next round.
        rng = new_rng()
        if mode.startswith("Hybrid"):
            X_one2, y_true2, src_info2, rep2 = build_synthetic_row_with_trace(usa_25, usa_data, label=LABEL, rng=rng)
            st.session_state.prepared = dict(X=X_one2, y_true=y_true2, info=src_info2, report=rep2, mode="hybrid")
        else:
            X_2024b = random_sample_row_2024(usa_data, LABEL, rng)
            st.session_state.prepared = dict(X=X_2024b, y_true=None, info={}, report={}, mode="pure2024")
        st.toast("New random input prepared.", icon="✨")
320
+
321
with right:
    st.subheader("Results history")
    past_runs = st.session_state.history
    if not past_runs:
        st.write("No submissions yet.")
    else:
        number_formats = {
            "pred": "{:,.0f}",
            "truth": "{:,.0f}",
            "abs_err": "{:,.0f}",
            "pct_err": "{:.2f}",
        }
        hist_df = pd.DataFrame(past_runs)
        st.dataframe(hist_df.style.format(number_formats), use_container_width=True)

        # Running averages over the entries that carry a ground-truth value.
        valid = hist_df.dropna(subset=["truth"])
        if len(valid) == 0:
            st.write("No entries with ground truth yet.")
        else:
            mae = valid["abs_err"].mean()
            mape = valid["pct_err"].mean()
            st.metric(label="Mean Absolute Error (USD)", value=f"{mae:,.0f}")
            st.metric(label="Mean Absolute Percentage Error", value=f"{mape:.2f}%")

st.markdown("---")
st.caption("Tip: Use the sidebar **Reload new random data** to resample without submitting.")
343
+ # =================== Editable input row UI ===================
344
st.subheader("Edit current input (optional)")
if st.session_state.prepared is not None:
    curr = st.session_state.prepared
    X_row = curr["X"].iloc[0].copy()

    # One tab per feature kind keeps the long widget list readable.
    tab_cat, tab_num = st.tabs(["Categorical features", "Numeric features"])

    # Split features by the dtype their column has in the 2024 data.
    cat_cols, num_cols = [], []
    for c in X_row.index:
        bucket = num_cols if pd.api.types.is_numeric_dtype(usa_data[c]) else cat_cols
        bucket.append(c)

    with tab_cat:
        st.caption("Pick from common categories (precomputed). Your current value is preselected.")
        for col in cat_cols:
            choices = CHOICES_DICT.get(col, [])
            curr_val = "" if pd.isna(X_row[col]) else str(X_row[col])
            # Put the current value first when it is not a precomputed choice
            # (builds a new list, leaving the cached one untouched).
            if curr_val != "" and curr_val not in choices:
                choices = [curr_val] + choices

            if len(choices) > 0:
                options = choices
                start = choices.index(curr_val) if curr_val in choices else 0
            else:
                options, start = [""], 0

            sel = st.selectbox(
                label=col,
                options=options,
                index=start,
                key=f"edit_cat_{col}",
            )
            X_row[col] = _coerce_value(col, sel)

    with tab_num:
        st.caption("Pick typical numeric values (percentiles/rounded) or keep current.")
        for col in num_cols:
            choices = CHOICES_DICT.get(col, [])
            curr_val = X_row[col]
            # A missing current value falls back to the first choice (or 0).
            if pd.isna(curr_val):
                curr_val = choices[0] if len(choices) else 0
            # Guarantee the current value is selectable.
            if len(choices) == 0:
                choices = [curr_val]
            elif curr_val not in choices:
                choices = [curr_val] + choices

            start = choices.index(curr_val) if curr_val in choices else 0
            sel = st.selectbox(
                label=col,
                options=choices,
                index=start,
                key=f"edit_num_{col}",
            )
            X_row[col] = _coerce_value(col, sel)

    # Write the edited values back into the stored candidate row.
    curr["X"].iloc[0] = X_row
395
+
396
+