Spaces:

Ashish-K
/

ITBA_Streamlit

Sleeping

App Files Files Community

ITBA_Streamlit / app2.py

Ashish-K

Update app2.py

ee1785b verified 8 months ago

raw

history blame contribute delete

27.2 kB

	# app2.py
	# Intro to Business Analytics - With Python. Module 1 (Upload-Only, Coeff Tables + What-If)

	import os
	import io
	import csv
	import re
	import numpy as np
	import pandas as pd
	import streamlit as st

	from typing import List, Tuple
	from sklearn.model_selection import train_test_split
	from sklearn.preprocessing import StandardScaler, OneHotEncoder
	from sklearn.compose import ColumnTransformer
	from sklearn.pipeline import Pipeline
	from sklearn.impute import SimpleImputer
	from sklearn.linear_model import LinearRegression, LogisticRegression
	from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, accuracy_score, roc_auc_score

	# ---------- Helpers ----------
	def rmse_compat(y_true, y_pred):
	    """Return the root-mean-squared error between *y_true* and *y_pred*.

	    The value is computed directly as ``sqrt(mean((y_true - y_pred) ** 2))``
	    so it does not depend on the installed scikit-learn version: the
	    ``squared=False`` keyword of ``mean_squared_error`` warns on some
	    releases and is rejected by others, and probing it by exception made
	    the result path version-dependent.

	    Args:
	        y_true: Array-like of observed values (list, ndarray, Series, ...).
	        y_pred: Array-like of predictions with the same number of elements.

	    Returns:
	        The RMSE as a built-in ``float``.

	    Raises:
	        ValueError: If the inputs are empty or differ in element count.
	    """
	    actual = np.asarray(y_true, dtype=float)
	    predicted = np.asarray(y_pred, dtype=float)
	    if actual.shape != predicted.shape:
	        if actual.size != predicted.size:
	            raise ValueError("y_true and y_pred must have the same number of elements")
	        # e.g. (n,) vs (n, 1): flatten so subtraction cannot broadcast to (n, n)
	        actual, predicted = actual.ravel(), predicted.ravel()
	    if actual.size == 0:
	        raise ValueError("y_true and y_pred must not be empty")
	    return float(np.sqrt(np.mean((actual - predicted) ** 2)))

	def _coerce_date_columns(df, desired_parse_cols):
	    """Convert the requested columns that exist in *df* to datetime (unparseable -> NaT)."""
	    for col in desired_parse_cols or []:
	        if col in df.columns:
	            df[col] = pd.to_datetime(df[col], errors="coerce")
	    return df


	def load_uploaded_csv_or_excel(up, desired_parse_cols):
	    """Read an uploaded file as Excel or CSV and return it as a DataFrame.

	    Excel is chosen by file extension. For CSV the first 8 KiB are used to
	    guess the encoding (UTF-8, else latin-1) and the delimiter (one of
	    ``, ; TAB |``, defaulting to a comma). Columns listed in
	    ``desired_parse_cols`` are converted to datetime only when present.

	    Args:
	        up: Seekable binary file-like object with a ``name`` attribute.
	        desired_parse_cols: Column names to convert to datetime, or ``None``.

	    Returns:
	        The parsed DataFrame. On an unreadable or empty file an error is
	        shown through Streamlit and an empty DataFrame is returned.
	    """
	    import codecs

	    name = (getattr(up, "name", "") or "").lower()

	    # Excel first
	    if name.endswith((".xlsx", ".xls")):
	        up.seek(0)
	        try:
	            df = pd.read_excel(up)
	        except Exception as exc:
	            # Missing engine or corrupted workbook: report like the CSV path does
	            st.error(f"Could not read the uploaded Excel file: {exc}")
	            return pd.DataFrame()
	        return _coerce_date_columns(df, desired_parse_cols)

	    # CSV: sniff delimiter + encoding from the head of the file
	    up.seek(0)
	    head_bytes = up.read(8192)
	    try:
	        # final=False tolerates a multi-byte character cut in half by the
	        # 8 KiB window; a strict one-shot decode would reject valid UTF-8.
	        head_text = codecs.getincrementaldecoder("utf-8")().decode(head_bytes, final=False)
	        encoding_used = "utf-8"
	    except UnicodeDecodeError:
	        head_text = head_bytes.decode("latin-1", errors="replace")
	        encoding_used = "latin-1"
	    head_text = head_text.lstrip("\ufeff")  # a BOM is not part of the data

	    try:
	        sep = csv.Sniffer().sniff(head_text, delimiters=",;\t|").delimiter
	    except csv.Error:
	        sep = ","

	    # Read once without parse_dates
	    up.seek(0)
	    try:
	        df = pd.read_csv(up, sep=sep, encoding=encoding_used)
	    except pd.errors.EmptyDataError:
	        st.error("The uploaded file appears to be empty.")
	        return pd.DataFrame()
	    except Exception as first_error:
	        # Last resort: python engine with automatic separator detection.
	        # If bytes beyond the sniffed head were not valid UTF-8, retry as
	        # latin-1, which can decode any byte sequence.
	        retry_encoding = "latin-1" if isinstance(first_error, UnicodeDecodeError) else encoding_used
	        up.seek(0)
	        try:
	            df = pd.read_csv(up, sep=None, engine="python", encoding=retry_encoding)
	        except Exception:
	            st.error("Could not read the uploaded file as CSV or Excel. It might use an unusual delimiter/encoding, or be corrupted.")
	            st.caption("First 256 bytes (for debugging):")
	            st.code(head_text[:256])
	            return pd.DataFrame()

	    # Convert only present date cols
	    return _coerce_date_columns(df, desired_parse_cols)

	def infer_types(df: pd.DataFrame):
	    """Split the columns of *df* into numeric, categorical and datetime names.

	    Datetime detection uses ``pd.api.types.is_datetime64_any_dtype`` so that
	    time-zone-aware columns in any zone and non-nanosecond resolutions are
	    recognised; matching only ``datetime64[ns]`` / ``datetime64[ns, UTC]``
	    pushed those columns into the categorical list.

	    Args:
	        df: Frame whose column dtypes are inspected.

	    Returns:
	        A tuple ``(num, cat, dt)`` of column-name lists. ``cat`` holds every
	        column that is neither numeric nor datetime, in frame order.
	    """
	    num = df.select_dtypes(include=[np.number]).columns.tolist()
	    dt = [col for col, dtype in df.dtypes.items() if pd.api.types.is_datetime64_any_dtype(dtype)]
	    # One set lookup per column instead of rebuilding ``num + dt`` each time
	    typed = set(num) | set(dt)
	    cat = [c for c in df.columns if c not in typed]
	    return num, cat, dt

	def winsorize_series(s: pd.Series, lower_q: float, upper_q: float) -> pd.Series:
	    """Clamp *s* to the band between its ``lower_q`` and ``upper_q`` quantiles.

	    Values below the lower quantile are raised to it and values above the
	    upper quantile are lowered to it; the input series is not modified.
	    """
	    floor_value = s.quantile(lower_q)
	    ceiling_value = s.quantile(upper_q)
	    clipped = s.clip(lower=floor_value, upper=ceiling_value)
	    return clipped

	def build_preprocessor(feature_names, df_context, standardize: bool):
	    """Assemble the ColumnTransformer used in front of the models.

	    Features that are entirely NaN in ``df_context`` are left out. Numeric
	    features get median imputation and, when ``standardize`` is true, a
	    StandardScaler without centering; all other features get
	    most-frequent imputation followed by one-hot encoding with the first
	    level dropped. Columns not listed are dropped by the transformer.
	    """
	    usable = [col for col in feature_names if df_context[col].notna().any()]
	    numeric_cols = [col for col in usable if pd.api.types.is_numeric_dtype(df_context[col])]
	    categorical_cols = [col for col in usable if col not in numeric_cols]

	    # The sparse flag was renamed between scikit-learn releases; try the
	    # newer spelling first and fall back to the older one.
	    encoder_options = {"handle_unknown": "ignore", "drop": "first"}
	    try:
	        encoder = OneHotEncoder(sparse_output=True, **encoder_options)
	    except TypeError:
	        encoder = OneHotEncoder(sparse=True, **encoder_options)

	    column_steps = []
	    if numeric_cols:
	        if standardize:
	            scaling_step = StandardScaler(with_mean=False)
	        else:
	            scaling_step = "passthrough"
	        numeric_pipeline = Pipeline([
	            ("imputer", SimpleImputer(strategy="median")),
	            ("scaler", scaling_step),
	        ])
	        column_steps.append(("num", numeric_pipeline, numeric_cols))
	    if categorical_cols:
	        categorical_pipeline = Pipeline([
	            ("imputer", SimpleImputer(strategy="most_frequent")),
	            ("ohe", encoder),
	        ])
	        column_steps.append(("cat", categorical_pipeline, categorical_cols))
	    return ColumnTransformer(transformers=column_steps, remainder="drop")

	def _names_from_transformer(trans, cols):
	    """Return output names for one fitted transformer, or None if they cannot be derived."""
	    candidates = []
	    # A pipeline that can name its own output chains every step, so it stays
	    # correct when an earlier step changes the number of columns.
	    if hasattr(trans, "get_feature_names_out"):
	        candidates.append(trans)
	    named_steps = getattr(trans, "named_steps", None)
	    if named_steps is not None:
	        # Fallback: the last step that exposes get_feature_names_out
	        for step_name in reversed(list(named_steps)):
	            step = named_steps[step_name]
	            if hasattr(step, "get_feature_names_out"):
	                if step is not trans:
	                    candidates.append(step)
	                break
	    for candidate in candidates:
	        try:
	            return [str(n) for n in candidate.get_feature_names_out(cols)]
	        except Exception:
	            # Naming is best-effort; try the next candidate
	            continue
	    return None


	def get_transformed_feature_names(prep: "ColumnTransformer") -> list:
	    """Recover the output feature names of a fitted ColumnTransformer.

	    For each entry of ``prep.transformers_`` (the ``remainder`` entry and
	    transformers set to ``"drop"`` are skipped) the names come from, in
	    order of preference: the transformer's own ``get_feature_names_out``,
	    the last pipeline step that offers one, or the original column names.

	    Args:
	        prep: Fitted transformer exposing ``transformers_`` as
	            ``(name, transformer, columns)`` triples.

	    Returns:
	        A flat list of output names as strings, in transformer order.
	    """
	    names_out = []
	    for name, trans, cols in prep.transformers_:
	        if name == "remainder":
	            continue
	        if isinstance(trans, str) and trans == "drop":
	            # Dropped columns produce no output features
	            continue
	        resolved = _names_from_transformer(trans, cols)
	        if resolved is None:
	            # fallback to original cols
	            resolved = [str(c) for c in cols]
	        names_out.extend(resolved)
	    return names_out

	def map_out_to_original(prep: "ColumnTransformer", out_names: List[str], orig_num: List[str], orig_cat: List[str]) -> pd.DataFrame:
	    """Map transformed feature names back to the original column they came from.

	    Resolution order for each output name:
	      1. exact match with an original column (numeric features are 1:1);
	      2. the longest categorical column ``c`` with the name starting
	         ``c_`` or ``c=`` (one-hot outputs). Longest-first matters: with
	         columns ``region`` and ``region_code``, ``region_code_X`` must not
	         be attributed to ``region``;
	      3. the longest original column that is a plain prefix of the name;
	      4. otherwise the output name itself.

	    Args:
	        prep: Fitted transformer exposing ``transformers_``; the columns it
	            lists (except ``remainder``) are treated as active.
	        out_names: Transformed feature names.
	        orig_num: Original numeric column names.
	        orig_cat: Original categorical column names.

	    Returns:
	        DataFrame with columns ``feature_out``, ``original_feature`` and
	        ``is_active``; the columns exist even when ``out_names`` is empty.
	    """
	    orig_set = set(orig_num) | set(orig_cat)

	    # Gather columns each transformer handled to mark actives
	    handled_cols = set()
	    for name, _trans, cols in prep.transformers_:
	        if name != "remainder":
	            handled_cols.update(cols)

	    # Stable sort: equally long names keep their original relative order
	    cat_longest_first = sorted(orig_cat, key=lambda c: len(str(c)), reverse=True)

	    rows = []
	    for out in out_names:
	        if out in orig_set:
	            base = out
	        else:
	            base = next(
	                (c for c in cat_longest_first if out.startswith((f"{c}_", f"{c}="))),
	                None,
	            )
	            if base is None:
	                candidates = [c for c in orig_set if out.startswith(str(c))]
	                base = max(candidates, key=lambda c: len(str(c))) if candidates else out
	        rows.append({"feature_out": out, "original_feature": base, "is_active": (base in handled_cols)})
	    return pd.DataFrame(rows, columns=["feature_out", "original_feature", "is_active"])

	# ---------- Page ----------
	# Configure the browser tab (title, icon, wide layout) and render the page title.
	st.set_page_config(page_title="Intro to Business Analytics - Python Demo", page_icon="📈", layout="wide")
	st.title("📈 Intro to Business Analytics - Python Demo")
	# Intro panel, expanded by default, summarising the workflow of the app.
	with st.expander("👋 Getting started / What this app does", expanded=True):
	    st.markdown(
	        "- Upload your data (CSV or Excel).\n"
	        "- Map columns: select date/region and targets.\n"
	        "- Explore: time series, breakdowns, correlations.\n"
	        "- Model: regression or classification with preprocessing (imputation/scaling/OHE).\n"
	        "- What-If: tweak any model features and see predicted impact.\n"
	        "- Downloads: preview, predictions, scores."
	    )

	# ===== Upload (mandatory) =====
	up = st.file_uploader("Upload CSV or Excel", type=["csv", "xlsx", "xls"])
	date_cols_text = st.text_input("Date column(s) to parse (comma-separated)", "")

	# Everything below needs a file: warn and end this script run if none was uploaded.
	if up is None:
	    st.warning("Please upload a CSV or Excel file to proceed.")
	    st.stop()

	# Split the free-text input into trimmed, non-empty column names.
	parse = [c.strip() for c in date_cols_text.split(",") if c.strip()]
	df_raw = load_uploaded_csv_or_excel(up, parse)

	# Clean empty rows/cols early
	df_raw = df_raw.dropna(axis=0, how="all")
	df_raw = df_raw.dropna(axis=1, how="all")

	# The loader returns an empty frame when the file could not be read, so this
	# check covers both unreadable files and files with no usable data.
	if df_raw.empty:
	    st.error("No data after parsing/cleanup. Check the file or delimiter/encoding.")
	    st.stop()

	# ===== Column mapping =====
	# Sidebar selectors that assign roles (date, region, targets) to columns.
	# Every selector lists "<none>" first, hence the "+1" when computing the
	# default index of a preferred column name; "<none>" is mapped to None.
	with st.sidebar.expander("Column Mapping", True):
	    all_cols = df_raw.columns.tolist()

	    # date column (optional); defaults to a column named "week" if present
	    default_date_idx = (all_cols.index("week")+1) if "week" in all_cols else 0
	    date_col = st.selectbox("Date column (optional)", ["<none>"] + all_cols, index=default_date_idx)
	    date_field = None if date_col == "<none>" else date_col

	    # region column (optional); defaults to a column named "region" if present
	    default_region_idx = (all_cols.index("region")+1) if "region" in all_cols else 0
	    region_col = st.selectbox("Region column (optional)", ["<none>"] + all_cols, index=default_region_idx)
	    region_field = None if region_col == "<none>" else region_col

	    # targets: any numeric column may be a regression target; classification
	    # targets are limited to numeric columns whose non-null values are all 0 or 1
	    numeric_cols_all = df_raw.select_dtypes(include=[np.number]).columns.tolist()
	    binary_cols = [c for c in numeric_cols_all if df_raw[c].dropna().isin([0,1]).all()]

	    reg_default_idx = (numeric_cols_all.index("revenue_next_week")+1) if "revenue_next_week" in numeric_cols_all else 0
	    reg_target = st.selectbox("Regression target", ["<none>"] + numeric_cols_all, index=reg_default_idx)
	    reg_target = None if reg_target == "<none>" else reg_target

	    clf_default_idx = (binary_cols.index("churn_4w")+1) if "churn_4w" in binary_cols else 0
	    clf_target = st.selectbox("Classification target (binary 0/1)", ["<none>"] + binary_cols, index=clf_default_idx)
	    clf_target = None if clf_target == "<none>" else clf_target

	# ===== Prep, Filters, Modeling (sidebar) =====
	# Data-preparation knobs: winsorizing quantiles, rolling-mean window and
	# whether rows are aggregated per date.
	with st.sidebar.expander("Data Prep", True):
	    lower_q = st.slider("Outlier floor (winsorize)", 0.0, 0.2, 0.01, step=0.01)
	    upper_q = st.slider("Outlier cap (winsorize)", 0.8, 1.0, 0.99, step=0.01)
	    smoothing = st.slider("Smoothing window (periods)", 1, 8, 1)
	    agg_grain = st.selectbox("Aggregation granularity", ["none (row)", "date"])

	with st.sidebar.expander("Filters", False):
	    # The date picker is offered only when the chosen date column really has
	    # a datetime dtype; in every other case date_range is None (no filter).
	    if date_field and pd.api.types.is_datetime64_any_dtype(df_raw.get(date_field, pd.Series([], dtype="datetime64[ns]"))):
	        min_d, max_d = df_raw[date_field].min(), df_raw[date_field].max()
	        try:
	            date_range = st.date_input("Date range", (min_d.date(), max_d.date()))
	        except Exception:
	            date_range = None
	    else:
	        date_range = None
	    # Region filter: all observed regions are preselected.
	    if region_field:
	        region_vals = sorted(df_raw[region_field].dropna().unique().tolist())
	        regions_sel = st.multiselect("Region filter", region_vals, default=region_vals if region_vals else [])
	    else:
	        regions_sel = []

	# Modeling options consumed by the modeling section further down.
	with st.sidebar.expander("Modeling", False):
	    task = st.selectbox("Task", ["Regression", "Classification"])
	    test_size = st.slider("Test size", 0.1, 0.5, 0.2, step=0.05)
	    standardize = st.checkbox("Standardize numeric features", True)

	# ===== Data overview & prep =====
	# Work on a copy so df_raw keeps the uploaded values for the sidebar widgets.
	df = df_raw.copy()

	# date filter; date_range is None unless the sidebar offered a date picker
	if date_field and date_range:
	    try:
	        start_d, end_d = pd.to_datetime(date_range[0]), pd.to_datetime(date_range[1])
	        # end_d is midnight of the last day: compare against the next midnight
	        # so rows carrying a time of day on the end date are kept
	        df = df[(df[date_field] >= start_d) & (df[date_field] < end_d + pd.Timedelta(days=1))]
	    except Exception:
	        # Deliberately best-effort: while the user is still picking, the range
	        # holds a single date, and tz-aware columns cannot be compared with
	        # naive bounds. Leave the data unfiltered in those cases.
	        pass

	# region filter
	if region_field and regions_sel:
	    df = df[df[region_field].isin(regions_sel)]

	# Columns that identify a row's period/group must never be clipped or averaged
	key_cols = [c for c in (date_field, region_field) if c is not None]

	# types & winsorize
	num_cols, cat_cols, dt_cols = infer_types(df)
	for c in num_cols:
	    if c in key_cols:
	        continue
	    # 0/1 indicator columns (including a classification target) stay as-is:
	    # quantile clipping can erase a rare class or create fractional values
	    if df[c].dropna().isin([0, 1]).all():
	        continue
	    try:
	        df[c] = winsorize_series(df[c], lower_q, upper_q)
	    except (TypeError, ValueError):
	        # best-effort: a column that cannot be clipped is left unchanged
	        pass

	# aggregation
	if agg_grain == "date" and (date_field in df.columns):
	    group_cols = [date_field] + ([region_field] if (region_field in df.columns and region_field != date_field) else [])
	    # Grouping keys are excluded: aggregating a key column as well makes
	    # groupby(..., as_index=False) fail when it re-inserts the key
	    agg = {c: ("mean" if ("rate" in str(c) or "pct" in str(c)) else "sum") for c in num_cols if c not in group_cols}
	    if agg:
	        df = df.groupby(group_cols, as_index=False).agg(agg)
	        # Non-numeric, non-key columns are gone now; refresh the type lists
	        num_cols, cat_cols, dt_cols = infer_types(df)

	# smoothing
	if smoothing > 1 and (date_field in df.columns):
	    df = df.sort_values(date_field).reset_index(drop=True)
	    per_region = (region_field in df.columns) and (region_field != date_field)
	    for c in num_cols:
	        if c in key_cols:
	            continue
	        try:
	            if per_region:
	                # Roll within each region so neighbouring regions do not leak
	                # into each other's moving average
	                df[c] = df.groupby(region_field, sort=False, dropna=False)[c].transform(
	                    lambda s: s.rolling(smoothing, min_periods=1).mean()
	                )
	            else:
	                df[c] = df[c].rolling(smoothing, min_periods=1).mean()
	        except (TypeError, ValueError):
	            # best-effort: leave the column unsmoothed
	            pass

	# Overview
	# Four headline counters for the prepared frame, then a collapsible preview.
	st.subheader("📁 Data Overview")
	c1, c2, c3, c4 = st.columns(4)
	with c1: st.metric("Rows", f"{len(df):,}")
	with c2: st.metric("Columns", f"{len(df.columns):,}")
	with c3: st.metric("Numeric", f"{len(num_cols):,}")
	with c4: st.metric("Categorical", f"{len(cat_cols):,}")
	# NOTE(review): the original indentation was lost; the download button is
	# assumed to sit inside the expander next to the preview table - confirm.
	with st.expander("Preview dataframe"):
	    # On screen: first 50 rows; in the download: first 1000 rows, both rounded to 2 decimals.
	    st.dataframe(df.head(50).round(2), use_container_width=True)
	    csv_preview = df.head(1000).round(2).to_csv(index=False).encode("utf-8")
	    st.download_button("Download preview (CSV)", data=csv_preview, file_name="preview.csv", mime="text/csv")

	# ===== Descriptive =====
	# Three tabs: a metric over time, totals per region and a correlation table.
	st.header("1) Descriptive Analytics")
	tab_ts, tab_break, tab_corr = st.tabs(["Time Series","Breakdowns","Correlations"])

	with tab_ts:
	    if date_field in df.columns:
	        y_opt = [c for c in num_cols if c != date_field]
	        if y_opt:
	            # Preselect the second metric when there is one
	            y_metric = st.selectbox("Metric", y_opt, index=min(1, len(y_opt)-1))
	            by = st.multiselect("Stratify by", [region_field] if region_field in df.columns else [])
	            if by:
	                for k, sub in df.groupby(by):
	                    # Group keys are tuples or scalars depending on the pandas
	                    # version; label each chart so the groups can be told apart
	                    group_label = ", ".join(map(str, k)) if isinstance(k, tuple) else str(k)
	                    st.caption(group_label)
	                    st.line_chart(sub.set_index(date_field)[y_metric], height=240)
	            else:
	                st.line_chart(df.set_index(date_field)[y_metric], height=300)
	    else:
	        st.info("No date column selected. Choose one in Column Mapping to enable time series.")

	with tab_break:
	    if region_field in df.columns:
	        # The region key itself is not a measure: summing it per region is
	        # meaningless and collides with the grouping column
	        measure_options = [c for c in num_cols if c not in (date_field, region_field)]
	        if measure_options:
	            measure = st.selectbox("Measure", measure_options)
	            pivot = df.groupby(region_field, as_index=False)[measure].sum().sort_values(measure, ascending=False)
	            st.bar_chart(pivot.set_index(region_field)[measure], height=300)
	            st.dataframe(pivot.round(2), use_container_width=True)
	        else:
	            # An empty selectbox yields None, which would fail in the groupby
	            st.info("No numeric measure columns available for a breakdown.")
	    else:
	        st.info("No region column selected.")

	with tab_corr:
	    if len(num_cols) >= 2:
	        corr = df[num_cols].corr(numeric_only=True)
	        st.dataframe(corr.round(2), use_container_width=True) # enforce 2 decimals
	    else:
	        st.info("Not enough numeric columns to compute correlations.")

	# ===== Exploratory =====
	# Scatter plot of one numeric column against another plus their Pearson r.
	st.header("2) Exploratory Analytics")
	# Count the columns that are actually offered (the date column is excluded):
	# with fewer than two, the X selector would be empty and return None.
	y_opt = [c for c in num_cols if c != date_field]
	if len(y_opt) >= 2:
	    y = st.selectbox("Y (outcome)", y_opt, index=0)
	    x = st.selectbox("X (feature)", [c for c in y_opt if c != y], index=0)
	    st.scatter_chart(df[[x, y]], x=x, y=y, height=300)
	    try:
	        r = df[[x, y]].corr().iloc[0, 1]
	        st.caption(f"Pearson r ≈ {r:.2f}")
	    except Exception:
	        st.caption("Pearson r not available for the selected columns.")
	else:
	    st.info("Need at least two numeric columns.")

	# ===== Diagnostic and Predictive Modeling =====
	# Trains either a linear regression or a logistic regression on every
	# non-datetime column except the target and the date column, reports test
	# metrics and, for regression, stores the fitted pipeline for the What-If
	# section.
	st.header("3) Diagnostic and Predictive Modeling")

	# A regression model kept from an earlier run must not be reused when this
	# run does not train one (classification task, too few rows, failed split).
	for _stale_key in ("reg_pipe", "reg_features", "reg_baseline_row"):
	    st.session_state.pop(_stale_key, None)

	if task == "Regression":
	    if not reg_target:
	        st.warning("Pick a regression target in Column Mapping.")
	    elif reg_target not in df.columns:
	        st.warning(f"Regression target '{reg_target}' not found in data.")
	    else:
	        features = [c for c in df.columns if c not in [reg_target, date_field] and not pd.api.types.is_datetime64_any_dtype(df[c])]
	        features = [c for c in features if df[c].notna().any()] # remove all-NaN features
	        if len(df.dropna(subset=[reg_target])) < 50:
	            st.warning("Not enough rows for modeling after filtering.")
	        else:
	            pre = build_preprocessor(features, df, standardize)
	            model = LinearRegression()
	            pipe = Pipeline([("prep", pre), ("model", model)])

	            # Rows without a target value cannot be used for training or scoring
	            train_df = df.dropna(subset=[reg_target]).copy()
	            X, yv = train_df[features], train_df[reg_target]
	            try:
	                X_tr, X_te, y_tr, y_te = train_test_split(X, yv, test_size=test_size, random_state=42)
	            except ValueError:
	                st.warning("Could not split the data; try lowering the test size or checking the target.")
	            else:
	                pipe.fit(X_tr, y_tr)
	                pred = pipe.predict(X_te)
	                r2 = r2_score(y_te, pred)
	                mae = mean_absolute_error(y_te, pred)
	                rmse = rmse_compat(y_te, pred)

	                m1, m2, m3 = st.columns(3)
	                with m1: st.metric("R² (test)", f"{r2:0.2f}")
	                with m2: st.metric("MAE", f"{mae:,.2f}")
	                with m3: st.metric("RMSE", f"{rmse:,.2f}")

	                preds_df = pd.DataFrame({"y_true": y_te, "y_pred": pred}).reset_index(drop=True).round(2)
	                # NOTE(review): original indentation was lost; the download
	                # button is assumed to belong inside the expander - confirm.
	                with st.expander("Predictions vs. Actuals"):
	                    st.dataframe(preds_df, use_container_width=True)
	                    st.download_button("Download predictions (CSV)", data=preds_df.to_csv(index=False).encode("utf-8"),
	                                       file_name="regression_predictions.csv", mime="text/csv")

	                # ---- Coefficient tables ----
	                try:
	                    prep = pipe.named_steps["prep"]
	                    model_step = pipe.named_steps["model"]

	                    out_names = get_transformed_feature_names(prep)
	                    coefs = np.ravel(model_step.coef_)
	                    # align sizes if any mismatch
	                    n_aligned = min(len(out_names), len(coefs))
	                    coef_df = pd.DataFrame({
	                        "feature_out": out_names[:n_aligned],
	                        "coef": coefs[:n_aligned]
	                    })
	                    # Map back to original features
	                    orig_num = [c for c in features if pd.api.types.is_numeric_dtype(train_df[c])]
	                    orig_cat = [c for c in features if c not in orig_num]
	                    map_df = map_out_to_original(prep, coef_df["feature_out"].tolist(), orig_num, orig_cat)
	                    coef_df = coef_df.merge(map_df, on="feature_out", how="left")
	                    coef_df["abs_coef"] = coef_df["coef"].abs()

	                    st.subheader("Regression Coefficients (transformed features)")
	                    st.dataframe(coef_df[["feature_out", "original_feature", "coef"]]
	                                 .sort_values("coef", key=lambda s: s.abs(), ascending=False)
	                                 .round(2),
	                                 use_container_width=True)

	                    # Summary by original feature (aggregate OHE effects)
	                    summary = (coef_df.groupby("original_feature", dropna=False)
	                               .agg(total_abs_effect=("abs_coef", "sum"),
	                                    net_effect=("coef", "sum"),
	                                    components=("coef", "size"))
	                               .reset_index()
	                               .sort_values("total_abs_effect", ascending=False))
	                    st.subheader("Feature Importance Summary (by original feature)")
	                    # Plain pipes: a backslash before "|" is not a valid Python escape
	                    st.caption("total_abs_effect = sum of |coefficients| across all transformed components for that feature; net_effect = sum of coefficients.")
	                    st.dataframe(summary.round(2), use_container_width=True)
	                except Exception as e:
	                    st.info(f"Could not extract regression coefficients: {e}")

	                # Persist model & baseline for What-If
	                st.session_state["reg_pipe"] = pipe
	                st.session_state["reg_features"] = features

	                # Baseline scenario: mean of each numeric feature, most frequent
	                # level of every other feature
	                numX = [c for c in features if pd.api.types.is_numeric_dtype(train_df[c])]
	                catX = [c for c in features if c not in numX]
	                base_vals = {}
	                for c in numX:
	                    base_vals[c] = float(train_df[c].mean())
	                for c in catX:
	                    try:
	                        base_vals[c] = train_df[c].mode(dropna=True).iloc[0]
	                    except (IndexError, TypeError):
	                        # No mode available: fall back to the first non-null value
	                        base_vals[c] = train_df[c].dropna().iloc[0] if len(train_df[c].dropna()) else None
	                baseline_row = pd.DataFrame([base_vals], columns=features)
	                st.session_state["reg_baseline_row"] = baseline_row

	else: # Classification
	    if not clf_target:
	        st.warning("Pick a classification target (binary 0/1) in Column Mapping.")
	    elif clf_target not in df.columns:
	        st.warning(f"Classification target '{clf_target}' not found in data.")
	    elif not df[clf_target].dropna().isin([0,1]).all():
	        st.warning("Selected classification target is not binary 0/1.")
	    else:
	        features = [c for c in df.columns if c not in [clf_target, date_field] and not pd.api.types.is_datetime64_any_dtype(df[c])]
	        features = [c for c in features if df[c].notna().any()]
	        if len(df.dropna(subset=[clf_target])) < 50:
	            st.warning("Not enough rows for modeling after filtering.")
	        else:
	            pre = build_preprocessor(features, df, standardize)
	            model = LogisticRegression(max_iter=1000)
	            pipe = Pipeline([("prep", pre), ("model", model)])

	            train_df = df.dropna(subset=[clf_target]).copy()
	            X, yv = train_df[features], train_df[clf_target]
	            if yv.nunique() < 2:
	                # Logistic regression cannot be fitted on a single class, and
	                # predict_proba would then have no second column to read
	                st.warning("Could not split the data; ensure both classes are present after filtering.")
	            else:
	                try:
	                    # Stratify so both classes appear in the train and test parts
	                    X_tr, X_te, y_tr, y_te = train_test_split(X, yv, test_size=test_size, random_state=42, stratify=yv)
	                except ValueError:
	                    st.warning("Could not split the data; ensure both classes are present after filtering.")
	                else:
	                    pipe.fit(X_tr, y_tr)
	                    proba = pipe.predict_proba(X_te)[:, 1]
	                    pred = (proba >= 0.5).astype(int)
	                    acc = accuracy_score(y_te, pred)
	                    try:
	                        auc = roc_auc_score(y_te, proba) if len(np.unique(y_te)) == 2 else float("nan")
	                    except Exception:
	                        auc = float("nan")
	                    m1, m2 = st.columns(2)
	                    with m1: st.metric("Accuracy", f"{acc:0.2f}")
	                    with m2: st.metric("ROC AUC", f"{auc:0.2f}")

	                    scores_df = pd.DataFrame({"y_true": y_te, "p_hat": proba, "y_pred": pred}).reset_index(drop=True).round(2)
	                    # NOTE(review): original indentation was lost; the download
	                    # button is assumed to belong inside the expander - confirm.
	                    with st.expander("Scores (y_true, p_hat, y_pred)"):
	                        st.dataframe(scores_df, use_container_width=True)
	                        st.download_button("Download classification scores (CSV)",
	                                           data=scores_df.to_csv(index=False).encode("utf-8"),
	                                           file_name="classification_scores.csv", mime="text/csv")

	# ===== Explanatory / What-If =====
	st.header("4) Explanatory & What-If (Model-Driven)")

	# Fetch what the regression branch of the modeling section stored in the
	# session state; each lookup yields None when the key has not been set.
	pipe = st.session_state.get("reg_pipe")
	features = st.session_state.get("reg_features")
	baseline_row = st.session_state.get("reg_baseline_row")

	def _local_slope(pipe, base_df, feat, h):
	"""Finite-difference slope dŷ/dx at baseline for one feature."""
	b = base_df.copy()
	y0 = float(pipe.predict(b)[0])
	s = b.copy()
	try:
	s[feat] = s[feat] + h
	except Exception:
	return np.nan
	y1 = float(pipe.predict(s)[0])
	return (y1 - y0) / h if h != 0 else np.nan

	# What-If UI: one control per chosen feature (slider for numeric, selectbox
	# otherwise), then the model prediction for the baseline and for the scenario.
	if pipe is not None and features is not None and baseline_row is not None:
	    st.caption("Pick any features (from the trained model) to modify and see the predicted impact.")
	    selectable = features
	    default_choices = [c for c in selectable if c in ["spend_search","discount_pct","site_errors"]][:3]
	    chosen = st.multiselect("What-If features", selectable, default=default_choices)

	    # Determine which chosen features actually influence the model
	    active_info = []
	    try:
	        prep = pipe.named_steps["prep"]
	        for name, trans, cols in prep.transformers_:
	            if name == "remainder":
	                continue
	            active_info.extend(list(cols))
	    except Exception:
	        # Preprocessor cannot be inspected: treat every feature as active
	        active_info = features

	    inactive = [f for f in chosen if f not in active_info]
	    if inactive:
	        st.warning(f"These selected features don’t influence the model (dropped or constant after preprocessing): {', '.join(inactive)}")

	    scenario_row = baseline_row.copy()

	    # UI + local slope per numeric feature
	    for feat in chosen:
	        if feat in inactive:
	            continue
	        if pd.api.types.is_numeric_dtype(baseline_row[feat]):
	            default_val = float(baseline_row[feat].iloc[0])
	            if not np.isfinite(default_val):
	                # A slider cannot start at NaN; leave the feature at its baseline
	                st.caption(f"{feat}: no baseline value available, left unchanged.")
	                continue

	            series = df[feat].dropna() if feat in df.columns else pd.Series([baseline_row[feat].iloc[0]])
	            mean = float(series.mean()) if len(series) else default_val
	            std = float(series.std()) if len(series) else 0.0
	            if not np.isfinite(std):
	                # std of a single observation is NaN
	                std = 0.0

	            if 0.0 <= mean <= 1.0 and (len(series)==0 or (series.min() >= 0.0 and series.max() <= 1.0)):
	                # Share/probability-like feature: fixed 0..1 range
	                lo, hi = 0.0, 1.0
	            else:
	                lo = round(mean - 2 * std, 2)
	                hi = round(mean + 2 * std, 2)
	                if lo == hi:
	                    hi = lo + 1.0
	            # The baseline comes from the training rows while the range comes
	            # from the current data; widen the range so the default is always
	            # a legal slider value
	            lo, hi = min(lo, default_val), max(hi, default_val)
	            step = 0.01 if (lo == 0.0 and hi == 1.0) else max(0.01, (hi - lo) / 100.0)

	            h = 1.0 if std == 0 else max(0.1*std, 1e-3)
	            try:
	                slope = _local_slope(pipe, baseline_row, feat, h)
	            except Exception:
	                # The slope is informational only; never block the controls
	                slope = np.nan

	            col1, col2 = st.columns([3,1])
	            with col1:
	                val = st.slider(f"{feat}", lo, hi, default_val, step=step)
	            with col2:
	                st.markdown(f"Δŷ/Δ{feat} ≈ {0.0 if pd.isna(slope) else float(slope):.2f}")
	            scenario_row[feat] = val
	        else:
	            # key=str keeps sorting well-defined when a column mixes value types
	            raw_opts = df[feat].dropna().unique().tolist() if feat in df.columns else baseline_row[feat].dropna().unique().tolist()
	            opts = sorted(raw_opts, key=str)
	            default = baseline_row[feat].iloc[0] if feat in baseline_row.columns else (opts[0] if opts else None)
	            idx = (opts.index(default) if (opts and default in opts) else 0) if opts else 0
	            val = st.selectbox(f"{feat}", opts if opts else [default], index=idx)
	            scenario_row[feat] = val

	    try:
	        base_pred = float(pipe.predict(baseline_row)[0])
	        new_pred = float(pipe.predict(scenario_row)[0])
	        delta = new_pred - base_pred
	        c1, c2 = st.columns(2)
	        with c1: st.metric("Baseline (model prediction)", f"{base_pred:,.2f}")
	        with c2: st.metric("What-If (model prediction)", f"{new_pred:,.2f}", delta=f"{delta:,.2f}")
	    except Exception as e:
	        st.warning(f"Could not compute predictions for scenario: {e}")
	else:
	    st.info("Train the regression model in Section 3 to enable a model-driven What-If.")