Spaces:
Sleeping
Sleeping
| # app2.py | |
| # Intro to Business Analytics - With Python. Module 1 (Upload-Only, Coeff Tables + What-If) | |
| import os | |
| import io | |
| import csv | |
| import re | |
| import numpy as np | |
| import pandas as pd | |
| import streamlit as st | |
| from typing import List, Tuple | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.preprocessing import StandardScaler, OneHotEncoder | |
| from sklearn.compose import ColumnTransformer | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.impute import SimpleImputer | |
| from sklearn.linear_model import LinearRegression, LogisticRegression | |
| from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, accuracy_score, roc_auc_score | |
| # ---------- Helpers ---------- | |
| def rmse_compat(y_true, y_pred): | |
| """RMSE compatible with old/new scikit-learn.""" | |
| try: | |
| return float(mean_squared_error(y_true, y_pred, squared=False)) | |
| except TypeError: | |
| return float(np.sqrt(mean_squared_error(y_true, y_pred))) | |
| def load_uploaded_csv_or_excel(up, desired_parse_cols): | |
| """ | |
| Robustly read uploaded file as CSV (auto-sep, encoding) or Excel. | |
| Only convert to datetime for columns that actually exist. | |
| """ | |
| name = (up.name or "").lower() | |
| # Excel first | |
| if name.endswith((".xlsx", ".xls")): | |
| up.seek(0) | |
| df = pd.read_excel(up) | |
| for c in (desired_parse_cols or []): | |
| if c in df.columns: | |
| df[c] = pd.to_datetime(df[c], errors="coerce") | |
| return df | |
| # CSV: sniff delimiter + encoding | |
| up.seek(0) | |
| head_bytes = up.read(8192) | |
| try: | |
| head_text = head_bytes.decode("utf-8") | |
| encoding_used = "utf-8" | |
| except UnicodeDecodeError: | |
| head_text = head_bytes.decode("latin-1", errors="replace") | |
| encoding_used = "latin-1" | |
| try: | |
| dialect = csv.Sniffer().sniff(head_text, delimiters=",;\t|") | |
| sep = dialect.delimiter | |
| except Exception: | |
| sep = "," | |
| # Read once without parse_dates | |
| up.seek(0) | |
| try: | |
| df = pd.read_csv(up, sep=sep, encoding=encoding_used) | |
| except pd.errors.EmptyDataError: | |
| st.error("The uploaded file appears to be empty.") | |
| return pd.DataFrame() | |
| except Exception: | |
| # last resort: python engine auto-sep | |
| up.seek(0) | |
| try: | |
| df = pd.read_csv(up, sep=None, engine="python", encoding=encoding_used) | |
| except Exception: | |
| st.error("Could not read the uploaded file as CSV or Excel. It might use an unusual delimiter/encoding, or be corrupted.") | |
| st.caption("First 256 bytes (for debugging):") | |
| st.code(head_text[:256]) | |
| return pd.DataFrame() | |
| # Convert only present date cols | |
| for c in (desired_parse_cols or []): | |
| if c in df.columns: | |
| df[c] = pd.to_datetime(df[c], errors="coerce") | |
| return df | |
| def infer_types(df: pd.DataFrame): | |
| num = df.select_dtypes(include=[np.number]).columns.tolist() | |
| dt = df.select_dtypes(include=["datetime64[ns]", "datetime64[ns, UTC]"]).columns.tolist() | |
| cat = [c for c in df.columns if c not in num + dt] | |
| return num, cat, dt | |
| def winsorize_series(s: pd.Series, lower_q: float, upper_q: float) -> pd.Series: | |
| lo, hi = s.quantile(lower_q), s.quantile(upper_q) | |
| return s.clip(lower=lo, upper=hi) | |
| def build_preprocessor(feature_names, df_context, standardize: bool): | |
| """ | |
| Imputation + (optional) scaling + OHE(drop='first'); drops all-NaN columns. | |
| """ | |
| feature_names = [c for c in feature_names if df_context[c].notna().any()] | |
| numX = [c for c in feature_names if pd.api.types.is_numeric_dtype(df_context[c])] | |
| catX = [c for c in feature_names if c not in numX] | |
| # OneHotEncoder API compatibility + drop reference for interpretability | |
| try: | |
| ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True, drop="first") | |
| except TypeError: | |
| ohe = OneHotEncoder(handle_unknown="ignore", sparse=True, drop="first") | |
| transformers = [] | |
| if numX: | |
| from sklearn.pipeline import Pipeline as _P | |
| num_pipe = _P([ | |
| ("imputer", SimpleImputer(strategy="median")), | |
| ("scaler", StandardScaler(with_mean=False) if standardize else "passthrough"), | |
| ]) | |
| transformers.append(("num", num_pipe, numX)) | |
| if catX: | |
| from sklearn.pipeline import Pipeline as _P | |
| cat_pipe = _P([ | |
| ("imputer", SimpleImputer(strategy="most_frequent")), | |
| ("ohe", ohe), | |
| ]) | |
| transformers.append(("cat", cat_pipe, catX)) | |
| return ColumnTransformer(transformers=transformers, remainder="drop") | |
| def get_transformed_feature_names(prep: ColumnTransformer) -> list: | |
| """ | |
| Try to recover output feature names after preprocessing. | |
| Works for numeric (scaled) and categorical (OHE). | |
| """ | |
| names_out = [] | |
| for name, trans, cols in prep.transformers_: | |
| if name == "remainder": | |
| continue | |
| if hasattr(trans, "named_steps"): | |
| # find the last step that exposes get_feature_names_out | |
| last = None | |
| for step_name in reversed(trans.named_steps): | |
| cand = trans.named_steps[step_name] | |
| if hasattr(cand, "get_feature_names_out"): | |
| last = cand | |
| break | |
| if last is not None: | |
| try: | |
| out = last.get_feature_names_out(cols) | |
| names_out.extend(list(out)) | |
| continue | |
| except Exception: | |
| pass | |
| # fallback to original cols | |
| names_out.extend(list(cols)) | |
| else: | |
| if hasattr(trans, "get_feature_names_out"): | |
| try: | |
| out = trans.get_feature_names_out(cols) | |
| names_out.extend(list(out)) | |
| continue | |
| except Exception: | |
| pass | |
| names_out.extend(list(cols)) | |
| return list(map(str, names_out)) | |
| def map_out_to_original(prep: ColumnTransformer, out_names: List[str], orig_num: List[str], orig_cat: List[str]) -> pd.DataFrame: | |
| """ | |
| Build a mapping from transformed feature names back to their original column. | |
| For numeric: 1:1. For OHE: original col inferred by prefix match '<col>_'. | |
| """ | |
| rows = [] | |
| orig_set = set(orig_num + orig_cat) | |
| # Gather columns each transformer handled to mark actives | |
| handled_cols = set() | |
| for name, trans, cols in prep.transformers_: | |
| if name == "remainder": | |
| continue | |
| handled_cols.update(cols) | |
| for out in out_names: | |
| base = None | |
| # numeric: 1:1 if name equals original | |
| if out in orig_set: | |
| base = out | |
| else: | |
| # categorical one-hot: try prefix match '<col>_' | |
| for c in orig_cat: | |
| # exact prefix 'col_' or 'col=' (older versions) | |
| if out.startswith(f"{c}_") or out.startswith(f"{c}="): | |
| base = c | |
| break | |
| if base is None: | |
| # fallback: try to match by longest original col that is a prefix | |
| candidates = [c for c in orig_set if out.startswith(f"{c}_") or out.startswith(c)] | |
| base = max(candidates, key=len) if candidates else out | |
| rows.append({"feature_out": out, "original_feature": base, "is_active": (base in handled_cols)}) | |
| return pd.DataFrame(rows) | |
| # ---------- Page ---------- | |
| st.set_page_config(page_title="Intro to Business Analytics - Python Demo", page_icon="📈", layout="wide") | |
| st.title("📈 Intro to Business Analytics - Python Demo") | |
| with st.expander("👋 Getting started / What this app does", expanded=True): | |
| st.markdown( | |
| "- **Upload your data** (CSV or Excel).\n" | |
| "- **Map columns**: select date/region and targets.\n" | |
| "- **Explore**: time series, breakdowns, correlations.\n" | |
| "- **Model**: regression or classification with preprocessing (imputation/scaling/OHE).\n" | |
| "- **What-If**: tweak *any* model features and see predicted impact.\n" | |
| "- **Downloads**: preview, predictions, scores." | |
| ) | |
| # ===== Upload (mandatory) ===== | |
| up = st.file_uploader("Upload CSV or Excel", type=["csv", "xlsx", "xls"]) | |
| date_cols_text = st.text_input("Date column(s) to parse (comma-separated)", "") | |
| if up is None: | |
| st.warning("Please upload a CSV or Excel file to proceed.") | |
| st.stop() | |
| parse = [c.strip() for c in date_cols_text.split(",") if c.strip()] | |
| df_raw = load_uploaded_csv_or_excel(up, parse) | |
| # Clean empty rows/cols early | |
| df_raw = df_raw.dropna(axis=0, how="all") | |
| df_raw = df_raw.dropna(axis=1, how="all") | |
| if df_raw.empty: | |
| st.error("No data after parsing/cleanup. Check the file or delimiter/encoding.") | |
| st.stop() | |
| # ===== Column mapping ===== | |
| with st.sidebar.expander("Column Mapping", True): | |
| all_cols = df_raw.columns.tolist() | |
| # date column (optional) | |
| default_date_idx = (all_cols.index("week")+1) if "week" in all_cols else 0 | |
| date_col = st.selectbox("Date column (optional)", ["<none>"] + all_cols, index=default_date_idx) | |
| date_field = None if date_col == "<none>" else date_col | |
| # region column (optional) | |
| default_region_idx = (all_cols.index("region")+1) if "region" in all_cols else 0 | |
| region_col = st.selectbox("Region column (optional)", ["<none>"] + all_cols, index=default_region_idx) | |
| region_field = None if region_col == "<none>" else region_col | |
| # targets | |
| numeric_cols_all = df_raw.select_dtypes(include=[np.number]).columns.tolist() | |
| binary_cols = [c for c in numeric_cols_all if df_raw[c].dropna().isin([0,1]).all()] | |
| reg_default_idx = (numeric_cols_all.index("revenue_next_week")+1) if "revenue_next_week" in numeric_cols_all else 0 | |
| reg_target = st.selectbox("Regression target", ["<none>"] + numeric_cols_all, index=reg_default_idx) | |
| reg_target = None if reg_target == "<none>" else reg_target | |
| clf_default_idx = (binary_cols.index("churn_4w")+1) if "churn_4w" in binary_cols else 0 | |
| clf_target = st.selectbox("Classification target (binary 0/1)", ["<none>"] + binary_cols, index=clf_default_idx) | |
| clf_target = None if clf_target == "<none>" else clf_target | |
| # ===== Prep, Filters, Modeling (sidebar) ===== | |
| with st.sidebar.expander("Data Prep", True): | |
| lower_q = st.slider("Outlier floor (winsorize)", 0.0, 0.2, 0.01, step=0.01) | |
| upper_q = st.slider("Outlier cap (winsorize)", 0.8, 1.0, 0.99, step=0.01) | |
| smoothing = st.slider("Smoothing window (periods)", 1, 8, 1) | |
| agg_grain = st.selectbox("Aggregation granularity", ["none (row)", "date"]) | |
| with st.sidebar.expander("Filters", False): | |
| if date_field and pd.api.types.is_datetime64_any_dtype(df_raw.get(date_field, pd.Series([], dtype="datetime64[ns]"))): | |
| min_d, max_d = df_raw[date_field].min(), df_raw[date_field].max() | |
| try: | |
| date_range = st.date_input("Date range", (min_d.date(), max_d.date())) | |
| except Exception: | |
| date_range = None | |
| else: | |
| date_range = None | |
| if region_field: | |
| region_vals = sorted(df_raw[region_field].dropna().unique().tolist()) | |
| regions_sel = st.multiselect("Region filter", region_vals, default=region_vals if region_vals else []) | |
| else: | |
| regions_sel = [] | |
| with st.sidebar.expander("Modeling", False): | |
| task = st.selectbox("Task", ["Regression", "Classification"]) | |
| test_size = st.slider("Test size", 0.1, 0.5, 0.2, step=0.05) | |
| standardize = st.checkbox("Standardize numeric features", True) | |
| # ===== Data overview & prep ===== | |
| df = df_raw.copy() | |
| # date filter | |
| if date_field and 'date_range' in locals() and date_range: | |
| try: | |
| start_d, end_d = pd.to_datetime(date_range[0]), pd.to_datetime(date_range[1]) | |
| df = df[(df[date_field] >= start_d) & (df[date_field] <= end_d)] | |
| except Exception: | |
| pass | |
| # region filter | |
| if region_field and regions_sel: | |
| df = df[df[region_field].isin(regions_sel)] | |
| # types & winsorize | |
| num_cols, cat_cols, dt_cols = infer_types(df) | |
| for c in num_cols: | |
| try: | |
| df[c] = winsorize_series(df[c], lower_q, upper_q) | |
| except Exception: | |
| pass | |
| # aggregation | |
| if agg_grain == "date" and (date_field in df.columns): | |
| group_cols = [date_field] + ([region_field] if region_field in df.columns else []) | |
| agg = {c: ("mean" if ("rate" in c or "pct" in c) else "sum") for c in num_cols if c != date_field} | |
| if agg: | |
| df = df.groupby(group_cols, as_index=False).agg(agg) | |
| # smoothing | |
| if smoothing > 1 and (date_field in df.columns): | |
| df = df.sort_values(date_field).reset_index(drop=True) | |
| for c in num_cols: | |
| if c != date_field: | |
| try: | |
| df[c] = df[c].rolling(smoothing, min_periods=1).mean() | |
| except Exception: | |
| pass | |
| # Overview | |
| st.subheader("📁 Data Overview") | |
| c1, c2, c3, c4 = st.columns(4) | |
| with c1: st.metric("Rows", f"{len(df):,}") | |
| with c2: st.metric("Columns", f"{len(df.columns):,}") | |
| with c3: st.metric("Numeric", f"{len(num_cols):,}") | |
| with c4: st.metric("Categorical", f"{len(cat_cols):,}") | |
| with st.expander("Preview dataframe"): | |
| st.dataframe(df.head(50).round(2), use_container_width=True) | |
| csv_preview = df.head(1000).round(2).to_csv(index=False).encode("utf-8") | |
| st.download_button("Download preview (CSV)", data=csv_preview, file_name="preview.csv", mime="text/csv") | |
| # ===== Descriptive ===== | |
| st.header("1) Descriptive Analytics") | |
| tab_ts, tab_break, tab_corr = st.tabs(["Time Series","Breakdowns","Correlations"]) | |
| with tab_ts: | |
| if date_field in df.columns: | |
| y_opt = [c for c in num_cols if c != date_field] | |
| if y_opt: | |
| y_metric = st.selectbox("Metric", y_opt, index=min(1, len(y_opt)-1)) | |
| by = st.multiselect("Stratify by", [region_field] if region_field in df.columns else []) | |
| if by: | |
| for k, sub in df.groupby(by): | |
| st.line_chart(sub.set_index(date_field)[y_metric], height=240) | |
| else: | |
| st.line_chart(df.set_index(date_field)[y_metric], height=300) | |
| else: | |
| st.info("No date column selected. Choose one in Column Mapping to enable time series.") | |
| with tab_break: | |
| if region_field in df.columns: | |
| measure = st.selectbox("Measure", [c for c in num_cols if c != date_field] or num_cols) | |
| pivot = df.groupby(region_field, as_index=False)[measure].sum().sort_values(measure, ascending=False) | |
| st.bar_chart(pivot.set_index(region_field)[measure], height=300) | |
| st.dataframe(pivot.round(2), use_container_width=True) | |
| else: | |
| st.info("No region column selected.") | |
| with tab_corr: | |
| if len(num_cols) >= 2: | |
| corr = df[num_cols].corr(numeric_only=True) | |
| st.dataframe(corr.round(2), use_container_width=True) # enforce 2 decimals | |
| else: | |
| st.info("Not enough numeric columns to compute correlations.") | |
| # ===== Exploratory ===== | |
| st.header("2) Exploratory Analytics") | |
| if len(num_cols) >= 2: | |
| y_opt = [c for c in num_cols if c != date_field] | |
| if y_opt: | |
| y = st.selectbox("Y (outcome)", y_opt, index=0) | |
| x = st.selectbox("X (feature)", [c for c in y_opt if c != y], index=0) | |
| st.scatter_chart(df[[x, y]], x=x, y=y, height=300) | |
| try: | |
| r = df[[x, y]].corr().iloc[0, 1] | |
| st.caption(f"Pearson r ≈ {r:.2f}") | |
| except Exception: | |
| st.caption("Pearson r not available for the selected columns.") | |
| else: | |
| st.info("Need at least two numeric columns.") | |
| # ===== Diagnostic and Predictive Modeling ===== | |
| st.header("3) Diagnostic and Predictive Modeling") | |
| if task == "Regression": | |
| if not reg_target: | |
| st.warning("Pick a regression target in Column Mapping.") | |
| elif reg_target not in df.columns: | |
| st.warning(f"Regression target '{reg_target}' not found in data.") | |
| else: | |
| features = [c for c in df.columns if c not in [reg_target, date_field] and not pd.api.types.is_datetime64_any_dtype(df[c])] | |
| features = [c for c in features if df[c].notna().any()] # remove all-NaN features | |
| if len(df.dropna(subset=[reg_target])) < 50: | |
| st.warning("Not enough rows for modeling after filtering.") | |
| else: | |
| pre = build_preprocessor(features, df, standardize) | |
| model = LinearRegression() | |
| pipe = Pipeline([("prep", pre), ("model", model)]) | |
| train_df = df.dropna(subset=[reg_target]).copy() | |
| X, yv = train_df[features], train_df[reg_target] | |
| try: | |
| X_tr, X_te, y_tr, y_te = train_test_split(X, yv, test_size=test_size, random_state=42) | |
| except ValueError: | |
| st.warning("Could not split the data; try lowering the test size or checking the target.") | |
| else: | |
| pipe.fit(X_tr, y_tr) | |
| pred = pipe.predict(X_te) | |
| r2 = r2_score(y_te, pred) | |
| mae = mean_absolute_error(y_te, pred) | |
| rmse = rmse_compat(y_te, pred) | |
| m1, m2, m3 = st.columns(3) | |
| with m1: st.metric("R² (test)", f"{r2:0.2f}") | |
| with m2: st.metric("MAE", f"{mae:,.2f}") | |
| with m3: st.metric("RMSE", f"{rmse:,.2f}") | |
| preds_df = pd.DataFrame({"y_true": y_te, "y_pred": pred}).reset_index(drop=True).round(2) | |
| with st.expander("Predictions vs. Actuals"): | |
| st.dataframe(preds_df, use_container_width=True) | |
| st.download_button("Download predictions (CSV)", data=preds_df.to_csv(index=False).encode("utf-8"), | |
| file_name="regression_predictions.csv", mime="text/csv") | |
| # ---- Coefficient tables ---- | |
| try: | |
| prep = pipe.named_steps["prep"] | |
| model_step = pipe.named_steps["model"] | |
| out_names = get_transformed_feature_names(prep) | |
| coefs = np.ravel(model_step.coef_) | |
| # align sizes if any mismatch | |
| L = min(len(out_names), len(coefs)) | |
| coef_df = pd.DataFrame({ | |
| "feature_out": out_names[:L], | |
| "coef": coefs[:L] | |
| }) | |
| # Map back to original features | |
| orig_num = [c for c in features if pd.api.types.is_numeric_dtype(train_df[c])] | |
| orig_cat = [c for c in features if c not in orig_num] | |
| map_df = map_out_to_original(prep, coef_df["feature_out"].tolist(), orig_num, orig_cat) | |
| coef_df = coef_df.merge(map_df, on="feature_out", how="left") | |
| coef_df["abs_coef"] = coef_df["coef"].abs() | |
| st.subheader("Regression Coefficients (transformed features)") | |
| st.dataframe(coef_df[["feature_out", "original_feature", "coef"]] | |
| .sort_values("coef", key=lambda s: s.abs(), ascending=False) | |
| .round(2), | |
| use_container_width=True) | |
| # Summary by original feature (aggregate OHE effects) | |
| summary = (coef_df.groupby("original_feature", dropna=False) | |
| .agg(total_abs_effect=("abs_coef", "sum"), | |
| net_effect=("coef", "sum"), | |
| components=("coef", "size")) | |
| .reset_index() | |
| .sort_values("total_abs_effect", ascending=False)) | |
| st.subheader("Feature Importance Summary (by original feature)") | |
| st.caption("total_abs_effect = sum of |coefficients| across all transformed components for that feature; net_effect = sum of coefficients.") | |
| st.dataframe(summary.round(2), use_container_width=True) | |
| except Exception as e: | |
| st.info(f"Could not extract regression coefficients: {e}") | |
| # Persist model & baseline for What-If | |
| st.session_state["reg_pipe"] = pipe | |
| st.session_state["reg_features"] = features | |
| numX = [c for c in features if pd.api.types.is_numeric_dtype(train_df[c])] | |
| catX = [c for c in features if c not in numX] | |
| base_vals = {} | |
| for c in numX: | |
| base_vals[c] = float(train_df[c].mean()) | |
| for c in catX: | |
| try: | |
| base_vals[c] = train_df[c].mode(dropna=True).iloc[0] | |
| except Exception: | |
| base_vals[c] = train_df[c].dropna().iloc[0] if len(train_df[c].dropna()) else None | |
| baseline_row = pd.DataFrame([base_vals], columns=features) | |
| st.session_state["reg_baseline_row"] = baseline_row | |
| else: # Classification | |
| if not clf_target: | |
| st.warning("Pick a classification target (binary 0/1) in Column Mapping.") | |
| elif clf_target not in df.columns: | |
| st.warning(f"Classification target '{clf_target}' not found in data.") | |
| elif not df[clf_target].dropna().isin([0,1]).all(): | |
| st.warning("Selected classification target is not binary 0/1.") | |
| else: | |
| features = [c for c in df.columns if c not in [clf_target, date_field] and not pd.api.types.is_datetime64_any_dtype(df[c])] | |
| features = [c for c in features if df[c].notna().any()] | |
| if len(df.dropna(subset=[clf_target])) < 50: | |
| st.warning("Not enough rows for modeling after filtering.") | |
| else: | |
| pre = build_preprocessor(features, df, standardize) | |
| model = LogisticRegression(max_iter=1000) | |
| pipe = Pipeline([("prep", pre), ("model", model)]) | |
| train_df = df.dropna(subset=[clf_target]).copy() | |
| X, yv = train_df[features], train_df[clf_target] | |
| strat = yv if yv.nunique() == 2 else None | |
| try: | |
| X_tr, X_te, y_tr, y_te = train_test_split(X, yv, test_size=test_size, random_state=42, stratify=strat) | |
| except ValueError: | |
| st.warning("Could not split the data; ensure both classes are present after filtering.") | |
| else: | |
| pipe.fit(X_tr, y_tr) | |
| proba = pipe.predict_proba(X_te)[:, 1] | |
| pred = (proba >= 0.5).astype(int) | |
| acc = accuracy_score(y_te, pred) | |
| try: | |
| auc = roc_auc_score(y_te, proba) if len(np.unique(y_te)) == 2 else float("nan") | |
| except Exception: | |
| auc = float("nan") | |
| m1, m2 = st.columns(2) | |
| with m1: st.metric("Accuracy", f"{acc:0.2f}") | |
| with m2: st.metric("ROC AUC", f"{auc:0.2f}") | |
| scores_df = pd.DataFrame({"y_true": y_te, "p_hat": proba, "y_pred": pred}).reset_index(drop=True).round(2) | |
| with st.expander("Scores (y_true, p_hat, y_pred)"): | |
| st.dataframe(scores_df, use_container_width=True) | |
| st.download_button("Download classification scores (CSV)", | |
| data=scores_df.to_csv(index=False).encode("utf-8"), | |
| file_name="classification_scores.csv", mime="text/csv") | |
| # ===== Explanatory / What-If ===== | |
| st.header("4) Explanatory & What-If (Model-Driven)") | |
| pipe = st.session_state.get("reg_pipe") | |
| features = st.session_state.get("reg_features") | |
| baseline_row = st.session_state.get("reg_baseline_row") | |
| def _local_slope(pipe, base_df, feat, h): | |
| """Finite-difference slope dŷ/dx at baseline for one feature.""" | |
| b = base_df.copy() | |
| y0 = float(pipe.predict(b)[0]) | |
| s = b.copy() | |
| try: | |
| s[feat] = s[feat] + h | |
| except Exception: | |
| return np.nan | |
| y1 = float(pipe.predict(s)[0]) | |
| return (y1 - y0) / h if h != 0 else np.nan | |
| if pipe is not None and features is not None and baseline_row is not None: | |
| st.caption("Pick any features (from the trained model) to modify and see the predicted impact.") | |
| selectable = features | |
| default_choices = [c for c in selectable if c in ["spend_search","discount_pct","site_errors"]][:3] | |
| chosen = st.multiselect("What-If features", selectable, default=default_choices) | |
| # Determine which chosen features actually influence the model | |
| active_info = [] | |
| try: | |
| prep = pipe.named_steps["prep"] | |
| for name, trans, cols in prep.transformers_: | |
| if name == "remainder": | |
| continue | |
| active_info.extend(list(cols)) | |
| except Exception: | |
| active_info = features | |
| inactive = [f for f in chosen if f not in active_info] | |
| if inactive: | |
| st.warning(f"These selected features don’t influence the model (dropped or constant after preprocessing): {', '.join(inactive)}") | |
| scenario_row = baseline_row.copy() | |
| # UI + local slope per numeric feature | |
| for feat in chosen: | |
| if feat in inactive: | |
| continue | |
| if pd.api.types.is_numeric_dtype(baseline_row[feat]): | |
| series = df[feat].dropna() if feat in df.columns else pd.Series([baseline_row[feat].iloc[0]]) | |
| mean = float(series.mean()) if len(series) else float(baseline_row[feat].iloc[0]) | |
| std = float(series.std()) if len(series) else 0.0 | |
| if 0.0 <= mean <= 1.0 and (len(series)==0 or (series.min() >= 0.0 and series.max() <= 1.0)): | |
| lo, hi, default_val, step = 0.0, 1.0, float(baseline_row[feat].iloc[0]), 0.01 | |
| else: | |
| lo = round(mean - 2*std, 2); hi = round(mean + 2*std, 2) | |
| if lo == hi: hi = lo + 1.0 | |
| default_val = float(baseline_row[feat].iloc[0]) | |
| step = max(0.01, (hi - lo) / 100.0) | |
| h = 1.0 if std == 0 else max(0.1*std, 1e-3) | |
| slope = _local_slope(pipe, baseline_row, feat, h) | |
| col1, col2 = st.columns([3,1]) | |
| with col1: | |
| val = st.slider(f"{feat}", lo, hi, default_val, step=step) | |
| with col2: | |
| st.markdown(f"**Δŷ/Δ{feat} ≈ {0.0 if pd.isna(slope) else float(slope):.2f}**") | |
| scenario_row[feat] = val | |
| else: | |
| opts = sorted(df[feat].dropna().unique().tolist()) if feat in df.columns else sorted(baseline_row[feat].dropna().unique().tolist()) | |
| default = baseline_row[feat].iloc[0] if feat in baseline_row.columns else (opts[0] if opts else None) | |
| idx = (opts.index(default) if (opts and default in opts) else 0) if opts else 0 | |
| val = st.selectbox(f"{feat}", opts if opts else [default], index=idx) | |
| scenario_row[feat] = val | |
| try: | |
| base_pred = float(pipe.predict(baseline_row)[0]) | |
| new_pred = float(pipe.predict(scenario_row)[0]) | |
| delta = new_pred - base_pred | |
| c1, c2 = st.columns(2) | |
| with c1: st.metric("Baseline (model prediction)", f"{base_pred:,.2f}") | |
| with c2: st.metric("What-If (model prediction)", f"{new_pred:,.2f}", delta=f"{delta:,.2f}") | |
| except Exception as e: | |
| st.warning(f"Could not compute predictions for scenario: {e}") | |
| else: | |
| st.info("Train the regression model in Section 3 to enable a model-driven What-If.") | |