# Intro to Business Analytics - With Python. Module 1 import os import io import csv import numpy as np import pandas as pd import streamlit as st from typing import List, Tuple from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler, OneHotEncoder from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline from sklearn.impute import SimpleImputer from sklearn.linear_model import LinearRegression, LogisticRegression from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, accuracy_score, roc_auc_score # ---------- Helpers ---------- def rmse_compat(y_true, y_pred): """RMSE compatible with old/new scikit-learn.""" try: return float(mean_squared_error(y_true, y_pred, squared=False)) except TypeError: return float(np.sqrt(mean_squared_error(y_true, y_pred))) @st.cache_data(show_spinner=False) def load_demo_csv(): """Load built-in demo; prefer realistic if present.""" for path in ["/mnt/data/sample_data_realistic.csv", "/mnt/data/sample_data.csv"]: if os.path.exists(path): try: return pd.read_csv(path, parse_dates=["week"]) except Exception: try: return pd.read_csv(path) except Exception: pass return pd.DataFrame() def load_uploaded_csv_or_excel(up, desired_parse_cols): """ Robustly read uploaded file as CSV (auto-sep, encoding) or Excel. Only convert to datetime for columns that actually exist. """ name = (up.name or "").lower() # Excel first if name.endswith((".xlsx", ".xls")): up.seek(0) df = pd.read_excel(up) for c in (desired_parse_cols or []): if c in df.columns: df[c] = pd.to_datetime(df[c], errors="coerce") return df # CSV: sniff delimiter + encoding up.seek(0) head_bytes = up.read(8192) try: head_text = head_bytes.decode("utf-8") encoding_used = "utf-8" except UnicodeDecodeError: head_text = head_bytes.decode("latin-1", errors="replace") encoding_used = "latin-1" try: dialect = csv.Sniffer().sniff(head_text, delimiters=",;\t|") sep = dialect.delimiter except Exception: sep = "," # Read once without parse_dates up.seek(0) try: df = pd.read_csv(up, sep=sep, encoding=encoding_used) except pd.errors.EmptyDataError: st.error("The uploaded file appears to be empty.") return pd.DataFrame() except Exception: # last resort: python engine auto-sep up.seek(0) try: df = pd.read_csv(up, sep=None, engine="python", encoding=encoding_used) except Exception: st.error("Could not read the uploaded file as CSV or Excel. It might use an unusual delimiter/encoding, or be corrupted.") st.caption("First 256 bytes (for debugging):") st.code(head_text[:256]) return pd.DataFrame() # Convert only present date cols for c in (desired_parse_cols or []): if c in df.columns: df[c] = pd.to_datetime(df[c], errors="coerce") return df def infer_types(df: pd.DataFrame): num = df.select_dtypes(include=[np.number]).columns.tolist() dt = df.select_dtypes(include=["datetime64[ns]", "datetime64[ns, UTC]"]).columns.tolist() cat = [c for c in df.columns if c not in num + dt] return num, cat, dt def winsorize_series(s: pd.Series, lower_q: float, upper_q: float) -> pd.Series: lo, hi = s.quantile(lower_q), s.quantile(upper_q) return s.clip(lower=lo, upper=hi) def build_preprocessor(feature_names, df_context, standardize: bool): """Imputation + (optional) scaling + OHE; drops all-NaN columns.""" feature_names = [c for c in feature_names if df_context[c].notna().any()] numX = [c for c in feature_names if pd.api.types.is_numeric_dtype(df_context[c])] catX = [c for c in feature_names if c not in numX] # OneHotEncoder API compatibility try: ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=True) except TypeError: ohe = OneHotEncoder(handle_unknown="ignore", sparse=True) transformers = [] if numX: from sklearn.pipeline import Pipeline as _P num_pipe = _P([ ("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler(with_mean=False) if standardize else "passthrough"), ]) transformers.append(("num", num_pipe, numX)) if catX: from sklearn.pipeline import Pipeline as _P cat_pipe = _P([ ("imputer", SimpleImputer(strategy="most_frequent")), ("ohe", ohe), ]) transformers.append(("cat", cat_pipe, catX)) return ColumnTransformer(transformers=transformers, remainder="drop") # ---------- Page ---------- st.set_page_config(page_title="Intro to Business Analytics - Python Demo", page_icon="📈", layout="wide") st.title("📈 Intro to Business Analytics - Python Demo") with st.expander("👋 Getting started / What this app does", expanded=True): st.markdown( "- **Choose data**: built-in demo or upload CSV/Excel.\n" "- **Map columns**: select date/region and targets.\n" "- **Explore**: time series, breakdowns, correlations.\n" "- **Model**: regression or classification with preprocessing.\n" "- **What-If**: tweak *any* model features and see predicted impact.\n" "- **Downloads**: preview, predictions, scores, scenarios." ) # ===== Sidebar: Data source ===== st.sidebar.header("Controls") data_mode = st.sidebar.radio("Data Source", ["Demo dataset", "Upload file"]) if data_mode == "Upload file": up = st.sidebar.file_uploader("Upload CSV or Excel", type=["csv", "xlsx", "xls"]) date_cols_text = st.sidebar.text_input("Date column(s) to parse (comma-separated)", "week") if up is not None: parse = [c.strip() for c in date_cols_text.split(",") if c.strip()] df_raw = load_uploaded_csv_or_excel(up, parse) if df_raw.empty: st.warning("The uploaded file produced an empty table after parsing. Check delimiter/encoding, or try a different file.") else: st.sidebar.info("Using demo until a file is uploaded.") df_raw = load_demo_csv() else: # DEMO branch ALWAYS defines df_raw df_raw = load_demo_csv() # ===== Sidebar: Column mapping ===== with st.sidebar.expander("Column Mapping", True): all_cols = df_raw.columns.tolist() # date column default_date_idx = (all_cols.index("week")+1) if "week" in all_cols else 0 date_col = st.selectbox("Date column (optional)", [""] + all_cols, index=default_date_idx) date_field = None if date_col == "" else date_col # region column default_region_idx = (all_cols.index("region")+1) if "region" in all_cols else 0 region_col = st.selectbox("Region column (optional)", [""] + all_cols, index=default_region_idx) region_field = None if region_col == "" else region_col # targets numeric_cols_all = df_raw.select_dtypes(include=[np.number]).columns.tolist() binary_cols = [c for c in numeric_cols_all if df_raw[c].dropna().isin([0,1]).all()] reg_default_idx = (numeric_cols_all.index("revenue_next_week")+1) if "revenue_next_week" in numeric_cols_all else 0 reg_target = st.selectbox("Regression target", [""] + numeric_cols_all, index=reg_default_idx) reg_target = None if reg_target == "" else reg_target clf_default_idx = (binary_cols.index("churn_4w")+1) if "churn_4w" in binary_cols else 0 clf_target = st.selectbox("Classification target (binary 0/1)", [""] + binary_cols, index=clf_default_idx) clf_target = None if clf_target == "" else clf_target # ===== Sidebar: Prep, Filters, Modeling ===== with st.sidebar.expander("Data Prep", True): lower_q = st.slider("Outlier floor (winsorize)", 0.0, 0.2, 0.01, step=0.01) upper_q = st.slider("Outlier cap (winsorize)", 0.8, 1.0, 0.99, step=0.01) smoothing = st.slider("Smoothing window (periods)", 1, 8, 1) agg_grain = st.selectbox("Aggregation granularity", ["none (row)", "date"]) with st.sidebar.expander("Filters", False): if date_field and pd.api.types.is_datetime64_any_dtype(df_raw.get(date_field, pd.Series([], dtype="datetime64[ns]"))): min_d, max_d = df_raw[date_field].min(), df_raw[date_field].max() try: date_range = st.date_input("Date range", (min_d.date(), max_d.date())) except Exception: date_range = None else: date_range = None if region_field: region_vals = sorted(df_raw[region_field].dropna().unique().tolist()) regions_sel = st.multiselect("Region filter", region_vals, default=region_vals if region_vals else []) else: regions_sel = [] with st.sidebar.expander("Modeling", False): task = st.selectbox("Task", ["Regression", "Classification"]) test_size = st.slider("Test size", 0.1, 0.5, 0.2, step=0.05) standardize = st.checkbox("Standardize numeric features", True) # ===== Data overview & prep ===== df = df_raw.copy() # date filter if date_field and date_range: try: start_d, end_d = pd.to_datetime(date_range[0]), pd.to_datetime(date_range[1]) df = df[(df[date_field] >= start_d) & (df[date_field] <= end_d)] except Exception: pass # region filter if region_field and regions_sel: df = df[df[region_field].isin(regions_sel)] # types & winsorize num_cols, cat_cols, dt_cols = infer_types(df) for c in num_cols: try: df[c] = winsorize_series(df[c], lower_q, upper_q) except Exception: pass # aggregation if agg_grain == "date" and (date_field in df.columns): group_cols = [date_field] + ([region_field] if region_field in df.columns else []) agg = {c: ("mean" if ("rate" in c or "pct" in c) else "sum") for c in num_cols if c != date_field} if agg: df = df.groupby(group_cols, as_index=False).agg(agg) # smoothing if smoothing > 1 and (date_field in df.columns): df = df.sort_values(date_field).reset_index(drop=True) for c in num_cols: if c != date_field: try: df[c] = df[c].rolling(smoothing, min_periods=1).mean() except Exception: pass # Overview st.subheader("📁 Data Overview") c1, c2, c3, c4 = st.columns(4) with c1: st.metric("Rows", f"{len(df):,}") with c2: st.metric("Columns", f"{len(df.columns):,}") with c3: st.metric("Numeric", f"{len(num_cols):,}") with c4: st.metric("Categorical", f"{len(cat_cols):,}") with st.expander("Preview dataframe"): st.dataframe(df.head(50).round(2), use_container_width=True) csv_preview = df.head(1000).round(2).to_csv(index=False).encode("utf-8") st.download_button("Download preview (CSV)", data=csv_preview, file_name="preview.csv", mime="text/csv") # ===== Descriptive ===== st.header("1) Descriptive Analytics") tab_ts, tab_break, tab_corr = st.tabs(["Time Series","Breakdowns","Correlations"]) with tab_ts: if date_field in df.columns: y_opt = [c for c in num_cols if c != date_field] if y_opt: y_metric = st.selectbox("Metric", y_opt, index=min(1, len(y_opt)-1)) by = st.multiselect("Stratify by", [region_field] if region_field in df.columns else []) if by: for k, sub in df.groupby(by): st.line_chart(sub.set_index(date_field)[y_metric], height=240) else: st.line_chart(df.set_index(date_field)[y_metric], height=300) else: st.info("No date column selected. Choose one in Column Mapping to enable time series.") with tab_break: if region_field in df.columns: measure = st.selectbox("Measure", [c for c in num_cols if c != date_field] or num_cols) pivot = df.groupby(region_field, as_index=False)[measure].sum().sort_values(measure, ascending=False) st.bar_chart(pivot.set_index(region_field)[measure], height=300) st.dataframe(pivot.round(2), use_container_width=True) else: st.info("No region column selected.") with tab_corr: if len(num_cols) >= 2: corr = df[num_cols].corr(numeric_only=True) st.dataframe(corr.round(2), use_container_width=True) # enforce 2 decimals else: st.info("Not enough numeric columns to compute correlations.") # ===== Inferential ===== st.header("2) Inferential Analytics") if len(num_cols) >= 2: y_opt = [c for c in num_cols if c != date_field] if y_opt: y = st.selectbox("Y (outcome)", y_opt, index=0) x = st.selectbox("X (feature)", [c for c in y_opt if c != y], index=0) st.scatter_chart(df[[x, y]], x=x, y=y, height=300) try: r = df[[x, y]].corr().iloc[0, 1] st.caption(f"Pearson r ≈ {r:.2f}") except Exception: st.caption("Pearson r not available for the selected columns.") else: st.info("Need at least two numeric columns.") # ===== Predictive Modeling ===== st.header("3) Predictive Modeling") # Regression if task == "Regression": if not reg_target: st.warning("Pick a regression target in Column Mapping.") elif reg_target not in df.columns: st.warning(f"Regression target '{reg_target}' not found in data.") else: features = [c for c in df.columns if c not in [reg_target, date_field] and not pd.api.types.is_datetime64_any_dtype(df[c])] features = [c for c in features if df[c].notna().any()] # remove all-NaN features if len(df.dropna(subset=[reg_target])) < 50: st.warning("Not enough rows for modeling after filtering.") else: pre = build_preprocessor(features, df, standardize) model = LinearRegression() pipe = Pipeline([("prep", pre), ("model", model)]) train_df = df.dropna(subset=[reg_target]).copy() X, yv = train_df[features], train_df[reg_target] try: X_tr, X_te, y_tr, y_te = train_test_split(X, yv, test_size=test_size, random_state=42) except ValueError: st.warning("Could not split the data; try lowering the test size or checking the target.") else: pipe.fit(X_tr, y_tr) pred = pipe.predict(X_te) r2 = r2_score(y_te, pred) mae = mean_absolute_error(y_te, pred) rmse = rmse_compat(y_te, pred) m1, m2, m3 = st.columns(3) with m1: st.metric("R² (test)", f"{r2:0.2f}") with m2: st.metric("MAE", f"{mae:,.2f}") with m3: st.metric("RMSE", f"{rmse:,.2f}") preds_df = pd.DataFrame({"y_true": y_te, "y_pred": pred}).reset_index(drop=True).round(2) with st.expander("Predictions vs. Actuals"): st.dataframe(preds_df, use_container_width=True) st.download_button("Download predictions (CSV)", data=preds_df.to_csv(index=False).encode("utf-8"), file_name="regression_predictions.csv", mime="text/csv") # Persist model & baseline for What-If st.session_state["reg_pipe"] = pipe st.session_state["reg_features"] = features numX = [c for c in features if pd.api.types.is_numeric_dtype(train_df[c])] catX = [c for c in features if c not in numX] base_vals = {} for c in numX: base_vals[c] = float(train_df[c].mean()) for c in catX: try: base_vals[c] = train_df[c].mode(dropna=True).iloc[0] except Exception: base_vals[c] = train_df[c].dropna().iloc[0] if len(train_df[c].dropna()) else None baseline_row = pd.DataFrame([base_vals], columns=features) st.session_state["reg_baseline_row"] = baseline_row # Classification else: if not clf_target: st.warning("Pick a classification target (binary 0/1) in Column Mapping.") elif clf_target not in df.columns: st.warning(f"Classification target '{clf_target}' not found in data.") elif not df[clf_target].dropna().isin([0,1]).all(): st.warning("Selected classification target is not binary 0/1.") else: features = [c for c in df.columns if c not in [clf_target, date_field] and not pd.api.types.is_datetime64_any_dtype(df[c])] features = [c for c in features if df[c].notna().any()] if len(df.dropna(subset=[clf_target])) < 50: st.warning("Not enough rows for modeling after filtering.") else: pre = build_preprocessor(features, df, standardize) model = LogisticRegression(max_iter=1000) pipe = Pipeline([("prep", pre), ("model", model)]) train_df = df.dropna(subset=[clf_target]).copy() X, yv = train_df[features], train_df[clf_target] strat = yv if yv.nunique() == 2 else None try: X_tr, X_te, y_tr, y_te = train_test_split(X, yv, test_size=test_size, random_state=42, stratify=strat) except ValueError: st.warning("Could not split the data; ensure both classes are present after filtering.") else: pipe.fit(X_tr, y_tr) proba = pipe.predict_proba(X_te)[:, 1] pred = (proba >= 0.5).astype(int) acc = accuracy_score(y_te, pred) try: auc = roc_auc_score(y_te, proba) if len(np.unique(y_te)) == 2 else float("nan") except Exception: auc = float("nan") m1, m2 = st.columns(2) with m1: st.metric("Accuracy", f"{acc:0.2f}") with m2: st.metric("ROC AUC", f"{auc:0.2f}") scores_df = pd.DataFrame({"y_true": y_te, "p_hat": proba, "y_pred": pred}).reset_index(drop=True).round(2) with st.expander("Scores (y_true, p_hat, y_pred)"): st.dataframe(scores_df, use_container_width=True) st.download_button("Download classification scores (CSV)", data=scores_df.to_csv(index=False).encode("utf-8"), file_name="classification_scores.csv", mime="text/csv") # ===== Explanatory / What-If ===== st.header("4) Explanatory & What-If (Model-Driven)") pipe = st.session_state.get("reg_pipe") features = st.session_state.get("reg_features") baseline_row = st.session_state.get("reg_baseline_row") if pipe is not None and features is not None and baseline_row is not None: st.caption("Pick any features (from the trained model) to modify and see the predicted impact.") selectable = features default_choices = [c for c in selectable if c in ["spend_search","discount_pct","site_errors"]][:3] chosen = st.multiselect("What-If features", selectable, default=default_choices) scenario_row = baseline_row.copy() for feat in chosen: if pd.api.types.is_numeric_dtype(baseline_row[feat]): series = df[feat].dropna() if feat in df.columns else pd.Series([baseline_row[feat].iloc[0]]) mean = float(series.mean()) if len(series) else float(baseline_row[feat].iloc[0]) std = float(series.std()) if len(series) else 0.0 if 0.0 <= mean <= 1.0 and (len(series)==0 or (series.min() >= 0.0 and series.max() <= 1.0)): val = st.slider(f"{feat} (0–1)", 0.0, 1.0, float(baseline_row[feat].iloc[0]), step=0.01) else: lo = round(mean - 2*std, 2); hi = round(mean + 2*std, 2) if lo == hi: hi = lo + 1.0 step = max(0.01, (hi-lo)/100.0) val = st.slider(f"{feat} (absolute)", lo, hi, float(baseline_row[feat].iloc[0]), step=step) scenario_row[feat] = val else: opts = sorted(df[feat].dropna().unique().tolist()) if feat in df.columns else sorted(baseline_row[feat].dropna().unique().tolist()) default = baseline_row[feat].iloc[0] if feat in baseline_row.columns else (opts[0] if opts else None) idx = (opts.index(default) if (opts and default in opts) else 0) if opts else 0 val = st.selectbox(f"{feat} (category)", opts if opts else [default], index=idx) scenario_row[feat] = val try: base_pred = float(pipe.predict(baseline_row)[0]) new_pred = float(pipe.predict(scenario_row)[0]) delta = new_pred - base_pred c1, c2 = st.columns(2) with c1: st.metric("Baseline (model prediction)", f"{base_pred:,.2f}") with c2: st.metric("What-If (model prediction)", f"{new_pred:,.2f}", delta=f"{delta:,.2f}") except Exception as e: st.warning(f"Could not compute predictions for scenario: {e}") else: st.info("Train the regression model in Section 3 to enable a model-driven What-If.")