atividade1_nova

Sleeping

App Files Community

ricardoadriano committed on Oct 28, 2025

Commit

3331850

verified ·

1 Parent(s): d3051a3

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +295 -291

src/streamlit_app.py CHANGED Viewed

@@ -1,6 +1,11 @@
-# App: Análise de Reclamações de Consumidores
-# Dataset esperado: Dados/marketing_campaign.csv  (Kaggle - Customer Personality Analysis)
 import os
 import numpy as np
 import pandas as pd
@@ -9,67 +14,105 @@ import altair as alt
 from typing import List, Tuple
-# Sci-kit / stats
 from sklearn.model_selection import train_test_split
 from sklearn.compose import ColumnTransformer
 from sklearn.preprocessing import OneHotEncoder, StandardScaler
 from sklearn.pipeline import Pipeline
 from sklearn.impute import SimpleImputer
-from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, RocCurveDisplay
-from sklearn.inspection import permutation_importance
-from sklearn.linear_model import LogisticRegression
-from sklearn.ensemble import RandomForestClassifier
 import statsmodels.api as sm
-# -----------------------------
-# Configurações gerais
-# -----------------------------
-st.set_page_config(
-    page_title="Reclamações de Consumidores — PPCA/UnB",
-    layout="wide",
-)
-st.title("Reclamações de Consumidores — Predição & Explicação")
-st.caption("Com base no conjunto **Customer Personality Analysis** (marketing_campaign.csv)")
 DATA_PATH = "Dados/marketing_campaign.csv"
 # -----------------------------
-# Utilitários
 # -----------------------------
 @st.cache_data(show_spinner=False)
-def load_data(path: str) -> pd.DataFrame:
-    df = pd.read_csv(path, sep=",", encoding="utf-8")
-    # Alguns CSVs deste dataset vêm com separador ';'. Se falhar, tenta novamente:
-    if df.shape[1] == 1:
-        df = pd.read_csv(path, sep=";", encoding="utf-8")
-    return df
-def infer_target_column(df: pd.DataFrame) -> str:
-    # No dataset da Kaggle, a variável é "Complain" (0/1).
-    # Se não existir, tenta nomes comuns.
-    candidates = ["Complain", "complain", "Complaint", "has_complaint", "has_complain"]
-    for c in candidates:
-        if c in df.columns:
-            return c
-    # fallback: se não achou, cria guiagem
-    return None
-def split_features(df: pd.DataFrame, y_col: str) -> Tuple[List[str], List[str]]:
-    cat_cols = [c for c in df.columns if (df[c].dtype == "object" or df[c].dtype.name == "category") and c != y_col]
-    num_cols = [c for c in df.columns if (np.issubdtype(df[c].dtype, np.number)) and c != y_col]
     return num_cols, cat_cols
 def build_preprocessor(num_cols: List[str], cat_cols: List[str]) -> ColumnTransformer:
     num_pipe = Pipeline([
-        ("imputer", SimpleImputer(strategy="median")),
         ("scaler", StandardScaler())
     ])
     cat_pipe = Pipeline([
-        ("imputer", SimpleImputer(strategy="most_frequent")),
-        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
     ])
     pre = ColumnTransformer([
         ("num", num_pipe, num_cols),
@@ -77,303 +120,264 @@ def build_preprocessor(num_cols: List[str], cat_cols: List[str]) -> ColumnTransf
     ])
     return pre
-def get_model(name: str):
-    if name == "Regressão Logística":
-        return LogisticRegression(max_iter=200, n_jobs=None)  # n_jobs só no liblinear/saga; usamos solver padrão (lbfgs)
-    elif name == "Random Forest":
-        return RandomForestClassifier(
-            n_estimators=300,
-            max_depth=None,
-            random_state=42,
-            n_jobs=-1
-        )
-    else:
-        raise ValueError("Modelo desconhecido.")
-def coefficient_table_for_logit(statsmodels_result, feature_names):
-    # Retorna DataFrame com OR e IC 95%
-    params = statsmodels_result.params
-    conf = statsmodels_result.conf_int(alpha=0.05)
-    df_coef = pd.DataFrame({
-        "feature": ["Intercept"] + feature_names,
-        "coef": params.values
-    })
-    conf = pd.DataFrame(conf.values, columns=["ci_low", "ci_high"])
-    df_coef["ci_low"] = conf["ci_low"].values
-    df_coef["ci_high"] = conf["ci_high"].values
-    # Odds ratios
-    df_coef["odds_ratio"] = np.exp(df_coef["coef"])
-    df_coef["or_ci_low"] = np.exp(df_coef["ci_low"])
-    df_coef["or_ci_high"] = np.exp(df_coef["ci_high"])
-    return df_coef
-def make_recommendations(imp_df: pd.DataFrame, top_k: int = 5) -> List[str]:
     """
-    Gera recomendações de alto nível com base nas variáveis mais importantes.
-    imp_df precisa ter colunas: feature, importance, sign (para regressão logística; senão assume neutro).
     """
-    recs = []
-    # Pega top_k
-    core = imp_df.sort_values("importance", ascending=False).head(top_k)
-    for _, row in core.iterrows():
-        feat = row["feature"]
-        sign = row.get("sign", 0)
-        if sign > 0:
-            recs.append(
-                f"🔧 **Reduzir a exposição associada a `{feat}`**, pois aumento nessa variável eleva a probabilidade de reclamação. "
-                f"Considere políticas específicas (p.ex., comunicação proativa, revisão de políticas de entrega/atendimento, "
-                f"ou benefícios segmentados para o grupo impactado por `{feat}`)."
-            )
-        elif sign < 0:
-            recs.append(
-                f" **Ampliar ações relacionadas a `{feat}`**, já que maior valor nessa variável tende a reduzir reclamações. "
-                f"Ex.: expandir programas de fidelidade ou incentivos que reforcem o comportamento ligado a `{feat}`."
-            )
         else:
-            recs.append(
-                f" **Monitorar `{feat}`** de perto: é importante, ainda que a direção do efeito varie entre segmentos. "
-                f"Teste intervenções com experimentos A/B e avalie impacto nas métricas de reclamação."
-            )
-    # Recomendações genéricas de processo:
-    recs.append(" **Implantar testes A/B** para validar intervenções nas variáveis-chave e medir impacto em taxa de reclamação.")
-    recs.append(" **Aprimorar o 1º contato (FCR)**: reduzir transferência/recontato; scripts e treinamentos focados nas causas top-1/2.")
-    recs.append(" **Feedback loop**: alimentar o time de Produto/Qualidade com causas de reclamação mais preditivas para correções upstream.")
-    return recs
 # -----------------------------
-# Carregamento
 # -----------------------------
 with st.sidebar:
-    st.header("Dados")
-    st.write("Esperado: `Dados/marketing_campaign.csv`")
     if not os.path.exists(DATA_PATH):
-        st.error(f"Arquivo não encontrado em `{DATA_PATH}`. Suba o CSV na pasta `Dados/` do Space.")
-    else:
-        st.success("Arquivo encontrado")
-try:
-    df_raw = load_data(DATA_PATH)
-except Exception as e:
-    st.stop()
-target_col = infer_target_column(df_raw)
-if target_col is None:
-    st.error("Não encontrei a coluna alvo (ex.: `Complain`). Confirme o nome no CSV.")
-    st.dataframe(df_raw.head())
-    st.stop()
-# -----------------------------
-# Sidebar — Configuração
-# -----------------------------
 with st.sidebar:
-    st.header("Configuração do Modelo")
-    st.caption("**Item (c)** — Definição & Seleção de Modelos")
-    model_name = st.selectbox("Modelo preditivo", ["Regressão Logística", "Random Forest"], index=0)
-    # Seleção de variáveis explicativas
-    st.subheader("Variáveis explicativas")
-    num_cols, cat_cols = split_features(df_raw, target_col)
-    all_feats = num_cols + cat_cols
-    default_feats = [c for c in all_feats if c != target_col]
-    selected_feats = st.multiselect(
-        "Selecione as variáveis de entrada",
-        options=default_feats,
-        default=default_feats[: min(12, len(default_feats))]
-    )
     test_size = st.slider("Proporção de teste", 0.1, 0.4, 0.2, 0.05)
     random_state = st.number_input("Random seed", value=42, step=1)
-st.markdown("### Visão geral dos dados")
-st.write(f"Linhas: **{df_raw.shape[0]}**, Colunas: **{df_raw.shape[1]}**")
-st.dataframe(df_raw[[c for c in [target_col] + selected_feats if c in df_raw.columns]].head(10))
 # -----------------------------
-# Preparação
 # -----------------------------
-df = df_raw.dropna(subset=[target_col]).copy()
-y = df[target_col].astype(int)
-X = df[selected_feats].copy()
-# Tipos
-sel_num = [c for c in selected_feats if c in X.columns and np.issubdtype(X[c].dtype, np.number)]
-sel_cat = [c for c in selected_feats if c in X.columns and (X[c].dtype == "object" or X[c].dtype.name == "category")]
-pre = build_preprocessor(sel_num, sel_cat)
-model = get_model(model_name)
-pipe = Pipeline([
-    ("pre", pre),
-    ("clf", model)
-])
 X_train, X_test, y_train, y_test = train_test_split(
-    X, y, test_size=test_size, random_state=random_state, stratify=y
 )
 # -----------------------------
-# Treinamento
 # -----------------------------
-with st.spinner("Treinando modelo..."):
-    pipe.fit(X_train, y_train)
-# Predições e métricas
-proba_test = pipe.predict_proba(X_test)[:, 1]
-pred_test = (proba_test >= 0.5).astype(int)
-auc = roc_auc_score(y_test, proba_test)
-acc = accuracy_score(y_test, pred_test)
-cm = confusion_matrix(y_test, pred_test)
-met1, met2 = st.columns(2)
-with met1:
-    st.metric("AUC (ROC)", f"{auc:.3f}")
-with met2:
-    st.metric("Acurácia", f"{acc:.3f}")
-st.markdown("#### Matriz de confusão")
-cm_df = pd.DataFrame(cm, index=["Real 0", "Real 1"], columns=["Pred 0", "Pred 1"])
-st.dataframe(cm_df)
-# Curva ROC (usando altair simples)
-roc_points = []
-fpr_list = np.linspace(0, 1, 101)
-# Calcular TPR para limiares aproximados
-from sklearn.metrics import roc_curve
-fpr, tpr, thr = roc_curve(y_test, proba_test)
-roc_data = pd.DataFrame({"fpr": fpr, "tpr": tpr})
-roc_chart = alt.Chart(roc_data).mark_line().encode(x="fpr:Q", y="tpr:Q").properties(
-    height=250, width=380
-)
-diag = alt.Chart(pd.DataFrame({"x":[0,1],"y":[0,1]})).mark_line(strokeDash=[4,4]).encode(x="x", y="y")
-st.altair_chart(roc_chart + diag, use_container_width=True)
 # -----------------------------
-# Importância das variáveis
 # -----------------------------
-st.markdown("### Importância das variáveis")
-with st.spinner("Calculando importância (permutation importance)..."):
-    perm = permutation_importance(pipe, X_test, y_test, n_repeats=10, random_state=42, scoring="roc_auc")
-    # Nomear features após o preprocessamento:
-    # Recupera nomes one-hot para categoricas
-    ohe = pipe.named_steps["pre"].named_transformers_.get("cat")
-    ohe_feat_names = []
-    if ohe is not None and hasattr(ohe, "named_steps"):
-        onehot = ohe.named_steps["onehot"]
-        if hasattr(onehot, "get_feature_names_out"):
-            ohe_feat_names = list(onehot.get_feature_names_out(sel_cat))
-    # Nomes finais
-    feat_names = sel_num + ohe_feat_names
-    imp_df = pd.DataFrame({
-        "feature": feat_names,
-        "importance": perm.importances_mean[:len(feat_names)]
-    }).sort_values("importance", ascending=False)
-# Para regressão logística, calcular sinal aproximado por coeficientes
-sign_map = {}
-if model_name == "Regressão Logística":
-    # Reconstruir coeficientes no espaço expandido:
-    # Ajusta novamente em X_train pretransformado para extrair coef
-    X_train_proc = pipe.named_steps["pre"].fit_transform(X_train)
-    clf = LogisticRegression(max_iter=200)
-    clf.fit(X_train_proc, y_train)
-    coefs = clf.coef_.ravel()
-    # Alinha tamanho; pode haver diferença por features descartadas
-    k = min(len(coefs), len(feat_names))
-    for i in range(k):
-        sign_map[feat_names[i]] = np.sign(coefs[i])
-imp_df["sign"] = imp_df["feature"].map(lambda f: sign_map.get(f, 0))
-st.dataframe(imp_df.head(15))
-# Chart
-bar = alt.Chart(imp_df.head(20)).mark_bar().encode(
-    x=alt.X("importance:Q", title="Perm. importance (AUC)"),
-    y=alt.Y("feature:N", sort='-x', title="Feature"),
-    color=alt.value("#3165d4")
-).properties(height=450)
-st.altair_chart(bar, use_container_width=True)
 # -----------------------------
 # Predição interativa
 # -----------------------------
-st.markdown("## Predição interativa (probabilidade de reclamação)")
-st.caption("Ajuste os valores no painel e veja a probabilidade prevista pelo modelo.")
-# Constrói um dicionário de entradas
 with st.form("pred_form"):
     cols = st.columns(3)
-    inputs = {}
-    for idx, col in enumerate(selected_feats):
-        col_container = cols[idx % 3]
-        with col_container:
             if col in sel_num:
-                # Usa faixa baseada nos quantis do treino
-                q1, q5, q95, q99 = X_train[col].quantile([0.01, 0.05, 0.95, 0.99])
-                val = st.number_input(
-                    f"{col}",
-                    value=float(np.nan_to_num(X_train[col].median(), nan=0.0)),
                     help=f"Faixa típica ~ {q5:.2f}–{q95:.2f} (1–99%: {q1:.2f}–{q99:.2f})"
                 )
-                inputs[col] = val
             else:
-                opts = sorted([str(x) for x in X_train[col].dropna().unique().tolist()])[:30]
-                default = opts[0] if opts else ""
-                val = st.selectbox(f"{col}", options=opts if opts else [""], index=0 if opts else 0)
-                inputs[col] = val
-    submitted = st.form_submit_button("Calcular probabilidade")
 if submitted:
-    x_new = pd.DataFrame([inputs])
-    prob = pipe.predict_proba(x_new)[0, 1]
-    st.success(f"Probabilidade de registrar reclamação (Complain=1): **{prob:.2%}**")
-# -----------------------------
-# Inferência estatística (Logística)
-# -----------------------------
-st.markdown("## Inferência estatística (para mitigação)")
-st.caption("Quando o modelo selecionado é Regressão Logística, mostramos *odds ratios* com IC 95% (explicabilidade estatística).")
-if model_name == "Regressão Logística":
-    try:
-        # Reconstruir design matrix com OHE + padronização (para statsmodels, manter padronização ajuda numérica)
-        pre_fit = pipe.named_steps["pre"].fit(X_train, y_train)
-        X_train_proc = pre_fit.transform(X_train)
-        feature_names = sel_num + (
-            list(pre_fit.named_transformers_["cat"].named_steps["onehot"].get_feature_names_out(sel_cat))
-            if sel_cat else []
-        )
-        X_sm = sm.add_constant(pd.DataFrame(X_train_proc, columns=feature_names))
-        y_sm = y_train.values
-        logit = sm.Logit(y_sm, X_sm).fit(disp=False)
-        or_table = coefficient_table_for_logit(logit, feature_names)
-        st.dataframe(or_table[["feature", "odds_ratio", "or_ci_low", "or_ci_high"]].round(3))
-        st.info(
-            "Interpretação: valores de *odds ratio* > 1 aumentam a chance de reclamação; "
-            "< 1 reduzem. Use os IC para priorizar intervenções mais robustas."
-        )
-    except Exception as e:
-        st.warning(f"Não foi possível calcular os intervalos de confiança: {e}")
-# -----------------------------
-# Item (c): Definição & Seleção de Modelos
-# -----------------------------
-st.markdown("## Item (c) — Definição & Seleção de Modelos")
-st.write("""
-**Regressão Logística** foi escolhida por sua interpretabilidade (odds ratios) e por modelar diretamente a probabilidade de `Complain=1`.
-Em paralelo, **Random Forest** foi incluída como baseline não linear robusto a interações e efeitos não lineares. A escolha final pode ser
-guiada por **AUC/ROC**, **acurácia** e capacidade de **explicação** necessária ao negócio. Para variáveis mistas (numéricas/categóricas),
-aplicamos *imputação*, *padronização* (numéricas) e *one-hot* (categóricas) para garantir comparabilidade e estabilidade do treinamento.
-""")
 # -----------------------------
-# Item (e): Recomendações estratégicas
 # -----------------------------
-st.markdown("## Item (e) — Recomendações para a Tomada de Decisão")
-recs = make_recommendations(imp_df, top_k=5)
-for r in recs:
     st.markdown("- " + r)
 st.markdown("---")
-st.caption("PPCA/UnB • Tarefa 6 — Modelos Supervisionados • App em Streamlit para Hugging Face Spaces")

+# -*- coding: utf-8 -*-
+# -------------------------------------------------------------------
+# App: Inferência Estatística — Customer Personality Analysis
+# Dataset: Dados/marketing_campaign.csv
+# Objetivo: permitir escolher ALVO e PREDITORES e produzir INFERÊNCIA (item e),
+#           usando Logit (alvo binário) ou OLS (alvo contínuo).
+#           Inclui engenharia de atributos inspirada no seu notebook.
+# -------------------------------------------------------------------
 import os
 import numpy as np
 import pandas as pd
 from typing import List, Tuple
+# Pré-processamento e métricas (apoio)
 from sklearn.model_selection import train_test_split
 from sklearn.compose import ColumnTransformer
 from sklearn.preprocessing import OneHotEncoder, StandardScaler
 from sklearn.pipeline import Pipeline
 from sklearn.impute import SimpleImputer
+from sklearn.metrics import (
+    roc_auc_score, accuracy_score, confusion_matrix, roc_curve,
+    r2_score, mean_squared_error
+)
+# Modelinhos rápidos para métricas (não usados na inferência)
+from sklearn.linear_model import LogisticRegression, LinearRegression
+# Inferência estatística
 import statsmodels.api as sm
+st.set_page_config(page_title="Inferência Estatística — Marketing Campaign", layout="wide")
+st.title("📊 Inferência Estatística — Customer Personality Analysis")
+st.caption("Escolha o **alvo** e as **variáveis explicativas** na barra lateral (esquerda) e obtenha inferência (item **e**).")
 DATA_PATH = "Dados/marketing_campaign.csv"
 # -----------------------------
+# Utilidades
 # -----------------------------
 @st.cache_data(show_spinner=False)
+def load_csv_try(path: str) -> pd.DataFrame:
+    """
+    Lê CSV tentando separadores: vírgula, ponto-e-vírgula e tab.
+    """
+    for sep in [",", ";", "\t"]:
+        try:
+            df = pd.read_csv(path, sep=sep, encoding="utf-8")
+            # Heurística: se vier 1 coluna gigantesca, tenta o próximo separador
+            if sep != "\t" and df.shape[1] == 1:
+                continue
+            return df
+        except Exception:
+            continue
+    # última tentativa bruta
+    return pd.read_csv(path, sep=None, engine="python")
+def split_num_cat(df: pd.DataFrame, exclude: List[str]) -> Tuple[List[str], List[str]]:
+    num_cols = [c for c in df.columns if c not in exclude and np.issubdtype(df[c].dtype, np.number)]
+    cat_cols = [c for c in df.columns if c not in exclude and (df[c].dtype == "object" or df[c].dtype.name == "category")]
     return num_cols, cat_cols
+def is_binary_series(s: pd.Series) -> bool:
+    vals = pd.unique(s.dropna())
+    if len(vals) == 2:
+        return True
+    if s.dtype == bool:
+        return True
+    return False
+def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
+    """Engenharia mínima inspirada no seu notebook: tenure, totais, shares, ticket, diversidade."""
+    out = df.copy()
+    # Datas → Tenure
+    if "Dt_Customer" in out.columns:
+        out["Dt_Customer"] = pd.to_datetime(out["Dt_Customer"], errors="coerce", dayfirst=True)
+        out["TenureDays"] = (pd.Timestamp("today").normalize() - out["Dt_Customer"]).dt.days
+    # Mnt* somados
+    mnt_cols = [c for c in out.columns if c.startswith("Mnt")]
+    if mnt_cols:
+        out["TotalMnt"] = out[mnt_cols].sum(axis=1)
+    # Compras totais e shares
+    buy_cols = [c for c in ["NumWebPurchases", "NumCatalogPurchases", "NumStorePurchases"] if c in out.columns]
+    if buy_cols:
+        out["TotalPurchases"] = out[buy_cols].sum(axis=1)
+        if "NumWebPurchases" in out.columns:
+            out["OnlineShare"] = out["NumWebPurchases"] / out["TotalPurchases"].replace(0, np.nan)
+        if "NumDealsPurchases" in out.columns:
+            out["PromoShare"] = out["NumDealsPurchases"] / out["TotalPurchases"].replace(0, np.nan)
+    # Ticket médio
+    if "TotalMnt" in out.columns and "TotalPurchases" in out.columns:
+        out["AvgTicket"] = out["TotalMnt"] / out["TotalPurchases"].replace(0, np.nan)
+    # Diversidade de cesta (quantos Mnt*>0)
+    if mnt_cols:
+        out["BasketDiversity"] = (out[mnt_cols] > 0).sum(axis=1)
+    return out
 def build_preprocessor(num_cols: List[str], cat_cols: List[str]) -> ColumnTransformer:
+    """
+    Imputação + padronização (num) e OHE drop='first' (cat) para evitar colinearidade e viabilizar inferência.
+    """
     num_pipe = Pipeline([
+        ("imp", SimpleImputer(strategy="median")),
         ("scaler", StandardScaler())
     ])
     cat_pipe = Pipeline([
+        ("imp", SimpleImputer(strategy="most_frequent")),
+        ("ohe", OneHotEncoder(handle_unknown="ignore", drop="first", sparse_output=False))
     ])
     pre = ColumnTransformer([
         ("num", num_pipe, num_cols),
     ])
     return pre
+def get_feature_names(pre: ColumnTransformer, num_cols: List[str], cat_cols: List[str]) -> List[str]:
+    names = list(num_cols)
+    if cat_cols:
+        ohe = pre.named_transformers_["cat"].named_steps["ohe"]
+        names.extend(list(ohe.get_feature_names_out(cat_cols)))
+    return names
+def fit_inference(model_type: str, X_design: pd.DataFrame, y: pd.Series):
     """
+    Ajusta modelo de inferência (statsmodels):
+      - 'logit' → sm.Logit
+      - 'ols'   → sm.OLS
+    Retorna (resultado_statsmodels, tabela).
     """
+    X_sm = sm.add_constant(X_design, has_constant="add")
+    if model_type == "logit":
+        res = sm.Logit(y.values, X_sm).fit(disp=False)
+        summ = res.summary2().tables[1].copy()  # coef, std err, z, P>|z|, [0.025, 0.975]
+        or_vals = np.exp(res.params)
+        or_ci = np.exp(res.conf_int())
+        tbl = pd.DataFrame({
+            "feature": summ.index,
+            "coef": res.params.values,
+            "std_err": res.bse.values,
+            "z/t": summ["z"].values,
+            "p_value": summ["P>|z|"].values,
+            "ci_low": res.conf_int()[0].values,
+            "ci_high": res.conf_int()[1].values,
+            "odds_ratio": or_vals.values,
+            "or_ci_low": or_ci[0].values,
+            "or_ci_high": or_ci[1].values
+        })
+    else:
+        res = sm.OLS(y.values, X_sm).fit()
+        conf = res.conf_int()
+        tbl = pd.DataFrame({
+            "feature": res.params.index,
+            "coef": res.params.values,
+            "std_err": res.bse.values,
+            "z/t": res.tvalues.values,
+            "p_value": res.pvalues.values,
+            "ci_low": conf[0].values,
+            "ci_high": conf[1].values
+        })
+    return res, tbl
+def recs_from_inference(tbl: pd.DataFrame, model_type: str, k: int = 5):
+    """Gera recomendações do item (e) a partir dos efeitos significativos (p<0.05), ignorando o intercepto."""
+    df = tbl[tbl["feature"] != "const"].copy()
+    df = df.sort_values(["p_value", "z/t"], ascending=[True, False])
+    core = df[df["p_value"] < 0.05].head(k)
+    out = []
+    for _, r in core.iterrows():
+        feat = r["feature"]
+        sign = np.sign(r["coef"])
+        if model_type == "logit":
+            or_txt = f"(OR≈{r['odds_ratio']:.2f}, IC95% {r['or_ci_low']:.2f}–{r['or_ci_high']:.2f}, p={r['p_value']:.3g})"
+            if sign > 0:
+                out.append(f"🔧 **Reduzir exposição associada a `{feat}`** {or_txt}, pois aumento nessa variável eleva a probabilidade do evento alvo.")
+            else:
+                out.append(f"✅ **Fortalecer fatores ligados a `{feat}`** {or_txt}, pois valores maiores reduzem a probabilidade do evento alvo.")
         else:
+            eff = f"(β≈{r['coef']:.3g}, IC95% {r['ci_low']:.2g}–{r['ci_high']:.2g}, p={r['p_value']:.3g})"
+            if sign > 0:
+                out.append(f"🔧 **Mitigar o crescimento de `{feat}`** {eff}, pois contribui positivamente para o aumento do alvo.")
+            else:
+                out.append(f"✅ **Aumentar `{feat}`** {eff}, pois está associado à redução do alvo.")
+    # Trilhas transversais
+    out.append("🧪 **Testes A/B** nas variáveis mais significativas para validar impacto causal antes de escalar.")
+    out.append("📞 **Melhorar FCR** e fluxos críticos detectados pelas variáveis top-k (treinamento, scripts, UX).")
+    out.append("🔁 **Feedback para Produto/Qualidade** baseado nos fatores com evidência estatística robusta.")
+    return out[:k+3]
 # -----------------------------
+# Sidebar (lado esquerdo)
 # -----------------------------
 with st.sidebar:
+    st.header("⚙️ Configuração")
     if not os.path.exists(DATA_PATH):
+        st.error(f"Arquivo não encontrado: `{DATA_PATH}`. Suba o CSV em `Dados/`.")
+        st.stop()
+# Carrega & engenharia
+df_raw = load_csv_try(DATA_PATH)
+df_eng = engineer_features(df_raw)
+with st.sidebar:
+    st.markdown("**Escolha o alvo (variável dependente):**")
+    all_cols = df_eng.columns.tolist()
+    # Preferências de alvo conforme seu caderno
+    preferred_targets = ["Response", "Complain", "HasComplained", "Exited", "Churn", "Complaint"]
+    default_target = next((c for c in preferred_targets if c in all_cols), None)
+    if default_target is None:
+        # tenta primeira binária; senão primeira numérica
+        for c in all_cols:
+            if is_binary_series(df_eng[c]): default_target = c; break
+        if default_target is None:
+            for c in all_cols:
+                if np.issubdtype(df_eng[c].dtype, np.number): default_target = c; break
+    target_col = st.selectbox("Alvo (y)", options=all_cols, index=all_cols.index(default_target) if default_target in all_cols else 0)
+# Variáveis explicativas
+exclude = [target_col]
+num_cols_all, cat_cols_all = split_num_cat(df_eng, exclude=exclude)
 with st.sidebar:
+    st.markdown("**Variáveis explicativas (X):**")
+    # sugere alguns engenheirados primeiro
+    engineered_first = [c for c in ["TenureDays","TotalMnt","TotalPurchases","OnlineShare","PromoShare","AvgTicket","BasketDiversity"] if c in num_cols_all]
+    base_defaults = engineered_first + [c for c in num_cols_all if c not in engineered_first][:5] + cat_cols_all[:3]
+    selected_feats = st.multiselect("Selecione X", options=(num_cols_all + cat_cols_all), default=base_defaults)
     test_size = st.slider("Proporção de teste", 0.1, 0.4, 0.2, 0.05)
     random_state = st.number_input("Random seed", value=42, step=1)
+if len(selected_feats) == 0:
+    st.warning("Selecione pelo menos uma variável explicativa.")
+    st.stop()
 # -----------------------------
+# Amostra de dados
 # -----------------------------
+st.markdown("### 🔎 Amostra")
+st.dataframe(df_eng[[target_col] + selected_feats].head(12), use_container_width=True)
+# -----------------------------
+# Preparação / Treino
+# -----------------------------
+df = df_eng.dropna(subset=[target_col]).copy()
+y_raw = df[target_col]
+is_bin = is_binary_series(y_raw)
+model_type = "logit" if is_bin else "ols"
+# Mapear alvo binário não numérico → {0,1}
+if is_bin and not np.issubdtype(y_raw.dtype, np.number):
+    uniq = sorted(pd.unique(y_raw.dropna()).tolist(), key=lambda x: str(x))
+    y = y_raw.replace({uniq[0]: 0, uniq[1]: 1}).astype(int)
+else:
+    y = y_raw.astype(float if not is_bin else int)
+X = df[selected_feats].copy()
+sel_num = [c for c in selected_feats if np.issubdtype(X[c].dtype, np.number)]
+sel_cat = [c for c in selected_feats if (X[c].dtype == "object" or X[c].dtype.name == "category")]
+pre = build_preprocessor(sel_num, sel_cat)
+# Pipeline só para métricas/predição
+quick_est = LogisticRegression(max_iter=200) if is_bin else LinearRegression()
+pipe = Pipeline([("pre", pre), ("est", quick_est)])
 X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=test_size, random_state=random_state, stratify=y if is_bin else None
 )
+with st.spinner("Treinando e construindo matriz de design..."):
+    pipe.fit(X_train, y_train)
+    pre_fit = pipe.named_steps["pre"].fit(X_train, y_train)
+    X_train_design = pre_fit.transform(X_train)
+    feat_names = get_feature_names(pre_fit, sel_num, sel_cat)
+    X_train_df = pd.DataFrame(X_train_design, columns=feat_names)
 # -----------------------------
+# Inferência estatística (item e)
 # -----------------------------
+st.markdown("## 📚 Inferência estatística (Item e)")
+with st.spinner("Ajustando modelo de inferência (statsmodels)..."):
+    res, infer_tbl = fit_inference(model_type, X_train_df, y_train)
+if is_bin:
+    st.caption("Modelo: **Logit** (alvo binário). Coeficientes em log-odds; também exibimos **odds ratios**.")
+    cols_show = ["feature","coef","std_err","z/t","p_value","ci_low","ci_high","odds_ratio","or_ci_low","or_ci_high"]
+else:
+    st.caption("Modelo: **OLS** (alvo contínuo). Coeficientes, erros-padrão, estatística t e IC 95%.")
+    cols_show = ["feature","coef","std_err","z/t","p_value","ci_low","ci_high"]
+st.dataframe(infer_tbl[cols_show].round(4), use_container_width=True)
+# -----------------------------
+# Métricas de desempenho
+# -----------------------------
+st.markdown("### 📈 Desempenho do modelo")
+if is_bin:
+    y_proba = pipe.predict_proba(X_test)[:, 1] if hasattr(pipe.named_steps["est"], "predict_proba") else pipe.predict(X_test)
+    y_pred = (y_proba >= 0.5).astype(int)
+    auc = roc_auc_score(y_test, y_proba)
+    acc = accuracy_score(y_test, y_pred)
+    c1, c2 = st.columns(2)
+    with c1: st.metric("AUC (ROC)", f"{auc:.3f}")
+    with c2: st.metric("Acurácia (0.5)", f"{acc:.3f}")
+    cm = confusion_matrix(y_test, y_pred)
+    st.markdown("**Matriz de confusão (teste)**")
+    st.dataframe(pd.DataFrame(cm, index=["Real 0","Real 1"], columns=["Pred 0","Pred 1"]), use_container_width=True)
+    fpr, tpr, _ = roc_curve(y_test, y_proba)
+    roc_data = pd.DataFrame({"fpr": fpr, "tpr": tpr})
+    roc_chart = alt.Chart(roc_data).mark_line().encode(x="fpr:Q", y="tpr:Q").properties(height=250, width=380)
+    diag = alt.Chart(pd.DataFrame({"x":[0,1],"y":[0,1]})).mark_line(strokeDash=[4,4]).encode(x="x", y="y")
+    st.altair_chart(roc_chart + diag, use_container_width=True)
+else:
+    y_pred = pipe.predict(X_test)
+    r2 = r2_score(y_test, y_pred)
+    rmse = mean_squared_error(y_test, y_pred, squared=False)
+    c1, c2 = st.columns(2)
+    with c1: st.metric("R² (teste)", f"{r2:.3f}")
+    with c2: st.metric("RMSE (teste)", f"{rmse:.3f}")
 # -----------------------------
+# Efeito visual (força por |t/z|)
 # -----------------------------
+st.markdown("### 🌟 Força dos efeitos (|t/z|)")
+eff_df = infer_tbl[infer_tbl["feature"] != "const"].copy()
+eff_df["effect_strength"] = eff_df["z/t"].abs()
+eff_chart = alt.Chart(eff_df.sort_values("effect_strength", ascending=False).head(20)).mark_bar().encode(
+    x=alt.X("effect_strength:Q", title="|estatística t/z|"),
+    y=alt.Y("feature:N", sort='-x', title="Variável")
+).properties(height=420)
+st.altair_chart(eff_chart, use_container_width=True)
 # -----------------------------
 # Predição interativa
 # -----------------------------
+st.markdown("## 🔮 Predição interativa")
+st.caption("Ajuste valores para X e veja a probabilidade (Logit) ou valor previsto (OLS).")
 with st.form("pred_form"):
     cols = st.columns(3)
+    user_inputs = {}
+    for i, col in enumerate(selected_feats):
+        with cols[i % 3]:
             if col in sel_num:
+                q1, q5, q95, q99 = X_train[col].quantile([0.01,0.05,0.95,0.99])
+                default_val = float(np.nan_to_num(X_train[col].median(), nan=0.0))
+                user_inputs[col] = st.number_input(
+                    f"{col}", value=default_val,
                     help=f"Faixa típica ~ {q5:.2f}–{q95:.2f} (1–99%: {q1:.2f}–{q99:.2f})"
                 )
             else:
+                opts = sorted([str(x) for x in X_train[col].dropna().unique().tolist()])[:50]
+                user_inputs[col] = st.selectbox(f"{col}", options=opts if opts else [""], index=0 if opts else 0)
+    submitted = st.form_submit_button("Calcular")
 if submitted:
+    x_new = pd.DataFrame([user_inputs])
+    x_new_proc = pre_fit.transform(x_new)
+    x_new_df = pd.DataFrame(x_new_proc, columns=feat_names)
+    X_sm_new = sm.add_constant(x_new_df, has_constant="add")
+    y_hat = float(res.predict(X_sm_new)[0])
+    if is_bin:
+        st.success(f"Probabilidade prevista do evento alvo: **{y_hat:.2%}**")
+    else:
+        st.success(f"Valor previsto do alvo: **{y_hat:.4g}**")
 # -----------------------------
+# Recomendações (Item e)
 # -----------------------------
+st.markdown("## 🧭 Recomendações estratégicas (Item e)")
+for r in recs_from_inference(infer_tbl, model_type="logit" if is_bin else "ols", k=5):
     st.markdown("- " + r)
 st.markdown("---")
+st.caption("App Streamlit — Inferência Estatística • Dados/marketing_campaign.csv • Controles na barra lateral (esquerda).")