atividade1_nova

Sleeping

App Files Files Community

ricardoadriano committed on Nov 6, 2025

Commit

d5ce1f9

verified ·

1 Parent(s): eb89db6

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +233 -378

src/streamlit_app.py CHANGED Viewed

@@ -1,391 +1,246 @@
-# Dataset: Dados/marketing_campaign.csv
-import os
 import numpy as np
 import pandas as pd
-import streamlit as st
 import altair as alt
-from typing import List, Tuple
-from sklearn.model_selection import train_test_split
-from sklearn.compose import ColumnTransformer
-from sklearn.preprocessing import OneHotEncoder, StandardScaler
-from sklearn.pipeline import Pipeline
-from sklearn.impute import SimpleImputer
-from sklearn.metrics import (
-    roc_auc_score, accuracy_score, confusion_matrix, roc_curve,
-    r2_score, mean_squared_error
-)
-from sklearn.linear_model import LogisticRegression, LinearRegression
-import statsmodels.api as sm
-st.set_page_config(page_title="Inferência Estatística", layout="wide")
-st.title("Inferência Estatística — Reclamações de Clientes")
-st.caption("Escolha o **alvo** e as **variáveis explicativas** na barra lateral (esquerda) e obtenha a inferência estatística.")
-DATA_PATH = "Dados/marketing_campaign.csv"
-# ---------- Utilidades ----------
 @st.cache_data(show_spinner=False)
-def load_csv_try(path: str) -> pd.DataFrame:
-    """Lê CSV tentando separadores: vírgula, ponto-e-vírgula e tab."""
-    for sep in [",", ";", "\t"]:
-        try:
-            df = pd.read_csv(path, sep=sep, encoding="utf-8")
-            if sep != "\t" and df.shape[1] == 1:
-                continue
-            return df
-        except Exception:
-            continue
-    return pd.read_csv(path, sep=None, engine="python")
-def split_num_cat(df: pd.DataFrame, exclude: List[str]) -> Tuple[List[str], List[str]]:
-    num_cols = [c for c in df.columns if c not in exclude and np.issubdtype(df[c].dtype, np.number)]
-    cat_cols = [c for c in df.columns if c not in exclude and (df[c].dtype == "object" or df[c].dtype.name == "category")]
-    return num_cols, cat_cols
-def is_binary_series(s: pd.Series) -> bool:
-    vals = pd.unique(s.dropna())
-    return len(vals) == 2 or s.dtype == bool
-def coerce_numeric_series(s: pd.Series) -> pd.Series:
-    """Tenta converter strings numéricas para float (lida com vírgula decimal)."""
-    if np.issubdtype(s.dtype, np.number):
-        return s.astype(float)
-    tmp = s.astype(str).str.replace(r"[.\s]", "", regex=True).str.replace(",", ".", regex=False)
-    return pd.to_numeric(tmp, errors="coerce")
-def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
-    """Engenharia minimalista para o dataset padrão do Kaggle."""
-    out = df.copy()
-    # Tenure (dias desde Dt_Customer)
-    if "Dt_Customer" in out.columns:
-        out["Dt_Customer"] = pd.to_datetime(out["Dt_Customer"], errors="coerce", dayfirst=True)
-        out["TenureDays"] = (pd.Timestamp("today").normalize() - out["Dt_Customer"]).dt.days
-    # Total gasto (Mnt*)
-    mnt_cols = [c for c in out.columns if c.startswith("Mnt")]
-    if mnt_cols:
-        out["TotalMnt"] = out[mnt_cols].sum(axis=1)
-    # Compras totais e participações
-    buy_cols = [c for c in ["NumWebPurchases", "NumCatalogPurchases", "NumStorePurchases"] if c in out.columns]
-    if buy_cols:
-        out["TotalPurchases"] = out[buy_cols].sum(axis=1)
-        if "NumWebPurchases" in out.columns:
-            out["OnlineShare"] = out["NumWebPurchases"] / out["TotalPurchases"].replace(0, np.nan)
-        if "NumDealsPurchases" in out.columns:
-            out["PromoShare"] = out["NumDealsPurchases"] / out["TotalPurchases"].replace(0, np.nan)
-    # Ticket médio
-    if "TotalMnt" in out.columns and "TotalPurchases" in out.columns:
-        out["AvgTicket"] = out["TotalMnt"] / out["TotalPurchases"].replace(0, np.nan)
-    # Diversidade de cesta (quantos tipos Mnt*>0)
-    if mnt_cols:
-        out["BasketDiversity"] = (out[mnt_cols] > 0).sum(axis=1)
-    return out
-def build_preprocessor(num_cols: List[str], cat_cols: List[str]) -> ColumnTransformer:
-    """Imputação + padronização (num) e OHE drop='first' (cat) para evitar colinearidade."""
-    num_pipe = Pipeline([
-        ("imp", SimpleImputer(strategy="median")),
-        ("scaler", StandardScaler())
-    ])
-    cat_pipe = Pipeline([
-        ("imp", SimpleImputer(strategy="most_frequent")),
-        ("ohe", OneHotEncoder(handle_unknown="ignore", drop="first", sparse_output=False))
-    ])
-    return ColumnTransformer([
-        ("num", num_pipe, num_cols),
-        ("cat", cat_pipe, cat_cols)
-    ])
-def get_feature_names(pre: ColumnTransformer, num_cols: List[str], cat_cols: List[str]) -> List[str]:
-    names = list(num_cols)
-    if cat_cols:
-        ohe = pre.named_transformers_["cat"].named_steps["ohe"]
-        names.extend(list(ohe.get_feature_names_out(cat_cols)))
-    return names
-def fit_inference(model_type: str, X_design: pd.DataFrame, y: pd.Series):
-    """Ajusta a inferência (statsmodels): Logit p/ binário; OLS p/ contínuo."""
-    X_sm = sm.add_constant(X_design, has_constant="add")
-    if model_type == "logit":
-        res = sm.Logit(y.values, X_sm).fit(disp=False)
-        or_vals = np.exp(res.params)
-        or_ci = np.exp(res.conf_int())
-        tbl = pd.DataFrame({
-            "feature": res.params.index,
-            "coef": res.params.values,
-            "std_err": res.bse.values,
-            "z/t": res.tvalues.values if hasattr(res, "tvalues") else res.tvalues,
-            "p_value": res.pvalues.values,
-            "ci_low": res.conf_int()[0].values,
-            "ci_high": res.conf_int()[1].values,
-            "odds_ratio": or_vals.values,
-            "or_ci_low": or_ci[0].values,
-            "or_ci_high": or_ci[1].values
         })
     else:
-        res = sm.OLS(y.values, X_sm).fit()
-        conf = res.conf_int()
-        tbl = pd.DataFrame({
-            "feature": res.params.index,
-            "coef": res.params.values,
-            "std_err": res.bse.values,
-            "z/t": res.tvalues.values,
-            "p_value": res.pvalues.values,
-            "ci_low": conf[0].values,
-            "ci_high": conf[1].values
-        })
-    return res, tbl
-def recs_from_inference(tbl: pd.DataFrame, model_type: str, k: int = 5):
-    """Gera recomendações (item e) a partir dos efeitos significativos (p<0.05), ignorando 'const'."""
-    df = tbl[tbl["feature"] != "const"].copy()
-    df = df.sort_values(["p_value", "z/t"], ascending=[True, False])
-    core = df[df["p_value"] < 0.05].head(k)
-    out = []
-    for _, r in core.iterrows():
-        feat = r["feature"]
-        sign = np.sign(r["coef"])
-        if model_type == "logit":
-            or_txt = f"(OR≈{r['odds_ratio']:.2f}, IC95% {r['or_ci_low']:.2f}–{r['or_ci_high']:.2f}, p={r['p_value']:.3g})"
-            if sign > 0:
-                out.append(f" **Reduzir exposição associada a `{feat}`** {or_txt}, pois aumento nessa variável eleva a probabilidade do alvo.")
-            else:
-                out.append(f" **Fortalecer fatores ligados a `{feat}`** {or_txt}, pois valores maiores reduzem a probabilidade do alvo.")
-        else:
-            eff = f"(β≈{r['coef']:.3g}, IC95% {r['ci_low']:.2g}–{r['ci_high']:.2g}, p={r['p_value']:.3g})"
-            if sign > 0:
-                out.append(f" **Mitigar o crescimento de `{feat}`** {eff}, pois contribui positivamente para o aumento do alvo.")
-            else:
-                out.append(f" **Aumentar `{feat}`** {eff}, pois está associado à redução do alvo.")
-    # trilhas transversais
-    out.append(" **Testes A/B** nas variáveis mais significativas para validar impacto causal.")
-    out.append(" **Melhorar FCR/primeiro contato** nas causas evidenciadas pelos top fatores.")
-    out.append(" **Feedback a Produto/Qualidade** guiado pelos efeitos com evidência estatística robusta.")
-    return out[:k+3]
-# ---------- Sidebar (lado esquerdo) ----------
-with st.sidebar:
-    st.header("Configuração")
-    if not os.path.exists(DATA_PATH):
-        st.error(f"Arquivo não encontrado: `{DATA_PATH}`. Suba o CSV em `Dados/`.")
-        st.stop()
-df_raw = load_csv_try(DATA_PATH)
-df_eng = engineer_features(df_raw)
-with st.sidebar:
-    st.markdown("**Alvo (variável dependente):**")
-    all_cols = df_eng.columns.tolist()
-    # Alvo padrão fixo: Response (se existir). Caso contrário, mesma lógica de fallback.
-    if "Response" in all_cols:
-        default_target = "Response"
-    else:
-        default_target = None
-        for c in all_cols:
-            if is_binary_series(df_eng[c]): default_target = c; break
-        if default_target is None:
-            for c in all_cols:
-                if np.issubdtype(df_eng[c].dtype, np.number): default_target = c; break
-    target_col = st.selectbox("Alvo (y)", options=all_cols, index=all_cols.index(default_target) if default_target in all_cols else 0)
-# Variáveis explicativas
-exclude = [target_col]
-num_cols_all, cat_cols_all = split_num_cat(df_eng, exclude=exclude)
-# X padrão alinhado ao Colab (só incluir se existir na base)
-preferred_defaults = [
-    "Income", "Recency", "Education", "Marital_Status",
-    "TenureDays", "TotalMnt", "TotalPurchases",
-    "OnlineShare", "PromoShare", "AvgTicket", "BasketDiversity",
-    "NumWebVisitsMonth"
-]
-default_X = [c for c in preferred_defaults if c in (num_cols_all + cat_cols_all)]
-with st.sidebar:
-    st.markdown("**Variáveis explicativas (X):**")
-    # Se nada dos preferidos existir, cai no fallback antigo (algumas num + categ)
-    if not default_X:
-        engineered_first = [c for c in ["TenureDays","TotalMnt","TotalPurchases","OnlineShare","PromoShare","AvgTicket","BasketDiversity"] if c in num_cols_all]
-        default_X = engineered_first + [c for c in num_cols_all if c not in engineered_first][:5] + cat_cols_all[:3]
-    selected_feats = st.multiselect("Selecione X", options=(num_cols_all + cat_cols_all), default=default_X)
-    test_size = st.slider("Proporção de teste", 0.1, 0.4, 0.2, 0.05)
-    random_state = st.number_input("Random seed", value=42, step=1)
-if len(selected_feats) == 0:
-    st.warning("Selecione pelo menos uma variável explicativa.")
     st.stop()
-# ---------- Amostra ----------
-st.markdown("### Amostra dos dados")
-st.dataframe(df_eng[[target_col] + selected_feats].head(12), use_container_width=True)
-# ---------- Preparação do alvo ----------
-y_raw = df_eng[target_col]
-# 1) tenta identificar binário diretamente
-is_bin = is_binary_series(y_raw)
-# 2) se não binário, tenta numérico (coerção segura)
-y_numeric_try = coerce_numeric_series(y_raw) if not is_bin else None
-is_numeric_ok = False
-if not is_bin and y_numeric_try is not None:
-    conv_rate = y_numeric_try.notna().mean()
-    is_numeric_ok = conv_rate >= 0.8
-# 3) se não binário e não numérico, vira categórico multi-classe → one-vs-rest
-with st.sidebar:
-    positive_class = None
-    if not is_bin and not is_numeric_ok:
-        uniq_vals = sorted(pd.unique(y_raw.dropna()).tolist(), key=lambda x: str(x))
-        st.markdown("**Alvo categórico com múltiplas classes**")
-        positive_class = st.selectbox("Classe 'positiva' (one-vs-rest)", options=uniq_vals, index=0)
-        st.caption("O modelo fará Logit para a classe escolhida vs. as demais.")
-# ---------- Montagem de y conforme os casos ----------
-if is_bin:
-    if not np.issubdtype(y_raw.dtype, np.number):
-        uniq = sorted(pd.unique(y_raw.dropna()).tolist(), key=lambda x: str(x))
-        y = y_raw.replace({uniq[0]: 0, uniq[1]: 1}).astype(int)
-    else:
-        y = y_raw.astype(int)
-    model_type = "logit"
-elif is_numeric_ok:
-    y = y_numeric_try.astype(float)
-    model_type = "ols"
-else:
-    y = (y_raw == positive_class).astype(int)
-    model_type = "logit"
-# Alinha df aos y válidos
-mask_valid = y.notna()
-df_model = df_eng.loc[mask_valid].copy()
-y = y.loc[mask_valid]
-X = df_model[selected_feats].copy()
-# ---------- Pré-processamento e treino ----------
-sel_num = [c for c in selected_feats if np.issubdtype(X[c].dtype, np.number)]
-sel_cat = [c for c in selected_feats if (X[c].dtype == "object" or X[c].dtype.name == "category")]
-pre = build_preprocessor(sel_num, sel_cat)
-quick_est = LogisticRegression(max_iter=200) if model_type == "logit" else LinearRegression()
-pipe = Pipeline([("pre", pre), ("est", quick_est)])
-X_train, X_test, y_train, y_test = train_test_split(
-    X, y, test_size=test_size, random_state=random_state,
-    stratify=y if model_type == "logit" else None
 )
-with st.spinner("Treinando e construindo matriz de design..."):
-    pipe.fit(X_train, y_train)
-    pre_fit = pipe.named_steps["pre"].fit(X_train, y_train)
-    X_train_design = pre_fit.transform(X_train)
-    # nomes das features após OHE
-    ohe_names = []
-    if sel_cat:
-        ohe_names = list(pre_fit.named_transformers_["cat"].named_steps["ohe"].get_feature_names_out(sel_cat))
-    feat_names = sel_num + ohe_names
-    X_train_df = pd.DataFrame(X_train_design, columns=feat_names)
-# ---------- Inferência (item e) ----------
-st.markdown("## Inferência estatística")
-with st.spinner("Ajustando modelo de inferência (statsmodels)..."):
-    res, infer_tbl = fit_inference(model_type, X_train_df, y_train)
-if model_type == "logit":
-    if positive_class is not None:
-        st.caption(f"Modelo: **Logit** (one-vs-rest). Classe positiva: **{positive_class}**.")
-    else:
-        st.caption("Modelo: **Logit** (alvo binário). Coeficientes em log-odds; exibimos **odds ratios** e IC 95%.")
-    cols_show = ["feature","coef","std_err","z/t","p_value","ci_low","ci_high","odds_ratio","or_ci_low","or_ci_high"]
-else:
-    st.caption("Modelo: **OLS** (alvo contínuo). Coeficientes, erros-padrão, estatística t e IC 95%.")
-    cols_show = ["feature","coef","std_err","z/t","p_value","ci_low","ci_high"]
-st.dataframe(infer_tbl[cols_show].round(4), use_container_width=True)
-# ---------- Métricas ----------
-st.markdown("### Desempenho do modelo")
-if model_type == "logit":
-    y_proba = pipe.predict_proba(X_test)[:, 1] if hasattr(pipe.named_steps["est"], "predict_proba") else pipe.predict(X_test)
-    y_pred = (y_proba >= 0.5).astype(int)
-    auc = roc_auc_score(y_test, y_proba)
-    acc = accuracy_score(y_test, y_pred)
-    c1, c2 = st.columns(2)
-    with c1: st.metric("AUC (ROC)", f"{auc:.3f}")
-    with c2: st.metric("Acurácia (0.5)", f"{acc:.3f}")
-    cm = confusion_matrix(y_test, y_pred)
-    st.markdown("**Matriz de confusão (teste)**")
-    st.dataframe(pd.DataFrame(cm, index=["Real 0","Real 1"], columns=["Pred 0","Pred 1"]), use_container_width=True)
-    fpr, tpr, _ = roc_curve(y_test, y_proba)
-    roc_data = pd.DataFrame({"fpr": fpr, "tpr": tpr})
-    roc_chart = alt.Chart(roc_data).mark_line().encode(x="fpr:Q", y="tpr:Q").properties(height=250, width=380)
-    diag = alt.Chart(pd.DataFrame({"x":[0,1],"y":[0,1]})).mark_line(strokeDash=[4,4]).encode(x="x", y="y")
-    st.altair_chart(roc_chart + diag, use_container_width=True)
-else:
-    y_pred = pipe.predict(X_test)
-    r2 = r2_score(y_test, y_pred)
-    rmse = mean_squared_error(y_test, y_pred, squared=False)
-    c1, c2 = st.columns(2)
-    with c1: st.metric("R² (teste)", f"{r2:.3f}")
-    with c2: st.metric("RMSE (teste)", f"{rmse:.3f}")
-# ---------- Força dos efeitos ----------
-st.markdown("### Força dos efeitos (|t/z|)")
-eff_df = infer_tbl[infer_tbl["feature"] != "const"].copy()
-eff_df["effect_strength"] = eff_df["z/t"].abs()
-eff_chart = alt.Chart(eff_df.sort_values("effect_strength", ascending=False).head(20)).mark_bar().encode(
-    x=alt.X("effect_strength:Q", title="|estatística t/z|"),
-    y=alt.Y("feature:N", sort='-x', title="Variável")
-).properties(height=420)
-st.altair_chart(eff_chart, use_container_width=True)
-# ---------- Predição interativa ----------
-st.markdown("## Predição interativa")
-st.caption("Ajuste valores para X e veja a probabilidade (Logit) ou valor previsto (OLS).")
-with st.form("pred_form"):
-    cols = st.columns(3)
-    user_inputs = {}
-    for i, col in enumerate(selected_feats):
-        with cols[i % 3]:
-            if col in [c for c in selected_feats if np.issubdtype(X[c].dtype, np.number)]:
-                q1, q5, q95, q99 = X_train[col].quantile([0.01,0.05,0.95,0.99])
-                default_val = float(np.nan_to_num(X_train[col].median(), nan=0.0))
-                user_inputs[col] = st.number_input(
-                    f"{col}", value=default_val,
-                    help=f"Faixa típica ~ {q5:.2f}–{q95:.2f} (1–99%: {q1:.2f}–{q99:.2f})"
-                )
-            else:
-                opts = sorted([str(x) for x in X_train[col].dropna().unique().tolist()])[:50]
-                user_inputs[col] = st.selectbox(f"{col}", options=opts if opts else [""], index=0 if opts else 0)
-    submitted = st.form_submit_button("Calcular")
-if submitted:
-    x_new = pd.DataFrame([user_inputs])
-    x_new_proc = pre_fit.transform(x_new)
-    x_new_df = pd.DataFrame(x_new_proc, columns=feat_names)
-    X_sm_new = sm.add_constant(x_new_df, has_constant="add")
-    y_hat = float(res.predict(X_sm_new)[0])
-    if model_type == "logit":
-        st.success(f"Probabilidade prevista do alvo: **{y_hat:.2%}**")
-    else:
-        st.success(f"Valor previsto do alvo: **{y_hat:.4g}**")
-# ---------- Recomendações (item e) ----------
-st.markdown("## Recomendações estratégicas (Item e)")
-for r in recs_from_inference(infer_tbl, model_type=model_type, k=5):
-    st.markdown("- " + r)
-st.markdown("---")
-st.caption("Controles na barra lateral (esquerda) • Dados: `Dados/marketing_campaign.csv` • Inferência conforme item (e).")

+import re
 import numpy as np
 import pandas as pd
 import altair as alt
+import streamlit as st
+from pathlib import Path
+st.set_page_config(page_title="Simulação Monte Carlo (Dirichlet–Multinomial)", layout="wide")
+# ===================== Sidebar: parâmetros =====================
+st.sidebar.title("Parâmetros da Simulação")
+N_SIM = st.sidebar.number_input("Número de simulações", min_value=1000, max_value=200_000, value=10_000, step=1000)
+META_APROV = st.sidebar.slider("Meta de aprovação (≥)", 0.50, 0.95, 0.80, 0.01)
+MAX_EVASAO = st.sidebar.slider("Limite de evasão (≤)", 0.00, 0.40, 0.15, 0.01)
+ADD_K = st.sidebar.select_slider("Suavização add-k", options=[0.5, 1.0, 2.0], value=1.0)
+N_MULT = st.sidebar.select_slider("Cenário do tamanho da turma (n ×)", options=[0.9, 1.0, 1.1], value=1.0)
+SEED = st.sidebar.number_input("Semente aleatória", min_value=0, value=42, step=1)
+# ===================== Helpers =====================
+def _norm_cols(cols):
+    return [re.sub(r"\s+", " ", str(c)).strip().replace("%", "pct") for c in cols]
+def _pick(col, pats):
+    return any(re.search(p, col, re.I) for p in pats)
+def _to_num(s):
+    return pd.to_numeric(
+        s.astype(str)
+         .str.replace("%", "", regex=False)
+         .str.replace(",", ".", regex=False)
+         .str.strip(),
+        errors="coerce"
+    )
+def _try_read_csv(path: Path):
+    """Lê Dados/levantamentoTurmas.csv tentando separadores e encodings comuns."""
+    if not path.exists():
+        return None, f"Arquivo esperado não encontrado: {path}"
+    last_err = None
+    for enc in ("utf-8-sig", "utf-8", "latin1"):
+        for sep in (None, ",", ";", "\t"):  # None = autodetect
+            try:
+                df = pd.read_csv(path, sep=sep, engine="python", encoding=enc)
+                if df.shape[1] == 1 and sep is None:
+                    df = pd.read_csv(path, sep=";", engine="python", encoding=enc)
+                return df, {"source": str(path), "sep": sep if sep is not None else "auto", "encoding": enc}
+            except Exception as e:
+                last_err = e
+                continue
+    return None, f"Falha ao ler {path}: {last_err}"
+@st.cache_data(show_spinner=False)
+def load_dataframe_from_dados():
+    csv_path = Path("Dados/levantamentoTurmas.csv")
+    df, meta = _try_read_csv(csv_path)
+    if df is None:
+        return None, meta  # mensagem de erro
+    # Normalização de cabeçalhos
+    df.columns = _norm_cols(df.columns)
+    # Renomeação inteligente
+    ren = {}
+    for c in df.columns:
+        lc = c.lower()
+        if _pick(c, [r"^turma"]):               ren[c] = "Turma"
+        elif _pick(c, [r"matriculado"]):        ren[c] = "Matriculados"
+        elif _pick(c, [r"\baprov"]):            ren[c] = "Aprovados" if "pct" not in lc else "pct_Aprov"
+        elif _pick(c, [r"reprov"]):             ren[c] = "Reprovados" if "pct" not in lc else "pct_Reprov"
+        elif _pick(c, [r"desistent|evas"]):     ren[c] = "Desistentes" if "pct" not in lc else "pct_Desist"
+    df = df.rename(columns=ren)
+    # Converte números/percentuais
+    for c in ["Matriculados","Aprovados","Reprovados","Desistentes","pct_Aprov","pct_Reprov","pct_Desist"]:
+        if c in df.columns:
+            df[c] = _to_num(df[c])
+    # Reconstrói contagens quando vierem apenas em %
+    if "Aprovados" not in df.columns and "pct_Aprov" in df.columns:
+        df["Aprovados"] = (df["pct_Aprov"]/100 * df["Matriculados"]).round()
+    if "Reprovados" not in df.columns and "pct_Reprov" in df.columns:
+        df["Reprovados"] = (df["pct_Reprov"]/100 * df["Matriculados"]).round()
+    if "Desistentes" not in df.columns and "pct_Desist" in df.columns:
+        df["Desistentes"] = (df["pct_Desist"]/100 * df["Matriculados"]).round()
+    need = ["Turma","Matriculados","Aprovados","Reprovados","Desistentes"]
+    miss = [c for c in need if c not in df.columns]
+    if miss:
+        return None, f"Colunas ausentes no CSV ({csv_path}): {miss}"
+    base = df[need].copy()
+    for c in need[1:]:
+        base[c] = pd.to_numeric(base[c], errors="coerce").fillna(0).astype(int)
+    base = base[base["Matriculados"] > 0].copy()
+    base["Turma"] = base["Turma"].astype(str).str.strip()
+    # Ajuste de soma
+    soma = base[["Aprovados","Reprovados","Desistentes"]].sum(axis=1)
+    diff = soma != base["Matriculados"]
+    base.loc[diff, "Aprovados"] = (
+        base.loc[diff, "Matriculados"] - base.loc[diff, ["Reprovados","Desistentes"]].sum(axis=1)
+    ).clip(lower=0)
+    if len(base) == 0:
+        return None, "Após limpeza, não restaram turmas válidas."
+    return base.reset_index(drop=True), None
 @st.cache_data(show_spinner=False)
+def simulate_dirichlet_multinomial(base: pd.DataFrame, n_sim: int, meta_aprov: float, max_evasao: float, add_k: float, n_mult: float, seed: int):
+    rng = np.random.default_rng(seed)
+    rows = []
+    for _, r in base.iterrows():
+        turma = r["Turma"]
+        n0 = int(r["Matriculados"])
+        n = max(1, int(round(n0 * n_mult)))
+        a, rp, dz = int(r["Aprovados"]), int(r["Reprovados"]), int(r["Desistentes"])
+        alpha = np.array([a + add_k, rp + add_k, dz + add_k], dtype=float)
+        P = rng.dirichlet(alpha, size=n_sim)
+        counts = np.vstack([rng.multinomial(n, p) for p in P])
+        t_ap = counts[:, 0] / n
+        t_dz = counts[:, 2] / n
+        rows.append({
+            "Turma": turma,
+            "Matriculados": n,
+            "Média_Aprov": t_ap.mean(),
+            "P5_Aprov": np.percentile(t_ap, 5),
+            "P50_Aprov": np.percentile(t_ap, 50),
+            "P95_Aprov": np.percentile(t_ap, 95),
+            "Média_Desist": t_dz.mean(),
+            "P5_Desist": np.percentile(t_dz, 5),
+            "P50_Desist": np.percentile(t_dz, 50),
+            "P95_Desist": np.percentile(t_dz, 95),
+            "Prob_Meta": ((t_ap >= meta_aprov) & (t_dz <= max_evasao)).mean()
         })
+    return pd.DataFrame(rows).sort_values("Prob_Meta", ascending=False).reset_index(drop=True)
+@st.cache_data(show_spinner=False)
+def sample_turma(base: pd.DataFrame, turma_label: str, n_sim: int, add_k: float, n_mult: float, seed: int):
+    turma_label = str(turma_label).strip()
+    m = base["Turma"] == turma_label
+    if not m.any():
+        mc = base["Turma"].str.contains(re.escape(turma_label), case=False, na=False)
+        if not mc.any():
+            return None, None
+        idx = base.index[mc][0]
     else:
+        idx = base.index[m][0]
+    r = base.loc[idx]
+    n0 = int(r["Matriculados"])
+    n = max(1, int(round(n0 * n_mult)))
+    a, rp, dz = int(r["Aprovados"]), int(r["Reprovados"]), int(r["Desistentes"])
+    alpha = np.array([a + add_k, rp + add_k, dz + add_k], dtype=float)
+    rng = np.random.default_rng(seed)
+    P = rng.dirichlet(alpha, size=n_sim)
+    C = np.vstack([rng.multinomial(n, p) for p in P])
+    return C[:, 0] / n, C[:, 2] / n
+# ===================== App =====================
+st.title("Simulação de Monte Carlo — Dirichlet–Multinomial")
+st.caption("O app lê **Dados/levantamentoTurmas.csv**. Ajuste os parâmetros na lateral e simule.")
+base, err = load_dataframe_from_dados()
+if err:
+    st.error(err)
     st.stop()
+with st.expander("Ver dados utilizados (base limpa)", expanded=False):
+    st.dataframe(base)
+sim_df = simulate_dirichlet_multinomial(
+    base=base,
+    n_sim=int(N_SIM),
+    meta_aprov=float(META_APROV),
+    max_evasao=float(MAX_EVASAO),
+    add_k=float(ADD_K),
+    n_mult=float(N_MULT),
+    seed=int(SEED)
 )
+st.subheader("Resultados por turma")
+st.dataframe(sim_df.style.format({
+    "Média_Aprov": "{:.3f}", "P5_Aprov": "{:.3f}", "P50_Aprov": "{:.3f}", "P95_Aprov": "{:.3f}",
+    "Média_Desist": "{:.3f}", "P5_Desist": "{:.3f}", "P50_Desist": "{:.3f}", "P95_Desist": "{:.3f}",
+    "Prob_Meta": "{:.3f}"
+}))
+st.download_button(
+    label="Baixar resultados (CSV)",
+    data=sim_df.to_csv(index=False).encode("utf-8"),
+    file_name="resultados_simulacao.csv",
+    mime="text/csv"
+)
+st.subheader("Probabilidade de bater a meta (ordenado)")
+chart_prob = (
+    alt.Chart(sim_df.sort_values("Prob_Meta", ascending=True))
+    .mark_bar()
+    .encode(
+        x=alt.X("Prob_Meta:Q", title=f"Prob. (aprovação ≥ {META_APROV:.0%} & evasão ≤ {MAX_EVASAO:.0%})"),
+        y=alt.Y("Turma:N", sort="-x", title="Turma"),
+        tooltip=[
+            alt.Tooltip("Turma:N"),
+            alt.Tooltip("Prob_Meta:Q", format=".3f"),
+            alt.Tooltip("Média_Aprov:Q", format=".3f"),
+            alt.Tooltip("Média_Desist:Q", format=".3f"),
+        ],
+    ).properties(height=400)
+)
+st.altair_chart(chart_prob, use_container_width=True)
+st.subheader("Distribuições simuladas (detalhe por turma)")
+col1, col2 = st.columns(2)
+with col1:
+    turma_sel = st.selectbox("Escolha uma turma", options=sim_df["Turma"].tolist(), index=0)
+with col2:
+    st.write(f"Meta de aprovação ≥ **{META_APROV:.0%}** | Evasão ≤ **{MAX_EVASAO:.0%}**")
+    st.write(f"add-k = **{ADD_K}** · n × = **{N_MULT}** · simulações = **{N_SIM}**")
+t_ap, t_dz = sample_turma(base, turma_sel, int(N_SIM), float(ADD_K), float(N_MULT), int(SEED))
+if t_ap is None:
+    st.warning("Turma não encontrada após normalização.")
+else:
+    h_ap = (
+        alt.Chart(pd.DataFrame({"taxa_aprov": t_ap}))
+        .mark_bar()
+        .encode(x=alt.X("taxa_aprov:Q", bin=alt.Bin(maxbins=30), title="Taxa de aprovação"),
+                y=alt.Y("count()", title="Frequência"))
+        .properties(height=300)
+    )
+    linha_meta = alt.Chart(pd.DataFrame({"x": [META_APROV]})).mark_rule(strokeDash=[6,4]).encode(x="x:Q")
+    st.altair_chart(h_ap + linha_meta, use_container_width=True)
+    h_dz = (
+        alt.Chart(pd.DataFrame({"taxa_evasao": t_dz}))
+        .mark_bar()
+        .encode(x=alt.X("taxa_evasao:Q", bin=alt.Bin(maxbins=30), title="Taxa de evasão"),
+                y=alt.Y("count()", title="Frequência"))
+        .properties(height=300)
+    )
+    linha_lim = alt.Chart(pd.DataFrame({"x": [MAX_EVASAO]})).mark_rule(strokeDash=[6,4]).encode(x="x:Q")
+    st.altair_chart(h_dz + linha_lim, use_container_width=True)