Atividade5_1

Sleeping

App Files Files Community

ricardoadriano committed on Oct 23, 2025

Commit

5189ebb

verified ·

1 Parent(s): 7d4c7e1

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +205 -181

src/streamlit_app.py CHANGED Viewed

@@ -1,196 +1,220 @@
 #!/usr/bin/env python
 # coding: utf-8
-# =====================================================
-# Dashboard - AmesHousing (Tarefa 4) - versão compacta
-# =====================================================
-import streamlit as st
-import pandas as pd
-import matplotlib.pyplot as plt
-import seaborn as sns
-import plotly.express as px
 import numpy as np
-from scipy.stats import shapiro, levene, kruskal
-from statsmodels.formula.api import ols
-import statsmodels.api as sm
-from statsmodels.stats.diagnostic import het_breuschpagan
-from statsmodels.stats.outliers_influence import variance_inflation_factor
 from sklearn.model_selection import train_test_split
-from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
-# -----------------------------------------------------
-# Configuração da Página
-# -----------------------------------------------------
-st.set_page_config(
-    page_title="Dashboard - AmesHousing (Tarefa 4)",
-    layout="wide",
-    initial_sidebar_state="expanded"
-)
-st.query_params.clear()
-st.markdown("<h1 style='text-align:center;color:#003366;'>Análise do Dataset AmesHousing</h1>", unsafe_allow_html=True)
-st.markdown("---")
-# -----------------------------------------------------
-# Leitura do CSV
-# -----------------------------------------------------
 @st.cache_data
-def carregar_dados():  # Return the first CSV from paths_tentativa that pandas can read.
-    paths_tentativa = [  # candidate locations, tried in list order
-        "AmesHousing.csv",
-        "/mnt/data/AmesHousing.csv",
-        "../Dados/AmesHousing.csv",
     ]
-    for p in paths_tentativa:
         try:
             df = pd.read_csv(p)
-            return df
-        except:  # NOTE(review): bare except hides every failure type, not just a missing file
             continue
-    return pd.DataFrame()  # nothing loaded: the caller receives an empty DataFrame
-casa_data = carregar_dados()
-casa_data.columns = casa_data.columns.str.strip().str.replace(" ", "_")
 # -----------------------------
-# Sidebar — Filtros + Seleção de Variáveis (Regressão)
 # -----------------------------
-st.sidebar.markdown("### Filtros AmesHousing")
-bairros = st.sidebar.multiselect(
-    "Selecione bairros",
-    options=sorted(casa_data["Neighborhood"].dropna().unique()),
-    default=None
-)
-dados_filtrados = casa_data.copy()
-if bairros:
-    dados_filtrados = dados_filtrados[dados_filtrados["Neighborhood"].isin(bairros)]
-st.sidebar.markdown("---")
-st.sidebar.subheader("Regressão Linear — Tarefa 4 (PPCA/UnB)")
-st.sidebar.markdown("Selecione variáveis para modelagem do **SalePrice** (alvo).")
-candidatos_numericas = [
-    c for c in [
-        'Bedroom_AbvGr','Full_Bath','Half_Bath','TotRms_AbvGrd','Gr_Liv_Area',
-        'Garage_Cars','Garage_Area','Overall_Qual','Overall_Cond','Year_Built','Lot_Area',
-        'Fireplaces'
-    ] if c in dados_filtrados.columns
-]
-candidatos_categoricas = [c for c in ['Neighborhood','House_Style','Bldg_Type','Garage_Type','Kitchen_Qual'] if c in dados_filtrados.columns]
-feats_num = st.sidebar.multiselect("Numéricas", options=candidatos_numericas, default=['Bedroom_AbvGr','Garage_Cars','Gr_Liv_Area'] if 'Garage_Cars' in candidatos_numericas else candidatos_numericas[:2])
-feats_cat = st.sidebar.multiselect("Categóricas", options=candidatos_categoricas, default=['Neighborhood'] if 'Neighborhood' in candidatos_categoricas else [])
-interagir = st.sidebar.checkbox("Adicionar interação entre duas variáveis", value=False)
-inter_1 = inter_2 = None
-if interagir:
-    inter_1 = st.sidebar.selectbox("Variável 1 (para interação)", options=feats_num + feats_cat, index=0 if feats_num else 0)
-    inter_2 = st.sidebar.selectbox("Variável 2 (para interação)", options=[v for v in feats_num + feats_cat if v != inter_1], index=0)
-usar_logy = st.sidebar.checkbox("Aplicar transformação log(SalePrice) caso pressupostos sejam violados", value=False)
-teste_size = st.sidebar.slider("Proporção de teste (holdout)", 0.1, 0.5, 0.2, 0.05)
-alpha_reg = st.sidebar.slider("Nível de significância (α) — Regressão", 0.01, 0.10, 0.05, 0.01)
-# >>> botão agora na sidebar <<<
-ajustar = st.sidebar.button("Ajustar modelo")
-# -------------------------------------------------
-# Conteúdo principal — Análises
-# -------------------------------------------------
-# Distribuição de Preço de Venda
-st.subheader("Distribuição do Preço de Venda")
-if not dados_filtrados.empty:
-    fig, ax = plt.subplots(figsize=(5,3.5))
-    sns.histplot(dados_filtrados['SalePrice'], kde=True, ax=ax)
-    st.pyplot(fig, clear_figure=True, use_container_width=False)
-# Boxplots
-st.subheader("Boxplots das Variáveis Selecionadas")
-variavel = st.selectbox(
-    "Escolha a variável categórica para comparar preços:",
-    ["Neighborhood","Garage_Type","Fireplaces"]
-)
-if not dados_filtrados.empty and len(dados_filtrados[variavel].dropna().unique()) > 1:
-    fig2, ax2 = plt.subplots(figsize=(5,3.5))
-    sns.boxplot(x=variavel, y="SalePrice", data=dados_filtrados, ax=ax2)
-    plt.xticks(rotation=60)
-    st.pyplot(fig2, clear_figure=True, use_container_width=False)
-# Scatter interativo
-st.subheader("Preço Médio de Venda por Bairro")
-if not dados_filtrados.empty:
-    bairro_grouped = dados_filtrados.groupby('Neighborhood').agg(
-        count=('SalePrice','size'),
-        mean_price=('SalePrice','mean')
-    ).reset_index()
-    bairro_filtered = bairro_grouped[bairro_grouped['count'] >= 5]
-    if not bairro_filtered.empty:
-        fig3 = px.scatter(
-            bairro_filtered,
-            x='mean_price', y='Neighborhood',
-            size='count', color='Neighborhood',
-            labels={'mean_price': 'Preço Médio de Venda', 'Neighborhood':'Bairro'}
-        )
-        fig3.update_layout(width=550, height=320)
-        st.plotly_chart(fig3, use_container_width=False)
-# =================================================
-# Regressão Linear — Tarefa 4
-# =================================================
-def construir_formula(y, feats_num, feats_cat, inter_1=None, inter_2=None):  # Build the formula string "y ~ terms" that is later passed to ols().
-    termos = feats_num + [f"C({c})" for c in feats_cat]  # categorical names are wrapped as C(name)
-    if inter_1 and inter_2:  # interaction term only when both variables are given
-        a = f"C({inter_1})" if inter_1 in feats_cat else inter_1
-        b = f"C({inter_2})" if inter_2 in feats_cat else inter_2
-        termos.append(f"{a}:{b}")
-    rhs = " + ".join(termos) if termos else "1"  # no terms at all: right-hand side is just "1"
-    return f"{y} ~ {rhs}"
-if ajustar:   # botão na sidebar
-    cols_necessarias = ['SalePrice'] + feats_num + feats_cat
-    if interagir and inter_1 and inter_2:
-        cols_necessarias += [inter_1, inter_2]
-    df_modelo = dados_filtrados[cols_necessarias].dropna().copy()
-    if not df_modelo.empty:
-        y_col = 'SalePrice'
-        if usar_logy:
-            df_modelo['SalePrice'] = np.log(df_modelo['SalePrice'])
-        formula = construir_formula(y_col, feats_num, feats_cat, inter_1 if interagir else None, inter_2 if interagir else None)
-        df_treino, df_teste = train_test_split(df_modelo, test_size=teste_size, random_state=42)
-        model = ols(formula, data=df_treino).fit()
-        st.code(formula)
-        st.dataframe(model.summary2().tables[1], use_container_width=False, height=200)
-        y_true, y_pred = df_teste['SalePrice'], model.predict(df_teste)
-        if usar_logy:
-            y_true, y_pred = np.exp(y_true), np.exp(y_pred)
-        metrics = pd.DataFrame({
-            'Métrica': ['R²','RMSE','MAE'],
-            'Valor': [r2_score(y_true, y_pred),
-                      mean_squared_error(y_true, y_pred, squared=False),
-                      mean_absolute_error(y_true, y_pred)]
-        })
-        st.dataframe(metrics, use_container_width=False, height=120)
-        # Gráficos diagnósticos compactos
-        residuos, fitted = model.resid, model.fittedvalues
-        cols = st.columns(3)
-        with cols[0]:
-            fig_r, ax_r = plt.subplots(figsize=(3,3))
-            ax_r.scatter(fitted, residuos, alpha=0.5)
-            ax_r.axhline(0, color='red', linestyle='--')
-            st.pyplot(fig_r, clear_figure=True, use_container_width=False)
-        with cols[1]:
-            fig_q, ax_q = plt.subplots(figsize=(3,3))
-            sm.qqplot(residuos, line='45', fit=True, ax=ax_q)
-            st.pyplot(fig_q, clear_figure=True, use_container_width=False)
-        with cols[2]:
-            fig_h, ax_h = plt.subplots(figsize=(3,3))
-            sns.histplot(residuos, kde=True, ax=ax_h)
-            st.pyplot(fig_h, clear_figure=True, use_container_width=False)

 #!/usr/bin/env python
 # coding: utf-8
+"""
+Streamlit – Churn (Logistic Regression) for Hugging Face Spaces
+---------------------------------------------------------------
+- Loads "Dados/Churn_Modelling.csv"
+- Lets the user choose features (numeric / categorical)
+- Trains Logistic Regression
+- Shows coefficients, odds ratios, and quick interpretations
+- Provides an interactive control panel to simulate a customer's probability of churn
+Obs.: Esta versão atende ao item (a) da Tarefa 5: modelagem com Regressão Logística
+e interpretação dos coeficientes/odds ratio.
+"""
+import os
 import numpy as np
+import pandas as pd
+import streamlit as st
 from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import OneHotEncoder, StandardScaler
+from sklearn.compose import ColumnTransformer
+from sklearn.linear_model import LogisticRegression
+from sklearn.pipeline import Pipeline
+# -----------------------------
+# Page config
+# -----------------------------
+st.set_page_config(page_title="Churn – Regressão Logística (PPCA/UnB)", layout="wide", initial_sidebar_state="expanded")
+st.title("Churn – Regressão Logística (PPCA/UnB)")
+st.caption("Item (a) – Modelagem da Retenção de Clientes e interpretação de coeficientes/odds ratio.")
+# -----------------------------
+# Data loader (cache)
+# -----------------------------
 @st.cache_data
+def load_data():  # Return (DataFrame, path) for the first readable CSV, or (empty DataFrame, last error text).
+    tried = [  # candidate locations, tried in list order
+        "Dados/Churn_Modelling.csv",
+        "./Dados/Churn_Modelling.csv",
+        "/mnt/data/Dados/Churn_Modelling.csv",
+        "Churn_Modelling.csv",
+        "./Churn_Modelling.csv"
     ]
+    last_err = None
+    for p in tried:
         try:
             df = pd.read_csv(p)
+            return df, p
+        except Exception as e:
+            last_err = e  # remembered so the final failure can be reported
             continue
+    return pd.DataFrame(), str(last_err)  # every path failed
+df, data_info = load_data()  # data_info is the loaded path on success, otherwise the last error text
+if df.empty:  # NOTE(review): the error text held in data_info is not shown to the user on this path
+    st.error("Não foi possível carregar o arquivo **Churn_Modelling.csv**. "
+             "Certifique-se de que ele está em `Dados/Churn_Modelling.csv` dentro do Space.")
+    st.stop()
+st.success(f"Dataset carregado de: `{data_info}`")
+# Normalize column names
+df.columns = [c.strip() for c in df.columns]
+# -----------------------------
+# Target and candidate features (dataset padrão do Kaggle)
+# -----------------------------
+TARGET = "Exited"  # 1 = left, 0 = stayed
+candidates_num = [c for c in ["CreditScore","Age","Tenure","Balance","NumOfProducts","HasCrCard","IsActiveMember","EstimatedSalary"] if c in df.columns]  # only names actually present in df
+candidates_cat = [c for c in ["Geography","Gender"] if c in df.columns]  # only names actually present in df
+# Sidebar: feature selection & model hyperparams
+st.sidebar.header("Configuração do Modelo")
+use_num = st.sidebar.multiselect("Variáveis numéricas", options=candidates_num, default=[c for c in ["Age","Balance","NumOfProducts","IsActiveMember"] if c in candidates_num])
+use_cat = st.sidebar.multiselect("Variáveis categóricas", options=candidates_cat, default=[c for c in ["Geography","Gender"] if c in candidates_cat])
+test_size = st.sidebar.slider("Proporção de teste", 0.1, 0.4, 0.2, 0.05)
+reg_strength = st.sidebar.slider("Força de regularização (C)", 0.05, 5.0, 1.0, 0.05)  # passed to build_pipeline as C
+class_balanced = st.sidebar.checkbox("Class weight = 'balanced' (útil se desbalanceado)", value=True)
+max_iter = st.sidebar.slider("Max iter", 200, 2000, 1000, 100)
+train_btn = st.sidebar.button("Treinar modelo")  # gates the whole training section further down
 # -----------------------------
+# Quick EDA block (compact)
 # -----------------------------
+st.subheader("Visão rápida do conjunto de dados")
+col_a, col_b = st.columns([2,1])
+with col_a:
+    st.dataframe(df.sample(min(10, len(df))), use_container_width=True)  # at most 10 sampled rows; no seed is passed to sample()
+with col_b:
+    if TARGET in df.columns:
+        n1 = int(df[TARGET].sum())  # count of 1s, assuming TARGET is coded 0/1
+        n0 = int((1 - df[TARGET]).sum())  # count of 0s under the same 0/1 assumption
+        st.metric("Clientes que saíram (1)", n1)
+        st.metric("Clientes que ficaram (0)", n0)
+# -----------------------------
+# Training
+# -----------------------------
+def build_pipeline(num_cols, cat_cols, C=1.0, class_weight=None, max_iter=1000):  # Build an unfitted Pipeline: column preprocessing ("prep") followed by logistic regression ("clf").
+    preprocess = ColumnTransformer(
+        transformers=[
+            ("num", StandardScaler(with_mean=True, with_std=True), num_cols),  # numeric columns are standardized
+            ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), cat_cols),  # NOTE(review): drop="first" together with handle_unknown="ignore" may be rejected by older scikit-learn releases - confirm the pinned version
+        ],
+        remainder="drop"  # columns not listed in num_cols/cat_cols are discarded
+    )
+    lr = LogisticRegression(C=C, penalty="l2", solver="lbfgs", max_iter=max_iter, class_weight=class_weight, n_jobs=None)
+    pipe = Pipeline(steps=[("prep", preprocess), ("clf", lr)])  # step names are looked up again after fitting
+    return pipe
+def get_feature_names(preprocess, num_cols, cat_cols):  # List output feature names: num_cols first, then the one-hot names of cat_cols.
+    names = []
+    if num_cols:
+        names.extend(num_cols)  # numeric names are reused unchanged
+    if cat_cols:
+        ohe = preprocess.named_transformers_["cat"]  # the encoder registered under "cat"; presumably requires a fitted ColumnTransformer - confirm
+        cat_names = ohe.get_feature_names_out(cat_cols).tolist()
+        names.extend(cat_names)
+    return names
+if train_btn:  # NOTE(review): the simulation widgets below live inside this branch; if changing one reruns the script with the button reset, the panel would vanish - confirm against Streamlit button semantics
+    if not use_num and not use_cat:
+        st.warning("Selecione pelo menos **uma** variável explicativa (numérica ou categórica).")
+        st.stop()
+    cols_needed = [TARGET] + use_num + use_cat
+    df_model = df[cols_needed].dropna().copy()  # rows with a missing value in any needed column are dropped
+    X = df_model[use_num + use_cat]
+    y = df_model[TARGET]
+    cw = "balanced" if class_balanced else None
+    pipe = build_pipeline(use_num, use_cat, C=reg_strength, class_weight=cw, max_iter=max_iter)
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)  # fixed seed, split stratified on the target
+    pipe.fit(X_train, y_train)
+    # -------------------------
+    # Coefficients & Odds Ratios
+    # -------------------------
+    lr = pipe.named_steps["clf"]
+    preprocess = pipe.named_steps["prep"]
+    feat_names = get_feature_names(preprocess, use_num, use_cat)
+    coefs = lr.coef_.ravel()
+    odds = np.exp(coefs)  # NOTE(review): numeric inputs pass through StandardScaler in build_pipeline, so these ratios appear to be per scaled unit rather than per raw unit - confirm before interpreting
+    coef_table = pd.DataFrame({
+        "Variável": feat_names,
+        "Coeficiente (β)": coefs,
+        "Odds Ratio (e^β)": odds
+    }).sort_values(by="Odds Ratio (e^β)", ascending=False)
+    st.subheader("Coeficientes e Odds Ratio")
+    st.write(
+        "Interpretação: mantendo as demais variáveis constantes, um aumento de uma unidade na variável (ou mudança para a categoria indicada) "
+        "multiplica as *odds* de churn por `e^β`. Se `e^β > 1`, o risco de churn aumenta; se `< 1`, diminui."
+    )
+    st.dataframe(coef_table, use_container_width=True, height=380)
+    # Simple accuracy (quick reference for item a)
+    acc = pipe.score(X_test, y_test)
+    st.info(f"**Acurácia (holdout)**: {acc:.3f}  |  Amostras de treino: {len(X_train)}  |  Amostras de teste: {len(X_test)}")
+    # -------------------------
+    # Interactive prediction
+    # -------------------------
+    st.subheader("Simulação: probabilidade de churn para um perfil de cliente")
+    with st.expander("Abrir painel de controle do cliente", expanded=True):
+        # Build controls dynamically from current selections
+        inputs = {}
+        cols = st.columns(2)
+        # Numeric controls
+        for i, col in enumerate(use_num):
+            with cols[i % 2]:  # alternate between the two columns
+                vmin = float(np.nanmin(df[col])) if np.isfinite(df[col]).all() else 0.0  # bounds are read from the full df, not df_model
+                vmax = float(np.nanmax(df[col])) if np.isfinite(df[col]).all() else 1.0
+                vmean = float(np.nanmean(df[col])) if np.isfinite(df[col]).all() else (vmin + vmax)/2.0
+                step = (vmax - vmin) / 100.0 if vmax > vmin else 1.0  # constant column falls back to a step of 1.0
+                inputs[col] = st.number_input(f"{col}", value=round(vmean, 2), step=step, min_value=vmin, max_value=vmax, format="%.2f")
+        # Categorical controls
+        for i, col in enumerate(use_cat):
+            with cols[i % 2]:
+                opts = sorted([o for o in df[col].dropna().unique().tolist()])
+                default_idx = 0 if opts else None
+                inputs[col] = st.selectbox(f"{col}", options=opts, index=default_idx if default_idx is not None else 0)
+        # Compose a single-row DataFrame
+        if inputs:
+            row = pd.DataFrame([inputs])
+            proba = float(pipe.predict_proba(row)[0, 1])  # first row, second probability column
+            st.metric("Probabilidade de churn (sair do banco)", f"{proba:.1%}")
+            st.caption("Dica: ajuste os controles e observe como a probabilidade muda.")
+    # -------------------------
+    # Textual help / interpretation
+    # -------------------------
+    st.subheader("Como interpretar os coeficientes")
+    st.markdown("""
+- **Sinal de β**: positivo ⇒ aumenta as *odds* de churn; negativo ⇒ reduz.
+- **Magnitude**: valores maiores em módulo indicam maior impacto, dado o mesmo escalonamento.
+- **Odds Ratio `e^β`**: fator multiplicativo nas *odds*. Ex.: `e^β = 1.30` ⇒ as *odds* aumentam **30%**.
+- Em variáveis **categóricas**, o β refere-se à **categoria de referência vs. a categoria exibida** (depois do one-hot com `drop='first'`).
+    """)
+else:
+    st.info("Selecione as variáveis na barra lateral e clique em **Treinar modelo** para começar.")
+# -----------------------------
+# Footer
+# -----------------------------
+st.markdown("---")
+st.caption("PPCA/UnB • Tarefa 5 – Item (a) • Regressão Logística + Odds Ratio • Feito para rodar em Hugging Face Spaces (Streamlit).")