Spaces:

ricardoadriano
/

atividade6

Sleeping

App Files Files Community

ricardoadriano committed on Oct 28, 2025

Commit

b2737bb

verified ·

1 Parent(s): 6bbdfd0

Update src/streamlit_app.py

Browse files

Files changed (1) hide show

src/streamlit_app.py +30 -25

src/streamlit_app.py CHANGED Viewed

@@ -5,6 +5,7 @@
 # Objetivo: permitir escolher ALVO e PREDITORES e produzir INFERÊNCIA (item e),
 #           usando Logit (alvo binário) ou OLS (alvo contínuo).
 #           Se o alvo for categórico com >2 classes, permite one-vs-rest.
 # -------------------------------------------------------------------
 import os
 import numpy as np
@@ -58,13 +59,11 @@ def coerce_numeric_series(s: pd.Series) -> pd.Series:
     """Tenta converter strings numéricas para float (lida com vírgula decimal)."""
     if np.issubdtype(s.dtype, np.number):
         return s.astype(float)
-    # troca vírgula decimal por ponto, remove separadores comuns
     tmp = s.astype(str).str.replace(r"[.\s]", "", regex=True).str.replace(",", ".", regex=False)
-    coerced = pd.to_numeric(tmp, errors="coerce")
-    return coerced
 def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
-    """Engenharia minimalista (compatível com o CSV padrão do Kaggle)."""
     out = df.copy()
     # Tenure (dias desde Dt_Customer)
@@ -191,27 +190,39 @@ df_eng = engineer_features(df_raw)
 with st.sidebar:
     st.markdown("**Alvo (variável dependente):**")
     all_cols = df_eng.columns.tolist()
-    # Preferência: 'Response' se existir; senão 1ª binária; senão 1ª numérica
-    default_target = "Response" if "Response" in all_cols else None
-    if default_target is None:
-        for c in all_cols:
-            if is_binary_series(df_eng[c]):
-                default_target = c; break
-    if default_target is None:
         for c in all_cols:
-            if np.issubdtype(df_eng[c].dtype, np.number):
-                default_target = c; break
     target_col = st.selectbox("Alvo (y)", options=all_cols, index=all_cols.index(default_target) if default_target in all_cols else 0)
 # Variáveis explicativas
 exclude = [target_col]
 num_cols_all, cat_cols_all = split_num_cat(df_eng, exclude=exclude)
 with st.sidebar:
     st.markdown("**Variáveis explicativas (X):**")
-    engineered_first = [c for c in ["TenureDays","TotalMnt","TotalPurchases","OnlineShare","PromoShare","AvgTicket","BasketDiversity"] if c in num_cols_all]
-    base_defaults = engineered_first + [c for c in num_cols_all if c not in engineered_first][:5] + cat_cols_all[:3]
-    selected_feats = st.multiselect("Selecione X", options=(num_cols_all + cat_cols_all), default=base_defaults)
     test_size = st.slider("Proporção de teste", 0.1, 0.4, 0.2, 0.05)
     random_state = st.number_input("Random seed", value=42, step=1)
@@ -233,11 +244,9 @@ is_bin = is_binary_series(y_raw)
 # 2) se não binário, tenta numérico (coerção segura)
 y_numeric_try = coerce_numeric_series(y_raw) if not is_bin else None
 is_numeric_ok = False
-if not is_bin:
-    if y_numeric_try is not None:
-        # considera "ok" se pelo menos 80% foram convertidos
-        conv_rate = y_numeric_try.notna().mean()
-        is_numeric_ok = conv_rate >= 0.8
 # 3) se não binário e não numérico, vira categórico multi-classe → one-vs-rest
 with st.sidebar:
@@ -250,20 +259,16 @@ with st.sidebar:
 # ---------- Montagem de y conforme os casos ----------
 if is_bin:
-    # Se binário não-numérico → mapear para {0,1} em ordem alfabética
     if not np.issubdtype(y_raw.dtype, np.number):
         uniq = sorted(pd.unique(y_raw.dropna()).tolist(), key=lambda x: str(x))
         y = y_raw.replace({uniq[0]: 0, uniq[1]: 1}).astype(int)
     else:
         y = y_raw.astype(int)
     model_type = "logit"
 elif is_numeric_ok:
     y = y_numeric_try.astype(float)
     model_type = "ols"
 else:
-    # one-vs-rest
     y = (y_raw == positive_class).astype(int)
     model_type = "logit"

 # Objetivo: permitir escolher ALVO e PREDITORES e produzir INFERÊNCIA (item e),
 #           usando Logit (alvo binário) ou OLS (alvo contínuo).
 #           Se o alvo for categórico com >2 classes, permite one-vs-rest.
+#           Alvo padrão: Response. X padrão alinhado ao Colab.
 # -------------------------------------------------------------------
 import os
 import numpy as np
     """Tenta converter strings numéricas para float (lida com vírgula decimal)."""
     if np.issubdtype(s.dtype, np.number):
         return s.astype(float)
     tmp = s.astype(str).str.replace(r"[.\s]", "", regex=True).str.replace(",", ".", regex=False)
+    return pd.to_numeric(tmp, errors="coerce")
 def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
+    """Engenharia minimalista para o dataset padrão do Kaggle."""
     out = df.copy()
     # Tenure (dias desde Dt_Customer)
 with st.sidebar:
     st.markdown("**Alvo (variável dependente):**")
     all_cols = df_eng.columns.tolist()
+    # Alvo padrão fixo: Response (se existir). Caso contrário, mesma lógica de fallback.
+    if "Response" in all_cols:
+        default_target = "Response"
+    else:
+        default_target = None
         for c in all_cols:
+            if is_binary_series(df_eng[c]): default_target = c; break
+        if default_target is None:
+            for c in all_cols:
+                if np.issubdtype(df_eng[c].dtype, np.number): default_target = c; break
     target_col = st.selectbox("Alvo (y)", options=all_cols, index=all_cols.index(default_target) if default_target in all_cols else 0)
 # Variáveis explicativas
 exclude = [target_col]
 num_cols_all, cat_cols_all = split_num_cat(df_eng, exclude=exclude)
+# X padrão alinhado ao Colab (só incluir se existir na base)
+preferred_defaults = [
+    "Income", "Recency", "Education", "Marital_Status",
+    "TenureDays", "TotalMnt", "TotalPurchases",
+    "OnlineShare", "PromoShare", "AvgTicket", "BasketDiversity",
+    "NumWebVisitsMonth"
+]
+default_X = [c for c in preferred_defaults if c in (num_cols_all + cat_cols_all)]
 with st.sidebar:
     st.markdown("**Variáveis explicativas (X):**")
+    # Se nada dos preferidos existir, cai no fallback antigo (algumas num + categ)
+    if not default_X:
+        engineered_first = [c for c in ["TenureDays","TotalMnt","TotalPurchases","OnlineShare","PromoShare","AvgTicket","BasketDiversity"] if c in num_cols_all]
+        default_X = engineered_first + [c for c in num_cols_all if c not in engineered_first][:5] + cat_cols_all[:3]
+    selected_feats = st.multiselect("Selecione X", options=(num_cols_all + cat_cols_all), default=default_X)
     test_size = st.slider("Proporção de teste", 0.1, 0.4, 0.2, 0.05)
     random_state = st.number_input("Random seed", value=42, step=1)
 # 2) se não binário, tenta numérico (coerção segura)
 y_numeric_try = coerce_numeric_series(y_raw) if not is_bin else None
 is_numeric_ok = False
+if not is_bin and y_numeric_try is not None:
+    conv_rate = y_numeric_try.notna().mean()
+    is_numeric_ok = conv_rate >= 0.8
 # 3) se não binário e não numérico, vira categórico multi-classe → one-vs-rest
 with st.sidebar:
 # ---------- Montagem de y conforme os casos ----------
 if is_bin:
     if not np.issubdtype(y_raw.dtype, np.number):
         uniq = sorted(pd.unique(y_raw.dropna()).tolist(), key=lambda x: str(x))
         y = y_raw.replace({uniq[0]: 0, uniq[1]: 1}).astype(int)
     else:
         y = y_raw.astype(int)
     model_type = "logit"
 elif is_numeric_ok:
     y = y_numeric_try.astype(float)
     model_type = "ols"
 else:
     y = (y_raw == positive_class).astype(int)
     model_type = "logit"