Spaces:

AEDIUnB
/

Teste_Hipoteses

Sleeping

App Files Community

joaogabrielsouza committed on Apr 29, 2025

Commit

df6c009

1 Parent(s): 4f9dcf1

Atualiza menu de mapas e gráficos interativos

Browse files

Files changed (1) hide show

Dashboard_Teste_de_Hipoteses_online.py +93 -67

Dashboard_Teste_de_Hipoteses_online.py CHANGED Viewed

@@ -71,10 +71,10 @@ tabs = st.tabs(["Simulações Teóricas", "Análise de Chocolate"])
 with tabs[0]:
     st.subheader("Teste de Hipótese para Proporção de Testes Positivos de COVID-19")
     st.sidebar.markdown("### Parâmetros do Teste")
-    p_pop    = st.sidebar.slider("Proporção populacional (H0)", 0.0, 1.0, 0.1, 0.01)
-    p_sample = st.sidebar.slider("Proporção amostral",            0.0, 1.0, 0.12,0.01)
-    n        = st.sidebar.slider("Tamanho da amostra",          100, 10000, 1000, 10)
-    alpha    = st.sidebar.slider("Nível de significância (α)",  0.01, 0.10, 0.05, 0.01)
     se      = np.sqrt(p_pop*(1-p_pop)/n)
     z       = (p_sample - p_pop)/se
@@ -82,7 +82,7 @@ with tabs[0]:
     st.write(f"**Z** = {z:.4f}")
     st.write(f"**p-valor** = {p_value:.4f}")
-    if p_value < alpha:
         st.write("**Rejeitamos H0**: diferença significativa.")
     else:
         st.write("**Não rejeitamos H0**: sem diferença significativa.")
@@ -93,7 +93,7 @@ with tabs[0]:
     fig = go.Figure()
     fig.add_trace(go.Scatter(x=x, y=y, mode="lines", line=dict(color="blue")))
     fig.add_vline(x=p_sample, line=dict(color="red", width=2))
-    zc = stats.norm.ppf(1-alpha/2)
     fig.add_vrect(x0=p_pop-zc*se, x1=p_pop+zc*se, fillcolor="red", opacity=0.2, line_width=0)
     fig.update_layout(
         title="Distribuição Normal e Região Crítica",
@@ -104,78 +104,86 @@ with tabs[0]:
     st.markdown("**Descrição**: Teste bilateral de proporções usando valor-p.")
-# Aba 2: Análise de Chocolate
 with tabs[1]:
     st.subheader("Testes de Hipótese e ANOVA em Dados de Chocolate")
     st.markdown("---")
-    # 1) Carregar dados
-    df = pd.read_csv("Dados/flavors_of_cacao.csv")
-    st.markdown("#### Dados Brutos")
-    st.dataframe(df)
-    # 2) Renomear colunas
     df.columns = [
         "company","bean_bar_origin","ref","date",
         "percent","location","rating","beantype","origin"
     ]
-    st.markdown("#### Colunas Renomeadas")
-    st.dataframe(df.head())
-    # 3) Mapa: empresas por país
-    grp_loc = df.groupby("location").size().reset_index(name="count")
-    url = "https://raw.githubusercontent.com/datasets/geo-countries/master/data/countries.geojson"
-    world = gpd.read_file(url).rename(columns={"ADMIN":"name"})
-    mapa = world.merge(grp_loc, how="left", left_on="name", right_on="location")
-    fig_map, ax_map = plt.subplots(figsize=(8,4))
-    mapa.boundary.plot(ax=ax_map, edgecolor="gray", linewidth=0.5)
-    mapa.plot(column="count", ax=ax_map, cmap="OrRd", legend=True,
-              missing_kwds={"color":"lightgrey"})
-    ax_map.axis("off")
-    st.pyplot(fig_map)
-    # 4) Scatter: média por origem (count>=5)
-    grp_ori = (
-        df.groupby("origin")
-          .agg(count=("rating","size"), mean_rating=("rating","mean"))
-          .reset_index()
-    )
-    fil_ori = grp_ori.query("count>=5")
-    fig_sc, ax_sc = plt.subplots()
-    fig_sc = px.scatter(
-        fil_ori, x="mean_rating", y="origin",
-        size="count", color="origin",
-        title="Avaliação Média vs Origem",
-        labels={"mean_rating":"Avaliação Média","origin":"Origem"}
-    )
-    st.plotly_chart(fig_sc, use_container_width=True)
-    # 5) WordCloud de empresas
-    wc = WordCloud(width=800, height=300, background_color="white")
-    freqs = df["company"].value_counts().to_dict()
-    wc_img = wc.generate_from_frequencies(freqs)
-    fig_wc, ax_wc = plt.subplots(figsize=(8,3))
-    ax_wc.imshow(wc_img, interpolation="bilinear")
-    ax_wc.axis("off")
-    st.pyplot(fig_wc)
-    # 6) ANOVA e Tukey
     paises = ["Brazil","France","U.S.A.","Canada","Ecuador","Peru","Venezuela"]
     filt = df[df.location.isin(paises)]
     melt = pd.melt(filt, id_vars=["location"], value_vars=["rating"])
     model = ols("value ~ C(location)", data=melt).fit()
     anova = sm.stats.anova_lm(model, typ=2)
     st.markdown("#### ANOVA")
     st.dataframe(anova)
-    tukey = pairwise_tukeyhsd(endog=melt.value, groups=melt.location, alpha=0.05)
-    st.markdown("#### Tukey HSD")
     st.text(tukey.summary())
-    # 7) Diagnóstico de resíduos
     std_res = model.get_influence().resid_studentized_internal
     fig_qq = sm.qqplot(std_res, line="45", fit=True)
-    plt.title("QQ-plot Resíduos Padronizados")
     st.pyplot(fig_qq)
     fig_h, ax_h = plt.subplots()
@@ -184,16 +192,34 @@ with tabs[1]:
     ax_h.set_xlabel("Resíduos"); ax_h.set_ylabel("Frequência")
     st.pyplot(fig_h)
-    # 8) Testes de premissas
-    st.markdown("#### Testes de Premissas")
-    w, p_sh = shapiro(model.resid)
-    st.write(f"Shapiro-Wilk: estatística={w:.3f}, p-valor={p_sh:.3f}")
     grupos = [filt.query("location==@loc").rating for loc in paises]
     w_lev, p_lev = levene(*grupos)
-    st.write(f"Levene: estatística={w_lev:.3f}, p-valor={p_lev:.3f}")
     kw_stat, kw_p = kruskal(*grupos)
-    st.write(f"Kruskal-Wallis: estatística={kw_stat:.3f}, p-valor={kw_p:.3f}")

 with tabs[0]:
     st.subheader("Teste de Hipótese para Proporção de Testes Positivos de COVID-19")
     st.sidebar.markdown("### Parâmetros do Teste")
+    p_pop      = st.sidebar.slider("Proporção populacional (H0)", 0.0, 1.0, 0.1, 0.01, key="p_pop")
+    p_sample   = st.sidebar.slider("Proporção amostral",            0.0, 1.0, 0.12,0.01, key="p_sample")
+    n          = st.sidebar.slider("Tamanho da amostra",          100, 10000, 1000, 10, key="n_sample")
+    alpha_prop = st.sidebar.slider("Nível de significância (α)",  0.01,  0.10,  0.05, 0.01, key="alpha_prop")
     se      = np.sqrt(p_pop*(1-p_pop)/n)
     z       = (p_sample - p_pop)/se
     st.write(f"**Z** = {z:.4f}")
     st.write(f"**p-valor** = {p_value:.4f}")
+    if p_value < alpha_prop:
         st.write("**Rejeitamos H0**: diferença significativa.")
     else:
         st.write("**Não rejeitamos H0**: sem diferença significativa.")
     fig = go.Figure()
     fig.add_trace(go.Scatter(x=x, y=y, mode="lines", line=dict(color="blue")))
     fig.add_vline(x=p_sample, line=dict(color="red", width=2))
+    zc = stats.norm.ppf(1-alpha_prop/2)
     fig.add_vrect(x0=p_pop-zc*se, x1=p_pop+zc*se, fillcolor="red", opacity=0.2, line_width=0)
     fig.update_layout(
         title="Distribuição Normal e Região Crítica",
     st.markdown("**Descrição**: Teste bilateral de proporções usando valor-p.")
+# Aba 2: Análise de Chocolate (atualizada)
 with tabs[1]:
     st.subheader("Testes de Hipótese e ANOVA em Dados de Chocolate")
     st.markdown("---")
+    # parâmetro interativo de significância para chocolate
+    alpha_choc = st.sidebar.slider("Nível de significância (α) - ANOVA Chocolate", 0.01, 0.10, 0.05, 0.01, key="alpha_choc")
+    # 1) Carrega e renomeia dados
+    df = pd.read_csv("Dados/flavors_of_cacao.csv")
     df.columns = [
         "company","bean_bar_origin","ref","date",
         "percent","location","rating","beantype","origin"
     ]
+    # Filtra países
     paises = ["Brazil","France","U.S.A.","Canada","Ecuador","Peru","Venezuela"]
     filt = df[df.location.isin(paises)]
+    # 2) Mapas Interativos — menu de seleção
+    st.markdown("#### Mapas Interativos")
+    map_type = st.selectbox(
+        "Selecione o mapa",
+        ["Empresas de Chocolate por País", "Plantações de Cacau por País"],
+        key="map_selector"
+    )
+    if map_type == "Empresas de Chocolate por País":
+        grp = df.groupby("location").size().reset_index(name="value")
+        locations = "location"
+        color_scale = "OrRd"
+        title = "Número de Empresas de Chocolate por País"
+        label = "# Empresas"
+    else:
+        grp = df.groupby("origin").size().reset_index(name="value")
+        locations = "origin"
+        color_scale = "Greens"
+        title = "Número de Plantações de Cacau por País"
+        label = "# Plantações"
+    fig_map = px.choropleth(
+        grp,
+        locations=locations,
+        locationmode="country names",
+        color="value",
+        color_continuous_scale=color_scale,
+        title=title,
+        labels={"value": label}
+    )
+    st.plotly_chart(fig_map, use_container_width=True)
+    # 3) Gráfico interativo de distribuição
+    plot_type = st.selectbox(
+        "Tipo de gráfico de distribuição",
+        ["Boxplot","Violin","Histograma"]
+    )
+    if plot_type == "Boxplot":
+        fig1 = px.box(filt, x="location", y="rating", title="Boxplot de Ratings")
+    elif plot_type == "Violin":
+        fig1 = px.violin(filt, x="location", y="rating", box=True, points="all", title="Violin Plot de Ratings")
+    else:
+        fig1 = px.histogram(filt, x="rating", color="location", barmode="overlay", title="Histograma de Ratings")
+    st.plotly_chart(fig1, use_container_width=True)
+    # 3) ANOVA + Tukey
     melt = pd.melt(filt, id_vars=["location"], value_vars=["rating"])
     model = ols("value ~ C(location)", data=melt).fit()
     anova = sm.stats.anova_lm(model, typ=2)
     st.markdown("#### ANOVA")
     st.dataframe(anova)
+    tukey = pairwise_tukeyhsd(endog=melt.value, groups=melt.location, alpha=alpha_choc)
+    st.markdown("#### Pós-hoc Tukey HSD")
     st.text(tukey.summary())
+    # 4) Diagnóstico de resíduos
+    st.markdown("#### Diagnóstico de Resíduos")
     std_res = model.get_influence().resid_studentized_internal
     fig_qq = sm.qqplot(std_res, line="45", fit=True)
+    plt.title("QQ-plot dos Resíduos Padronizados")
     st.pyplot(fig_qq)
     fig_h, ax_h = plt.subplots()
     ax_h.set_xlabel("Resíduos"); ax_h.set_ylabel("Frequência")
     st.pyplot(fig_h)
+    # Normalidade dos resíduos: Kolmogorov–Smirnov
+    ks_stat, ks_p = stats.kstest((model.resid - model.resid.mean())/model.resid.std(), "norm")
+    st.write(f"Kolmogorov–Smirnov (resíduos): estatística={ks_stat:.3f}, p-valor={ks_p:.3f}")
+    st.write("➡️ " + ("resíduos normais" if ks_p >= alpha_choc else "violação de normalidade"))
+    # 5) Homocedasticidade (Levene)
+    st.markdown("#### Homocedasticidade (Levene)")
     grupos = [filt.query("location==@loc").rating for loc in paises]
     w_lev, p_lev = levene(*grupos)
+    st.write(f"Levene: W={w_lev:.3f}, p-valor={p_lev:.3f}")
+    st.write("➡️ " + ("variâncias iguais" if p_lev >= alpha_choc else "variâncias diferentes"))
+    # 6) Testes Não Paramétricos Par a Par (Wilcoxon)
+    st.markdown("#### Testes Não Paramétricos Par a Par (Wilcoxon)")
+    col1, col2 = st.columns(2)
+    with col1:
+        g1 = st.selectbox("Grupo 1", paises, index=0)
+    with col2:
+        g2 = st.selectbox("Grupo 2", paises, index=1)
+    d1 = filt[filt.location==g1].rating.dropna().values
+    d2 = filt[filt.location==g2].rating.dropna().values
+    m = min(len(d1), len(d2))
+    w_stat, w_p = wilcoxon(d1[:m], d2[:m])
+    st.write(f"Wilcoxon signed-rank: W={w_stat:.3f}, p-valor={w_p:.3f}")
+    st.write("➡️ " + ("diferença significativa" if w_p < alpha_choc else "sem diferença significativa"))
+    # 7) Teste Kruskal–Wallis (todas as comparações)
+    st.markdown("#### Teste Kruskal–Wallis (todas as comparações)")
     kw_stat, kw_p = kruskal(*grupos)
+    st.write(f"Kruskal–Wallis: estatística={kw_stat:.3f}, p-valor={kw_p:.3f}")
+    st.write("➡️ " + ("distribuições diferentes" if kw_p < alpha_choc else "mesma distribuição"))