Spaces:
Sleeping
Sleeping
Commit
·
ba2cd93
1
Parent(s):
6e83e2f
Clarify several pages
Browse files
- fields/investing_flat_fields.py +7 -0
- page_attitudes.py +1 -1
- page_home.py +9 -8
- page_personas.py +67 -44
- page_shopping.py +12 -11
- page_tests.py +24 -16
fields/investing_flat_fields.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Question texts (Traditional Chinese) of the investing-related survey items.
# NOTE(review): presumably used verbatim as DataFrame column names, the way
# likert_flat_fields is used elsewhere -- confirm before editing any string.
investing_flat_fields = [
    "你/妳會對金錢感到焦慮嗎?",  # Do you feel anxious about money?
    "你/妳會對金錢很節儉嗎?",  # Are you frugal with money?
    "你/妳會經常存錢嗎?",  # Do you save money regularly?
    "你/妳對自己的財務知識滿意嗎?",  # Are you satisfied with your financial knowledge?
    "你/妳投資會考慮環保嗎?",  # Do you consider the environment when investing?
]
|
page_attitudes.py
CHANGED
|
@@ -10,7 +10,7 @@ from fields.translation_mapping import translation_mapping
|
|
| 10 |
@st.cache_data
|
| 11 |
def show(df):
|
| 12 |
st.title("Student Attitudes (Overall)")
|
| 13 |
-
st.write("Student
|
| 14 |
|
| 15 |
# Chinese font
|
| 16 |
chinese_font = FontProperties(fname='mingliu.ttf')
|
|
|
|
| 10 |
@st.cache_data
|
| 11 |
def show(df):
|
| 12 |
st.title("Student Attitudes (Overall)")
|
| 13 |
+
st.write("Student attitudes across all likert fields without clustering")
|
| 14 |
|
| 15 |
# Chinese font
|
| 16 |
chinese_font = FontProperties(fname='mingliu.ttf')
|
page_home.py
CHANGED
|
@@ -3,15 +3,16 @@ import streamlit as st
|
|
| 3 |
|
| 4 |
def show():
|
| 5 |
st.title("Survey Overview")
|
| 6 |
-
st.markdown('''A survey of Taiwanese college students (excludes overseas Chinese-speaking students as well as foreign students).
|
| 7 |
-
\n* Survey Oct.
|
| 8 |
\n* 2000 cards with a QR code printed out
|
| 9 |
-
\n* Distribution conducted
|
| 10 |
-
\n*
|
| 11 |
-
\n*
|
| 12 |
\n* Data after filtering: 675 people aged 18-26 (Gen-Z), Taiwanese, current students in BA (large majority), MA (small minority) or PhD level (very few respondents)
|
| 13 |
-
\n* 36
|
| 14 |
-
\n* 14 product features (multiple-choice)
|
| 15 |
\n* 6 choice experiments
|
| 16 |
-
\n*
|
|
|
|
| 17 |
)
|
|
|
|
| 3 |
|
| 4 |
# Markdown body of the overview page: one intro paragraph followed by a
# bullet list.  Every entry is separated by a blank line ("\n\n"), which is
# the separation the original hand-escaped literal produced.
SURVEY_OVERVIEW_MARKDOWN = "\n\n".join((
    "A survey of Taiwanese college students (excludes overseas Chinese-speaking students as well as foreign students) covering attitudes towards shopping, saving, investing, economy, nature, sustainability, and AI.",
    "* Survey Oct. 13th - Nov. 3rd, 2023",
    "* 2000 cards with a QR code printed out",
    "* Distribution conducted at 8 universities (handing out the cards)",
    "* 1289 people started the survey, 518 quit",
    "* 771 people completed the whole survey",
    "* Data after filtering: 675 people aged 18-26 (Gen-Z), Taiwanese, current students in BA (large majority), MA (small minority) or PhD level (very few respondents)",
    "* 36 likert fields (5-point scale) used for clustering the students into 3 personas with K-means clustering",
    "* 14 product features (multiple-choice) used for K-modes clustering",
    "* 6 choice experiments",
    "* 2 option ranking questions",
    "* 10 text fields used to enrich the personas",
))


def show():
    """Render the "Survey Overview" page: a title and the survey summary."""
    st.title("Survey Overview")
    st.markdown(SURVEY_OVERVIEW_MARKDOWN)
|
page_personas.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
| 3 |
import numpy as np
|
|
|
|
| 4 |
import matplotlib.pyplot as plt
|
| 5 |
import seaborn as sns
|
| 6 |
import squarify
|
|
@@ -11,36 +12,49 @@ from fields.likert_flat_fields import likert_flat_fields
|
|
| 11 |
|
| 12 |
#@st.cache_data
|
| 13 |
def show(df):
|
| 14 |
-
st.title("Clustering Students to Build Personas")
|
| 15 |
-
st.write("Clustering Students based on 36 fields of Likert data")
|
| 16 |
-
|
| 17 |
-
st.title("Top 10 highest agreement between personas")
|
| 18 |
-
|
| 19 |
-
create_treemap()
|
| 20 |
-
|
| 21 |
-
st.title("Top 10 highest disagreement between Personas")
|
| 22 |
-
|
| 23 |
# Chinese font
|
| 24 |
chinese_font = FontProperties(fname='mingliu.ttf')
|
| 25 |
-
|
| 26 |
-
show_clustering_heatmap(df, chinese_font)
|
| 27 |
-
|
| 28 |
# Prepare the data and perform clustering and PCA
|
| 29 |
df_clustered, pca, cluster_centers = prepare_data_for_pca(df)
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
# Show a scatterplot with all clusters included
|
| 32 |
-
|
| 33 |
-
|
|
|
|
| 34 |
|
| 35 |
# Show a scatterplot for each cluster separately
|
| 36 |
for cluster_id in range(3):
|
| 37 |
df_cluster = df_clustered[df_clustered['Cluster'] == cluster_id]
|
| 38 |
-
plot_scatterplot(df_cluster, pca, cluster_centers, chinese_font,
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
|
|
|
|
|
|
| 41 |
|
|
|
|
| 42 |
|
| 43 |
-
def
|
| 44 |
|
| 45 |
# Select only the relevant columns for clustering
|
| 46 |
df_likert_real_data = df[likert_flat_fields]
|
|
@@ -59,20 +73,12 @@ def perform_kmeans_clustering(df):
|
|
| 59 |
cluster_means_real_data = df_likert_real_data.groupby(
|
| 60 |
'Cluster').mean().reset_index()
|
| 61 |
|
| 62 |
-
# Count the number of students in each cluster
|
| 63 |
-
cluster_counts = df_likert_real_data['Cluster'].value_counts(
|
| 64 |
-
).sort_index().reset_index()
|
| 65 |
-
cluster_counts.columns = ['Cluster', 'Number of Students']
|
| 66 |
-
|
| 67 |
# Display the table
|
| 68 |
-
st.
|
| 69 |
-
st.write("Mean Scores for Each Question in Each Cluster:")
|
| 70 |
st.table(cluster_means_real_data)
|
| 71 |
|
| 72 |
|
| 73 |
def show_clustering_heatmap(df, chinese_font):
|
| 74 |
-
st.title("Heatmap")
|
| 75 |
-
|
| 76 |
# Filter the DataFrame to only include the Likert scale fields
|
| 77 |
df_likert_data = df[likert_flat_fields]
|
| 78 |
|
|
@@ -100,9 +106,9 @@ def show_clustering_heatmap(df, chinese_font):
|
|
| 100 |
ax.set_ylabel('Cluster ID', fontproperties=chinese_font)
|
| 101 |
|
| 102 |
# Rotate the x-axis labels for better readability
|
| 103 |
-
|
|
|
|
| 104 |
|
| 105 |
-
# Use the figure object (fig) in st.pyplot() to display the plot
|
| 106 |
st.pyplot(fig)
|
| 107 |
|
| 108 |
|
|
@@ -125,32 +131,49 @@ def prepare_data_for_pca(df):
|
|
| 125 |
return df_clustered, pca, cluster_centers
|
| 126 |
|
| 127 |
|
| 128 |
-
def plot_scatterplot(df, pca, cluster_centers, chinese_font, title):
|
| 129 |
# Create a figure and a set of subplots
|
| 130 |
fig, ax = plt.subplots(figsize=(10, 10))
|
| 131 |
|
| 132 |
-
#
|
| 133 |
-
|
| 134 |
-
data=df, palette='viridis', s=100, alpha=0.6, ax=ax)
|
| 135 |
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
alpha=0.75, marker='o', edgecolors='k')
|
|
|
|
|
|
|
|
|
|
| 145 |
|
|
|
|
| 146 |
ax.set_title(title, fontproperties=chinese_font)
|
| 147 |
-
ax.set_xlabel('Principal Component 1')
|
| 148 |
-
ax.set_ylabel('Principal Component 2')
|
| 149 |
|
| 150 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
st.pyplot(fig)
|
| 152 |
|
| 153 |
|
|
|
|
|
|
|
| 154 |
def create_treemap():
|
| 155 |
categories = {
|
| 156 |
'Ethical Consumption and Labor Concerns': 3.2,
|
|
@@ -181,7 +204,7 @@ def create_treemap():
|
|
| 181 |
ax.axis('off')
|
| 182 |
|
| 183 |
# Add a title to the plot
|
| 184 |
-
plt.title('
|
| 185 |
|
| 186 |
# Use the figure object (fig) in st.pyplot() to display the plot
|
| 187 |
st.pyplot(fig)
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
| 3 |
import numpy as np
|
| 4 |
+
import textwrap
|
| 5 |
import matplotlib.pyplot as plt
|
| 6 |
import seaborn as sns
|
| 7 |
import squarify
|
|
|
|
| 12 |
|
| 13 |
#@st.cache_data
|
| 14 |
def show(df):
    """Render the "Personas" page.

    Runs ``prepare_data_for_pca`` on the survey data and then shows, in
    order: a combined scatterplot of all clusters, one scatterplot per
    cluster, the table of mean answers, the clustering heatmap and the
    agreement treemap.

    Args:
        df: Survey responses; passed unchanged to the clustering, table and
            heatmap helpers.
    """
    # Font file used so that matplotlib can draw Chinese text.
    chinese_font = FontProperties(fname='mingliu.ttf')

    # Prepare the data and perform clustering and PCA
    df_clustered, pca, cluster_centers = prepare_data_for_pca(df)

    st.title("Personas")
    st.write("Based on 36 fields of likert data")

    # One colour per cluster, shared by every plot on this page.  The ids are
    # sorted first so a cluster's colour follows its id instead of the row
    # order in which the clusters happen to appear in the data.
    unique_clusters = sorted(df_clustered['Cluster'].unique())
    palette = sns.color_palette('pastel', n_colors=len(unique_clusters))
    cluster_palette = dict(zip(unique_clusters, palette))

    # Persona name for each cluster id.
    cluster_names = {
        0: 'Sustainable',
        1: 'Moderate',
        2: 'Frugal',
    }

    # Scatterplot with all clusters included
    _centered_heading("Clustering Students to Build 3 Personas")
    plot_scatterplot(df_clustered, pca, cluster_centers, chinese_font,
                     cluster_palette, cluster_names,
                     "Distinct Respondent Profiles Based on K-means Clustering")

    # One scatterplot per cluster; iterating the names keeps the loop in
    # step with the personas defined above (ids 0, 1, 2).
    for cluster_id in cluster_names:
        df_cluster = df_clustered[df_clustered['Cluster'] == cluster_id]
        plot_scatterplot(df_cluster, pca, cluster_centers, chinese_font,
                         cluster_palette, cluster_names,
                         title=f"Scatterplot for Cluster {cluster_id}")

    _centered_heading("Mean Answer Scores")
    get_kmeans_table(df)
    show_clustering_heatmap(df, chinese_font)

    _centered_heading("Agreement between personas")
    create_treemap()


def _centered_heading(text):
    """Write *text* to the page as a horizontally centred ``<h2>`` heading."""
    st.markdown(
        f"<h2 style='text-align: center;'>{text}</h2>", unsafe_allow_html=True)
|
| 56 |
|
| 57 |
+
def get_kmeans_table(df):
|
| 58 |
|
| 59 |
# Select only the relevant columns for clustering
|
| 60 |
df_likert_real_data = df[likert_flat_fields]
|
|
|
|
| 73 |
cluster_means_real_data = df_likert_real_data.groupby(
|
| 74 |
'Cluster').mean().reset_index()
|
| 75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
# Display the table
|
| 77 |
+
st.write("Mean response values for each likert question in each cluster:")
|
|
|
|
| 78 |
st.table(cluster_means_real_data)
|
| 79 |
|
| 80 |
|
| 81 |
def show_clustering_heatmap(df, chinese_font):
|
|
|
|
|
|
|
| 82 |
# Filter the DataFrame to only include the Likert scale fields
|
| 83 |
df_likert_data = df[likert_flat_fields]
|
| 84 |
|
|
|
|
| 106 |
ax.set_ylabel('Cluster ID', fontproperties=chinese_font)
|
| 107 |
|
| 108 |
# Rotate the x-axis labels for better readability
|
| 109 |
+
wrapped_labels = [textwrap.fill(label.get_text(), width=10) for label in ax.get_xticklabels()]
|
| 110 |
+
ax.set_xticklabels(wrapped_labels, rotation=45, fontproperties=chinese_font)
|
| 111 |
|
|
|
|
| 112 |
st.pyplot(fig)
|
| 113 |
|
| 114 |
|
|
|
|
| 131 |
return df_clustered, pca, cluster_centers
|
| 132 |
|
| 133 |
|
| 134 |
+
def plot_scatterplot(df, pca, cluster_centers, chinese_font, cluster_palette, cluster_names, title):
    """Draw a scatterplot of respondents coloured by cluster and show it in Streamlit.

    Each cluster present in ``df`` also gets a larger marker at its centre,
    annotated with the number of respondents, and a legend entry of the form
    ``Cluster <id>: <name> (n=<count>)``.

    Args:
        df: DataFrame with ``Component_1``, ``Component_2`` and ``Cluster``
            columns.
        pca: Not used by this function; kept so existing callers keep working.
        cluster_centers: Indexable by cluster label; each entry provides the
            centre's x and y at positions 0 and 1.
        chinese_font: Font properties applied to the title, axis labels and
            count annotations.
        cluster_palette: Mapping from cluster label to its colour.
        cluster_names: Mapping from cluster label to its display name.
        title: Title of the plot.
    """
    fig, ax = plt.subplots(figsize=(10, 10))
    try:
        # Number of respondents per cluster, for annotations and the legend.
        cluster_counts = df['Cluster'].value_counts()

        sns.scatterplot(x='Component_1', y='Component_2', hue='Cluster',
                        data=df, palette=cluster_palette, s=100, alpha=0.6, ax=ax)

        # Only the clusters present in ``df`` are decorated, so the same
        # function serves both the combined and the per-cluster plots.
        unique_clusters = sorted(df['Cluster'].unique())

        for label in unique_clusters:
            center = cluster_centers[label]
            # ``color=`` rather than ``c=``: a bare RGB tuple given as ``c``
            # is ambiguous with value-mapping and makes matplotlib warn.
            ax.scatter(center[0], center[1], color=cluster_palette[label], s=200,
                       alpha=0.75, marker='o', edgecolors='k')
            # Annotate the number of respondents in the cluster
            ax.text(center[0], center[1], str(cluster_counts[label]), color='black',
                    ha='center', va='center', fontproperties=chinese_font)

        # Set titles and labels
        ax.set_title(title, fontproperties=chinese_font)
        ax.set_xlabel('Principal Component 1', fontproperties=chinese_font)
        ax.set_ylabel('Principal Component 2', fontproperties=chinese_font)

        # Replace the default hue legend text with persona names and counts.
        handles, _ = ax.get_legend_handles_labels()
        new_labels = [
            f'Cluster {label}: {cluster_names[label]} (n={cluster_counts[label]})'
            for label in unique_clusters
        ]
        ax.legend(handles=handles, labels=new_labels, title='Personas', loc='upper right')

        # Use the figure object (fig) to display the plot
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure it created alive until it is closed;
        # without this each page rerun would leave more figures in memory.
        plt.close(fig)
|
| 173 |
|
| 174 |
|
| 175 |
+
|
| 176 |
+
|
| 177 |
def create_treemap():
|
| 178 |
categories = {
|
| 179 |
'Ethical Consumption and Labor Concerns': 3.2,
|
|
|
|
| 204 |
ax.axis('off')
|
| 205 |
|
| 206 |
# Add a title to the plot
|
| 207 |
+
plt.title('Average Agreement Level by Question Category', fontsize=15)
|
| 208 |
|
| 209 |
# Use the figure object (fig) in st.pyplot() to display the plot
|
| 210 |
st.pyplot(fig)
|
page_shopping.py
CHANGED
|
@@ -13,14 +13,15 @@ def show(df):
|
|
| 13 |
# Load the Chinese font
|
| 14 |
chinese_font = FontProperties(fname='mingliu.ttf', size=12)
|
| 15 |
st.title("Shopping")
|
| 16 |
-
st.write("Clustering
|
| 17 |
-
st.title("Boycott Count")
|
| 18 |
-
show_boycott_count(df, font_prop=chinese_font)
|
| 19 |
clusters = perform_kmodes_clustering(df, prod_feat_flat_fields)
|
| 20 |
-
st.
|
|
|
|
| 21 |
show_radar_chart(clusters, font_prop=chinese_font)
|
| 22 |
-
st.title("Feature Preferences")
|
| 23 |
plot_feature_preferences(clusters, font_prop=chinese_font)
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
def show_boycott_count(df, font_prop):
|
| 26 |
# Count the number of people who have invested and who have not
|
|
@@ -75,9 +76,9 @@ def perform_kmodes_clustering(df, feature_columns, n_clusters=3):
|
|
| 75 |
def show_radar_chart(clusters, font_prop):
|
| 76 |
|
| 77 |
df_dict={
|
| 78 |
-
'
|
| 79 |
-
'
|
| 80 |
-
'
|
| 81 |
}
|
| 82 |
|
| 83 |
feature_translations_dict = dict(zip(prod_feat_flat_fields, feature_translations))
|
|
@@ -146,9 +147,9 @@ def plot_feature_preferences(clusters, font_prop):
|
|
| 146 |
"老實說我對任何環保資訊都沒有太多興趣\nHonestly, I'm Not Very Interested in Any Eco Information",
|
| 147 |
"投資前比較公司的環保表現\nCompare Companies' Environmental Performance Before Investing"
|
| 148 |
],
|
| 149 |
-
'
|
| 150 |
-
'
|
| 151 |
-
'
|
| 152 |
}
|
| 153 |
# Create a DataFrame
|
| 154 |
df = pd.DataFrame(data)
|
|
|
|
| 13 |
# Load the Chinese font
|
| 14 |
chinese_font = FontProperties(fname='mingliu.ttf', size=12)
|
| 15 |
st.title("Shopping")
|
| 16 |
+
st.write("Clustering students based on AI-assistant feature choices")
|
|
|
|
|
|
|
| 17 |
clusters = perform_kmodes_clustering(df, prod_feat_flat_fields)
|
| 18 |
+
st.markdown(
|
| 19 |
+
f"<h2 style='text-align: center;'>Feature Preferences</h2>", unsafe_allow_html=True)
|
| 20 |
show_radar_chart(clusters, font_prop=chinese_font)
|
|
|
|
| 21 |
plot_feature_preferences(clusters, font_prop=chinese_font)
|
| 22 |
+
st.markdown(
|
| 23 |
+
f"<h2 style='text-align: center;'>Boycott Count</h2>", unsafe_allow_html=True)
|
| 24 |
+
show_boycott_count(df, font_prop=chinese_font)
|
| 25 |
|
| 26 |
def show_boycott_count(df, font_prop):
|
| 27 |
# Count the number of people who have invested and who have not
|
|
|
|
| 76 |
def show_radar_chart(clusters, font_prop):
|
| 77 |
|
| 78 |
df_dict={
|
| 79 |
+
'Conscious (n=340)': clusters[0],
|
| 80 |
+
'Interested (n=215)': clusters[1],
|
| 81 |
+
'Advocate (n=126)': clusters[2]
|
| 82 |
}
|
| 83 |
|
| 84 |
feature_translations_dict = dict(zip(prod_feat_flat_fields, feature_translations))
|
|
|
|
| 147 |
"老實說我對任何環保資訊都沒有太多興趣\nHonestly, I'm Not Very Interested in Any Eco Information",
|
| 148 |
"投資前比較公司的環保表現\nCompare Companies' Environmental Performance Before Investing"
|
| 149 |
],
|
| 150 |
+
'Conscious (n=340)': [0.367, 0.415, 0.191, 0.176, 0.079, 1.000, 0.197, 0.265, 0.144, 0.241, 0.144, 0.332, 0.044, 0.188],
|
| 151 |
+
'Interested (n=215)': [0.260, 0.163, 0.153, 0.191, 0.107, 0.000, 0.135, 0.219, 0.172, 0.186, 0.093, 0.214, 0.233, 0.130],
|
| 152 |
+
'Advocate (n=126)': [0.825, 0.881, 0.460, 0.746, 0.230, 0.881, 0.667, 0.690, 0.421, 0.865, 0.468, 0.778, 0.143, 0.738]
|
| 153 |
}
|
| 154 |
# Create a DataFrame
|
| 155 |
df = pd.DataFrame(data)
|
page_tests.py
CHANGED
|
@@ -1,11 +1,13 @@
|
|
| 1 |
from scipy.stats import chisquare
|
|
|
|
| 2 |
import streamlit as st
|
| 3 |
import pandas as pd
|
| 4 |
from fields.likert_flat_fields import likert_flat_fields
|
| 5 |
|
| 6 |
-
|
| 7 |
def show(df):
|
| 8 |
st.title("Statistical Tests")
|
|
|
|
| 9 |
show_chi_square_results(df)
|
| 10 |
|
| 11 |
def show_chi_square_results(df):
|
|
@@ -13,40 +15,46 @@ def show_chi_square_results(df):
|
|
| 13 |
|
| 14 |
for field in likert_flat_fields:
|
| 15 |
observed_values = df[field].value_counts().sort_index()
|
| 16 |
-
observed_values = observed_values.
|
| 17 |
expected_values = [len(df) / len(observed_values)] * len(observed_values)
|
| 18 |
-
expected_values = [float(x) for x in expected_values]
|
| 19 |
chi_stat, p_value = chisquare(f_obs=observed_values, f_exp=expected_values)
|
| 20 |
chi_square_results[field] = {'Chi-Square Statistic': chi_stat, 'p-value': p_value}
|
| 21 |
|
| 22 |
chi_square_df = pd.DataFrame.from_dict(chi_square_results, orient='index')
|
| 23 |
chi_square_df['p-value'] = chi_square_df['p-value'].astype(float)
|
| 24 |
|
|
|
|
|
|
|
|
|
|
| 25 |
# Reset index to add a sequence number
|
| 26 |
chi_square_df.reset_index(inplace=True)
|
| 27 |
chi_square_df.rename(columns={'index': 'Question'}, inplace=True)
|
| 28 |
|
| 29 |
# Define thresholds for highlighting
|
| 30 |
chi_square_threshold = 300 # example threshold for high Chi-Square value
|
| 31 |
-
p_value_threshold = 1e-50
|
| 32 |
|
| 33 |
# Apply the highlighting
|
| 34 |
-
def highlight(value):
|
| 35 |
-
if
|
| 36 |
-
return "background-color: yellow"
|
| 37 |
-
elif isinstance(value, float) and value < p_value_threshold:
|
| 38 |
return "background-color: yellow"
|
|
|
|
|
|
|
| 39 |
else:
|
| 40 |
-
return
|
| 41 |
-
|
| 42 |
-
# Apply the highlighting to numeric columns only
|
| 43 |
-
chi_square_df_styled = chi_square_df.style.applymap(highlight, subset=pd.IndexSlice[:, ['Chi-Square Statistic', 'p-value']])
|
| 44 |
|
| 45 |
-
#
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
-
# Convert styled DataFrame to HTML
|
| 49 |
chi_square_html = chi_square_df_styled.to_html(escape=False)
|
| 50 |
|
| 51 |
# Display the HTML with unsafe_allow_html set to True
|
| 52 |
-
st.markdown(chi_square_html, unsafe_allow_html=True)
|
|
|
|
| 1 |
from scipy.stats import chisquare
|
| 2 |
+
from functools import partial
|
| 3 |
import streamlit as st
|
| 4 |
import pandas as pd
|
| 5 |
from fields.likert_flat_fields import likert_flat_fields
|
| 6 |
|
| 7 |
+
#@st.cache_data
|
| 8 |
def show(df):
    """Render the "Statistical Tests" page.

    Writes the page title and a legend for the table colours, then delegates
    to ``show_chi_square_results`` to build and display the table.

    Args:
        df: Survey responses, passed unchanged to ``show_chi_square_results``.
    """
    st.title("Statistical Tests")
    st.write("Yellow Chi-Square statistics (high) and pink p-values (low) are statistically meaningful")
    show_chi_square_results(df)
|
| 12 |
|
| 13 |
def show_chi_square_results(df):
|
|
|
|
| 15 |
|
| 16 |
for field in likert_flat_fields:
|
| 17 |
observed_values = df[field].value_counts().sort_index()
|
| 18 |
+
observed_values = observed_values.reindex(index=range(1, 6), fill_value=0)
|
| 19 |
expected_values = [len(df) / len(observed_values)] * len(observed_values)
|
|
|
|
| 20 |
chi_stat, p_value = chisquare(f_obs=observed_values, f_exp=expected_values)
|
| 21 |
chi_square_results[field] = {'Chi-Square Statistic': chi_stat, 'p-value': p_value}
|
| 22 |
|
| 23 |
chi_square_df = pd.DataFrame.from_dict(chi_square_results, orient='index')
|
| 24 |
chi_square_df['p-value'] = chi_square_df['p-value'].astype(float)
|
| 25 |
|
| 26 |
+
# Convert p-values to string for formatting
|
| 27 |
+
chi_square_df['p-value'] = chi_square_df['p-value'].apply(lambda x: "{:.2e}".format(x))
|
| 28 |
+
|
| 29 |
# Reset index to add a sequence number
|
| 30 |
chi_square_df.reset_index(inplace=True)
|
| 31 |
chi_square_df.rename(columns={'index': 'Question'}, inplace=True)
|
| 32 |
|
| 33 |
# Define thresholds for highlighting
|
| 34 |
chi_square_threshold = 300 # example threshold for high Chi-Square value
|
| 35 |
+
p_value_threshold = 1e-50 # example threshold for very low p-value
|
| 36 |
|
| 37 |
# Apply the highlighting
|
| 38 |
+
def highlight(value, chi_square_threshold, p_value_threshold, col_name):
|
| 39 |
+
if col_name == 'Chi-Square Statistic' and float(value) > chi_square_threshold:
|
|
|
|
|
|
|
| 40 |
return "background-color: yellow"
|
| 41 |
+
elif col_name == 'p-value' and float(value) < p_value_threshold:
|
| 42 |
+
return "background-color: pink"
|
| 43 |
else:
|
| 44 |
+
return None
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
+
# Create partial functions for each column to apply the highlight with the column name
|
| 47 |
+
highlight_chi_square = partial(highlight, chi_square_threshold=chi_square_threshold,
|
| 48 |
+
p_value_threshold=p_value_threshold, col_name='Chi-Square Statistic')
|
| 49 |
+
highlight_p_value = partial(highlight, chi_square_threshold=chi_square_threshold,
|
| 50 |
+
p_value_threshold=p_value_threshold, col_name='p-value')
|
| 51 |
+
|
| 52 |
+
# Apply the highlighting to the DataFrame
|
| 53 |
+
chi_square_df_styled = chi_square_df.style.applymap(highlight_chi_square, subset=['Chi-Square Statistic']) \
|
| 54 |
+
.applymap(highlight_p_value, subset=['p-value'])
|
| 55 |
|
| 56 |
+
# Convert styled DataFrame to HTML
|
| 57 |
chi_square_html = chi_square_df_styled.to_html(escape=False)
|
| 58 |
|
| 59 |
# Display the HTML with unsafe_allow_html set to True
|
| 60 |
+
st.markdown(chi_square_html, unsafe_allow_html=True)
|