Commit ·
81850c8
1
Parent(s): c919c92
add caching
Browse files
app.py
CHANGED
|
@@ -49,6 +49,7 @@ st.markdown("If we measure how fast the teachers speak on CIJ, we find that \
|
|
| 49 |
|
| 50 |
#st.markdown("### Rate of speech in words per minute (WPM)")
|
| 51 |
|
|
|
|
| 52 |
def get_wpm_chart(show_medians=False):
|
| 53 |
|
| 54 |
# Data for vertical lines corresponding to each level
|
|
@@ -220,6 +221,7 @@ st.markdown("To put this data into perspective, native Japanese speakers \
|
|
| 220 |
|
| 221 |
# wpm vs sps chart
|
| 222 |
|
|
|
|
| 223 |
def get_wpm_vs_sps_chart(interactive=False):
|
| 224 |
|
| 225 |
selection = alt.selection_point(fields=['level'], bind='legend', on='click')
|
|
@@ -366,6 +368,7 @@ st.markdown("## Sentence length")
|
|
| 366 |
|
| 367 |
st.markdown("Videos meant for beginners tend to have shorter sentences on average.")
|
| 368 |
|
|
|
|
| 369 |
def get_sentence_length_hist(show_medians=False):
|
| 370 |
|
| 371 |
# Data for vertical lines corresponding to each level
|
|
@@ -542,6 +545,7 @@ st.markdown("## Amount of repetition")
|
|
| 542 |
|
| 543 |
st.markdown("Words are repeated more often in easier videos.")
|
| 544 |
|
|
|
|
| 545 |
def get_repetition_hist(show_medians=False):
|
| 546 |
|
| 547 |
video_df['average_rel_reps_perc'] = 100.0 * video_df['average_rel_reps']
|
|
@@ -744,6 +748,7 @@ st.markdown("If we take all the words in CIJ, count them then order them from mo
|
|
| 744 |
|
| 745 |
# word coverage chart
|
| 746 |
|
|
|
|
| 747 |
def get_word_coverage_chart():
|
| 748 |
|
| 749 |
word_coverage_df = pd.read_csv('word_coverage_df_plot.tsv', sep='\t')
|
|
@@ -892,7 +897,7 @@ def get_word_coverage_chart():
|
|
| 892 |
|
| 893 |
return layered_chart
|
| 894 |
|
| 895 |
-
|
| 896 |
def get_zoomed_word_coverage_chart():
|
| 897 |
|
| 898 |
word_coverage_df = pd.read_csv('word_coverage_df_plot.tsv', sep='\t')
|
|
@@ -1059,6 +1064,7 @@ st.markdown("Using the same method of calculating word coverage as before, \
|
|
| 1059 |
we can also calculate how many of the top words you need to know \
|
| 1060 |
to achieve 98% word coverage in each video.")
|
| 1061 |
|
|
|
|
| 1062 |
def get_ne_spot_hist(show_medians=False):
|
| 1063 |
|
| 1064 |
# Data for vertical lines corresponding to each level
|
|
@@ -1233,6 +1239,7 @@ st.markdown("## Word rareness")
|
|
| 1233 |
|
| 1234 |
st.markdown("More advanced videos tend to use rare/uncommon words more often than easier videos.")
|
| 1235 |
|
|
|
|
| 1236 |
def get_tfplr_hist(show_medians=False):
|
| 1237 |
|
| 1238 |
# Data for vertical lines corresponding to each level
|
|
@@ -1421,6 +1428,7 @@ st.markdown("## Grammar")
|
|
| 1421 |
|
| 1422 |
st.markdown("Easier videos tend to use less [subordinating conjunctions](https://universaldependencies.org/u/pos/SCONJ.html) than harder videos.")
|
| 1423 |
|
|
|
|
| 1424 |
def get_sconj_hist(show_medians=False):
|
| 1425 |
|
| 1426 |
video_df['sconj_props_perc'] = 100.0 * video_df['sconj_props']
|
|
@@ -1653,6 +1661,7 @@ st.markdown("Wago are native Japanese words, Kango are Chinese words and Gairaig
|
|
| 1653 |
|
| 1654 |
st.markdown("Harder videos tend to use more Kango than easier videos")
|
| 1655 |
|
|
|
|
| 1656 |
def get_kango_hist(show_medians=False):
|
| 1657 |
|
| 1658 |
video_df['kan_props_perc'] = 100.0 * video_df['kan_props']
|
|
@@ -1875,6 +1884,7 @@ st.markdown("To answer this, we can look at a correlation heatmap between each o
|
|
| 1875 |
|
| 1876 |
num_video_df = pd.read_csv('num_video_df.tsv', sep='\t')
|
| 1877 |
|
|
|
|
| 1878 |
def render_vanilla_heatmap():
|
| 1879 |
|
| 1880 |
# Compute the correlation matrix
|
|
@@ -1908,7 +1918,7 @@ st.markdown("Using a statistics rule of thumb and removing all variables that ha
|
|
| 1908 |
weaker than 0.3 (and more than -0.3), we can identify the variables with the strongest correlations.")
|
| 1909 |
|
| 1910 |
|
| 1911 |
-
|
| 1912 |
def render_level_row_unordered():
|
| 1913 |
|
| 1914 |
# Compute the correlation matrix
|
|
@@ -1934,6 +1944,7 @@ def render_level_row_unordered():
|
|
| 1934 |
#plt.show()
|
| 1935 |
st.pyplot(plt.gcf())
|
| 1936 |
|
|
|
|
| 1937 |
def render_level_col_ordered():
|
| 1938 |
|
| 1939 |
# Compute the correlation matrix
|
|
|
|
| 49 |
|
| 50 |
#st.markdown("### Rate of speech in words per minute (WPM)")
|
| 51 |
|
| 52 |
+
@st.cache_data
|
| 53 |
def get_wpm_chart(show_medians=False):
|
| 54 |
|
| 55 |
# Data for vertical lines corresponding to each level
|
|
|
|
| 221 |
|
| 222 |
# wpm vs sps chart
|
| 223 |
|
| 224 |
+
@st.cache_data
|
| 225 |
def get_wpm_vs_sps_chart(interactive=False):
|
| 226 |
|
| 227 |
selection = alt.selection_point(fields=['level'], bind='legend', on='click')
|
|
|
|
| 368 |
|
| 369 |
st.markdown("Videos meant for beginners tend to have shorter sentences on average.")
|
| 370 |
|
| 371 |
+
@st.cache_data
|
| 372 |
def get_sentence_length_hist(show_medians=False):
|
| 373 |
|
| 374 |
# Data for vertical lines corresponding to each level
|
|
|
|
| 545 |
|
| 546 |
st.markdown("Words are repeated more often in easier videos.")
|
| 547 |
|
| 548 |
+
@st.cache_data
|
| 549 |
def get_repetition_hist(show_medians=False):
|
| 550 |
|
| 551 |
video_df['average_rel_reps_perc'] = 100.0 * video_df['average_rel_reps']
|
|
|
|
| 748 |
|
| 749 |
# word coverage chart
|
| 750 |
|
| 751 |
+
@st.cache_data
|
| 752 |
def get_word_coverage_chart():
|
| 753 |
|
| 754 |
word_coverage_df = pd.read_csv('word_coverage_df_plot.tsv', sep='\t')
|
|
|
|
| 897 |
|
| 898 |
return layered_chart
|
| 899 |
|
| 900 |
+
@st.cache_data
|
| 901 |
def get_zoomed_word_coverage_chart():
|
| 902 |
|
| 903 |
word_coverage_df = pd.read_csv('word_coverage_df_plot.tsv', sep='\t')
|
|
|
|
| 1064 |
we can also calculate how many of the top words you need to know \
|
| 1065 |
to achieve 98% word coverage in each video.")
|
| 1066 |
|
| 1067 |
+
@st.cache_data
|
| 1068 |
def get_ne_spot_hist(show_medians=False):
|
| 1069 |
|
| 1070 |
# Data for vertical lines corresponding to each level
|
|
|
|
| 1239 |
|
| 1240 |
st.markdown("More advanced videos tend to use rare/uncommon words more often than easier videos.")
|
| 1241 |
|
| 1242 |
+
@st.cache_data
|
| 1243 |
def get_tfplr_hist(show_medians=False):
|
| 1244 |
|
| 1245 |
# Data for vertical lines corresponding to each level
|
|
|
|
| 1428 |
|
| 1429 |
st.markdown("Easier videos tend to use less [subordinating conjunctions](https://universaldependencies.org/u/pos/SCONJ.html) than harder videos.")
|
| 1430 |
|
| 1431 |
+
@st.cache_data
|
| 1432 |
def get_sconj_hist(show_medians=False):
|
| 1433 |
|
| 1434 |
video_df['sconj_props_perc'] = 100.0 * video_df['sconj_props']
|
|
|
|
| 1661 |
|
| 1662 |
st.markdown("Harder videos tend to use more Kango than easier videos")
|
| 1663 |
|
| 1664 |
+
@st.cache_data
|
| 1665 |
def get_kango_hist(show_medians=False):
|
| 1666 |
|
| 1667 |
video_df['kan_props_perc'] = 100.0 * video_df['kan_props']
|
|
|
|
| 1884 |
|
| 1885 |
num_video_df = pd.read_csv('num_video_df.tsv', sep='\t')
|
| 1886 |
|
| 1887 |
+
@st.cache_data
|
| 1888 |
def render_vanilla_heatmap():
|
| 1889 |
|
| 1890 |
# Compute the correlation matrix
|
|
|
|
| 1918 |
weaker than 0.3 (and more than -0.3), we can identify the variables with the strongest correlations.")
|
| 1919 |
|
| 1920 |
|
| 1921 |
+
@st.cache_data
|
| 1922 |
def render_level_row_unordered():
|
| 1923 |
|
| 1924 |
# Compute the correlation matrix
|
|
|
|
| 1944 |
#plt.show()
|
| 1945 |
st.pyplot(plt.gcf())
|
| 1946 |
|
| 1947 |
+
@st.cache_data
|
| 1948 |
def render_level_col_ordered():
|
| 1949 |
|
| 1950 |
# Compute the correlation matrix
|