Commit ·
84629ad
1
Parent(s): 81a6566
add some text content
Browse files
app.py
CHANGED
|
@@ -16,167 +16,203 @@ import seaborn as sns
|
|
| 16 |
#</style>
|
| 17 |
#""", unsafe_allow_html=True)
|
| 18 |
|
| 19 |
-
st.
|
| 20 |
|
| 21 |
-
st.markdown("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
video platform for learning Japanese.")
|
| 23 |
|
| 24 |
video_df = pd.read_csv("video_data.tsv", sep="\t")
|
| 25 |
|
| 26 |
# Plot the WPM histogram
|
| 27 |
|
| 28 |
-
#
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
alt.
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
#titleFontStyle='italic',
|
| 71 |
-
titlePadding=20,
|
| 72 |
-
tickCount=5
|
| 73 |
),
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
)
|
| 96 |
-
)
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
)
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
)
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
strokeWidth=6,
|
| 132 |
-
strokeDash = [10, 2], # first arg is length, second is gap
|
| 133 |
-
).encode(
|
| 134 |
-
x='x:Q',
|
| 135 |
-
tooltip=[
|
| 136 |
-
alt.Tooltip('x:N', title='Median WPM:'),
|
| 137 |
-
alt.Tooltip('level:N', title='Level:')
|
| 138 |
-
],
|
| 139 |
-
#color=alt.condition(select, 'level:N', alt.value('gray')), # Link the color with the selection
|
| 140 |
-
color=alt.Color(
|
| 141 |
-
'level:N',
|
| 142 |
-
scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']), # Use the same color scale as the histogram
|
| 143 |
-
sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
|
| 144 |
-
legend=None # No legend for lines, it is already shown in the histogram
|
| 145 |
-
),
|
| 146 |
-
opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)), # Link opacity with selection
|
| 147 |
-
strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
|
| 148 |
-
).add_params(
|
| 149 |
-
selection,
|
| 150 |
-
highlight
|
| 151 |
-
)
|
| 152 |
-
|
| 153 |
-
text_labels = alt.Chart(line_data).mark_text(
|
| 154 |
-
align='center', # Align text to the left of the line
|
| 155 |
-
dx=0, # Offset the text to the right by 5 pixels
|
| 156 |
-
dy=-10, # Adjust vertical positioning
|
| 157 |
-
fontSize=16,
|
| 158 |
-
fontWeight='bold'
|
| 159 |
-
).encode(
|
| 160 |
-
x='x:Q',
|
| 161 |
-
y=alt.value(0), # Positioning y at the top of the chart, can be adjusted as needed
|
| 162 |
-
text=alt.Text('x:Q', format='.0f'), # Display the x value, formatted as an integer
|
| 163 |
-
color=alt.Color(
|
| 164 |
-
'level:N',
|
| 165 |
-
scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
|
| 166 |
-
sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
|
| 167 |
-
legend=None
|
| 168 |
-
),
|
| 169 |
-
opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)), # Link opacity with selection
|
| 170 |
-
)
|
| 171 |
|
| 172 |
|
| 173 |
if st.checkbox('Show medians'):
|
| 174 |
-
|
|
|
|
|
|
|
| 175 |
else:
|
| 176 |
-
|
|
|
|
| 177 |
|
| 178 |
st.altair_chart(layered_chart, use_container_width=True)
|
| 179 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
# wpm vs sps chart
|
| 181 |
|
| 182 |
def get_wpm_vs_sps_chart(interactive=False):
|
|
@@ -279,6 +315,74 @@ else:
|
|
| 279 |
|
| 280 |
st.altair_chart(wpm_vs_sps_chart, use_container_width=True)
|
| 281 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
# word coverage chart
|
| 283 |
|
| 284 |
def get_word_coverage_chart():
|
|
@@ -591,8 +695,45 @@ else:
|
|
| 591 |
word_coverage_chart = get_word_coverage_chart()
|
| 592 |
|
| 593 |
st.altair_chart(word_coverage_chart, use_container_width=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 594 |
# grammar table
|
| 595 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 596 |
data = {
|
| 597 |
'Complete Beginner': [0.02638719922016275 ,0.0192492959834, 0.00476028625918155, 0.2503071253071253],
|
| 598 |
'Beginner': [0.0473047304730473, 0.0266429840142095, 0.005813953488372, 0.2454068241469816],
|
|
@@ -628,6 +769,21 @@ styled_df = df.style.set_table_styles(
|
|
| 628 |
# Display the styled DataFrame
|
| 629 |
st.markdown(styled_df.to_html(), unsafe_allow_html=True)
|
| 630 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 631 |
# word origin table
|
| 632 |
|
| 633 |
data = {
|
|
@@ -667,6 +823,14 @@ st.markdown(styled_df.to_html(), unsafe_allow_html=True)
|
|
| 667 |
|
| 668 |
# heatmap
|
| 669 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 670 |
num_video_df = pd.read_csv('num_video_df.tsv', sep='\t')
|
| 671 |
|
| 672 |
def render_vanilla_heatmap():
|
|
@@ -693,6 +857,16 @@ def render_vanilla_heatmap():
|
|
| 693 |
|
| 694 |
render_vanilla_heatmap()
|
| 695 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 696 |
def render_level_row_unordered():
|
| 697 |
|
| 698 |
# Compute the correlation matrix
|
|
@@ -752,4 +926,39 @@ def render_level_col_ordered():
|
|
| 752 |
if st.checkbox('Flip and sort'):
|
| 753 |
render_level_col_ordered()
|
| 754 |
else:
|
| 755 |
-
render_level_row_unordered()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
#</style>
|
| 17 |
#""", unsafe_allow_html=True)
|
| 18 |
|
| 19 |
+
st.markdown("Note: this analysis is meant to be viewed on a computer and not a phone (sorry!)")
|
| 20 |
|
| 21 |
+
st.markdown("# What makes comprehensible input *comprehensible*?")
|
| 22 |
+
|
| 23 |
+
st.markdown("**Comprehensible input** (or CI, for short) is a language teaching technique where teachers \
|
| 24 |
+
speak in a way that is understandable to their students. \
|
| 25 |
+
It is believed by many that CI is one of the most optimal and natural \
|
| 26 |
+
ways to acquire a foreign language \
|
| 27 |
+
...but, what exactly is it about CI that makes it comprehensible?")
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
st.markdown("To answer this question, I'll be analyzing the videos on \
|
| 32 |
+
[cijapanese.com](https://cijapanese.com/) (CIJ), a \
|
| 33 |
video platform for learning Japanese.")
|
| 34 |
|
| 35 |
video_df = pd.read_csv("video_data.tsv", sep="\t")
|
| 36 |
|
| 37 |
# Plot the WPM histogram
|
| 38 |
|
| 39 |
+
st.markdown("## How fast is CI?")
|
| 40 |
+
|
| 41 |
+
st.markdown("If we measure how fast the teachers speak on CIJ, we find that \
|
| 42 |
+
they speak more slowly in videos meant for beginners and more quickly \
|
| 43 |
+
for advanced learners.")
|
| 44 |
+
|
| 45 |
+
#st.markdown("### Rate of speech in words per minute (WPM)")
|
| 46 |
+
|
| 47 |
+
def get_wpm_chart(show_medians=False):
|
| 48 |
+
|
| 49 |
+
# Data for vertical lines corresponding to each level
|
| 50 |
+
line_data = pd.DataFrame({
|
| 51 |
+
'x': [75, 91, 124, 149],
|
| 52 |
+
'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
|
| 53 |
+
'text': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced']
|
| 54 |
+
})
|
| 55 |
+
|
| 56 |
+
selection = alt.selection_point(fields=['level'], bind='legend', on='click')
|
| 57 |
+
|
| 58 |
+
highlight = alt.selection_point(name="highlight", fields=['level'], on='mouseover', empty=False)
|
| 59 |
+
|
| 60 |
+
histogram = alt.Chart(video_df).mark_bar(
|
| 61 |
+
opacity=0.5,
|
| 62 |
+
binSpacing=3,
|
| 63 |
+
stroke='black',
|
| 64 |
+
strokeWidth=0,
|
| 65 |
+
cornerRadius=5,
|
| 66 |
+
cursor="pointer"
|
| 67 |
+
).encode(
|
| 68 |
+
alt.X(
|
| 69 |
+
'wpm:Q',
|
| 70 |
+
bin=alt.Bin(maxbins=20),
|
| 71 |
+
title='Words per minute',
|
| 72 |
+
axis=alt.Axis(
|
| 73 |
+
labelFontSize=14,
|
| 74 |
+
titleFontSize=18,
|
| 75 |
+
#titleFont='Urbanist',
|
| 76 |
+
titleColor='black',
|
| 77 |
+
titleFontWeight='normal',
|
| 78 |
+
#titleFontStyle='italic',
|
| 79 |
+
titlePadding=20
|
| 80 |
+
)
|
|
|
|
|
|
|
|
|
|
| 81 |
),
|
| 82 |
+
alt.Y(
|
| 83 |
+
'count()',
|
| 84 |
+
title="Num. videos",
|
| 85 |
+
axis=alt.Axis(
|
| 86 |
+
labelFontSize=14,
|
| 87 |
+
titleFontSize=18,
|
| 88 |
+
#titleFont='Urbanist',
|
| 89 |
+
titleColor='black',
|
| 90 |
+
titleFontWeight='normal',
|
| 91 |
+
#titleFontStyle='italic',
|
| 92 |
+
titlePadding=20,
|
| 93 |
+
tickCount=5
|
| 94 |
+
),
|
| 95 |
+
scale=alt.Scale(domain=[0,100])
|
| 96 |
+
).stack(None),
|
| 97 |
+
alt.Color(
|
| 98 |
+
'level:N',
|
| 99 |
+
scale=alt.Scale(range=['#a5bee4', '#9ad6d8', '#c7aecd', '#dd9e9e']),
|
| 100 |
+
sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
|
| 101 |
+
legend=alt.Legend(
|
| 102 |
+
title='CIJ Level',
|
| 103 |
+
#titleFont='Urbanist',
|
| 104 |
+
titleFontSize=18,
|
| 105 |
+
titleFontWeight='bolder',
|
| 106 |
+
labelFontSize=16,
|
| 107 |
+
#labelFont='Urbanist',
|
| 108 |
+
symbolType='circle',
|
| 109 |
+
symbolSize=200,
|
| 110 |
+
symbolStrokeWidth=0,
|
| 111 |
+
orient='right',
|
| 112 |
+
direction='vertical',
|
| 113 |
+
fillColor='white',
|
| 114 |
+
padding=10,
|
| 115 |
+
cornerRadius=5,
|
| 116 |
+
)
|
| 117 |
+
),
|
| 118 |
+
tooltip=[
|
| 119 |
+
alt.Tooltip('wpm:Q', title='Words per minute:', bin=True), # Properly indicate that `wpm` is binned
|
| 120 |
+
alt.Tooltip('level:N', title='Level:'),
|
| 121 |
+
alt.Tooltip('count()', title='Video count:')
|
| 122 |
+
],
|
| 123 |
+
opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
|
| 124 |
+
strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
|
| 125 |
+
).properties(
|
| 126 |
+
#width=750,
|
| 127 |
+
#width='container',
|
| 128 |
+
#height='container',
|
| 129 |
+
height=500,
|
| 130 |
+
#background='beige',
|
| 131 |
+
#padding=50,
|
| 132 |
+
title=alt.TitleParams(
|
| 133 |
+
text='Rate of speech in words per minute (WPM)',
|
| 134 |
+
offset=20,
|
| 135 |
+
#subtitle='(clickable)',
|
| 136 |
+
#font='Urbanist',
|
| 137 |
+
fontSize=24,
|
| 138 |
+
fontWeight='normal',
|
| 139 |
+
anchor='middle',
|
| 140 |
+
color='black',
|
| 141 |
+
subtitleFontSize=15,
|
| 142 |
+
subtitleColor='gray'
|
| 143 |
)
|
| 144 |
+
).add_params(
|
| 145 |
+
selection,
|
| 146 |
+
highlight
|
| 147 |
+
)
|
| 148 |
+
|
| 149 |
+
# Vertical lines corresponding to each level
|
| 150 |
+
vertical_lines = alt.Chart(line_data).mark_rule(
|
| 151 |
+
color='red',
|
| 152 |
+
strokeWidth=6,
|
| 153 |
+
strokeDash = [10, 2], # first arg is length, second is gap
|
| 154 |
+
).encode(
|
| 155 |
+
x='x:Q',
|
| 156 |
+
tooltip=[
|
| 157 |
+
alt.Tooltip('x:N', title='Median WPM:'),
|
| 158 |
+
alt.Tooltip('level:N', title='Level:')
|
| 159 |
+
],
|
| 160 |
+
#color=alt.condition(select, 'level:N', alt.value('gray')), # Link the color with the selection
|
| 161 |
+
color=alt.Color(
|
| 162 |
+
'level:N',
|
| 163 |
+
scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']), # Saturated per-level colors for the median lines (the histogram uses its own hex palette)
|
| 164 |
+
sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
|
| 165 |
+
legend=None # No legend for lines, it is already shown in the histogram
|
| 166 |
+
),
|
| 167 |
+
opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)), # Link opacity with selection
|
| 168 |
+
strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
|
| 169 |
+
).add_params(
|
| 170 |
+
selection,
|
| 171 |
+
highlight
|
| 172 |
+
)
|
| 173 |
+
|
| 174 |
+
text_labels = alt.Chart(line_data).mark_text(
|
| 175 |
+
align='center', # Center the text horizontally on the line
|
| 176 |
+
dx=0, # No horizontal offset from the line
|
| 177 |
+
dy=-10, # Adjust vertical positioning
|
| 178 |
+
fontSize=16,
|
| 179 |
+
fontWeight='bold'
|
| 180 |
+
).encode(
|
| 181 |
+
x='x:Q',
|
| 182 |
+
y=alt.value(0), # Positioning y at the top of the chart, can be adjusted as needed
|
| 183 |
+
text=alt.Text('x:Q', format='.0f'), # Display the x value, formatted as an integer
|
| 184 |
+
color=alt.Color(
|
| 185 |
+
'level:N',
|
| 186 |
+
scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
|
| 187 |
+
sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
|
| 188 |
+
legend=None
|
| 189 |
+
),
|
| 190 |
+
opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)), # Link opacity with selection
|
| 191 |
)
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
if show_medians:
|
| 195 |
+
layered_chart = alt.layer(histogram, vertical_lines, text_labels, background='#f6f8fb')
|
| 196 |
+
else:
|
| 197 |
+
layered_chart = alt.layer(histogram, background='#f6f8fb')
|
| 198 |
+
|
| 199 |
+
return layered_chart
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
|
| 201 |
|
| 202 |
if st.checkbox('Show medians'):
|
| 203 |
+
|
| 204 |
+
layered_chart = get_wpm_chart(show_medians=True)
|
| 205 |
+
|
| 206 |
else:
|
| 207 |
+
|
| 208 |
+
layered_chart = get_wpm_chart(show_medians=False)
|
| 209 |
|
| 210 |
st.altair_chart(layered_chart, use_container_width=True)
|
| 211 |
|
| 212 |
+
st.markdown("To put this data into perspective, native Japanese speakers \
|
| 213 |
+
tend to speak at rates of over 200 wpm, meaning that most of the videos \
|
| 214 |
+
on CIJ have been adapted to be a lot slower than that!")
|
| 215 |
+
|
| 216 |
# wpm vs sps chart
|
| 217 |
|
| 218 |
def get_wpm_vs_sps_chart(interactive=False):
|
|
|
|
| 315 |
|
| 316 |
st.altair_chart(wpm_vs_sps_chart, use_container_width=True)
|
| 317 |
|
| 318 |
+
st.markdown("We can also measure the rate of speech in syllables per second (SPS) \
|
| 319 |
+
and compare it to words per minute.")
|
| 320 |
+
|
| 321 |
+
st.markdown("(Also, FYI, most of these **graphs are \
|
| 322 |
+
interactive** so please click around.)")
|
| 323 |
+
|
| 324 |
+
st.markdown("## A quick statistics lesson")
|
| 325 |
+
|
| 326 |
+
st.markdown("Before we continue this analysis, there are some basic things you should know.")
|
| 327 |
+
|
| 328 |
+
st.markdown("### The data")
|
| 329 |
+
|
| 330 |
+
st.markdown("The dataset we'll be analyzing comprises just under 1,000 videos. \
|
| 331 |
+
In particular, we'll be analyzing the subtitles of the videos.")
|
| 332 |
+
|
| 333 |
+
st.markdown('Every video has a Level: **Complete Beginner**, **Beginner**, \
|
| 334 |
+
**Intermediate**, or **Advanced**.')
|
| 335 |
+
|
| 336 |
+
st.markdown("### The statistics")
|
| 337 |
+
|
| 338 |
+
st.markdown("The goal of this analysis is to find features in the video data that lead \
|
| 339 |
+
to a specific pattern called an \"ordering\".")
|
| 340 |
+
|
| 341 |
+
st.markdown("We're specifically looking for *any* statistic that can lead to an \
|
| 342 |
+
ordering of the levels in one of the two following orders:")
|
| 343 |
+
|
| 344 |
+
st.markdown("> Complete Beginner < Beginner < Intermediate < Advanced")
|
| 345 |
+
st.markdown("or")
|
| 346 |
+
st.markdown("> Complete Beginner > Beginner > Intermediate > Advanced")
|
| 347 |
+
|
| 348 |
+
st.markdown("For example: if a statistic is small for Complete Beginner videos, but gets bigger \
|
| 349 |
+
for Beginner, Intermediate, then Advanced videos, it suggests \
|
| 350 |
+
that this is a good statistic for determining what makes a video comprehensible. \
|
| 351 |
+
In fact, we already saw this above when measuring the **words per minute** statistic.")
|
| 352 |
+
|
| 353 |
+
st.markdown("Okay! Now we can continue.")
|
| 354 |
+
|
| 355 |
+
st.markdown("## Sentence length")
|
| 356 |
+
|
| 357 |
+
st.markdown("Videos meant for beginners tend to have shorter sentences on average.")
|
| 358 |
+
|
| 359 |
+
st.markdown("[TODO]: Add mean sentence length graph")
|
| 360 |
+
|
| 361 |
+
st.markdown("This makes sense because long sentences generally tend to be more complex and packed with information \
|
| 362 |
+
whereas short sentences are usually easier to understand.")
|
| 363 |
+
|
| 364 |
+
st.markdown("## Amount of repetition")
|
| 365 |
+
|
| 366 |
+
st.markdown("Words are repeated more often in easier videos.")
|
| 367 |
+
|
| 368 |
+
st.markdown("[TODO]: Add Average rel reps histogram")
|
| 369 |
+
|
| 370 |
+
st.markdown("If you don't catch a word the first time it's said, there are more opportunities \
|
| 371 |
+
in the easier videos to hear that word again.")
|
| 372 |
+
|
| 373 |
+
st.markdown("## How many words you need to know")
|
| 374 |
+
|
| 375 |
+
st.markdown("A popular statistic in language learning circles is that you generally \
|
| 376 |
+
need to know around 98% of words in a given piece of content to understand it well. \
|
| 377 |
+
This statistic is known as 'word coverage', the percentage of words you know in a given text.")
|
| 378 |
+
|
| 379 |
+
st.markdown("How many words do you need to know to understand 98% of the words in each level?")
|
| 380 |
+
|
| 381 |
+
st.markdown("If we take all the words in CIJ, count them then order them from most common, to least common, \
|
| 382 |
+
we can calculate the word coverage you get at different vocabulary sizes. \
|
| 383 |
+
For example, if we learn the top 500 words from CIJ, then we'll know around 80% of the words in the \
|
| 384 |
+
Complete Beginner videos. And if we learn the top 4,295 words, then we'll know 98% of the words in that category.")
|
| 385 |
+
|
| 386 |
# word coverage chart
|
| 387 |
|
| 388 |
def get_word_coverage_chart():
|
|
|
|
| 695 |
word_coverage_chart = get_word_coverage_chart()
|
| 696 |
|
| 697 |
st.altair_chart(word_coverage_chart, use_container_width=True)
|
| 698 |
+
|
| 699 |
+
st.markdown("Using the same method of calculating word coverage as before, \
|
| 700 |
+
we can also calculate how many of the top words you need to know \
|
| 701 |
+
to achieve 98% word coverage in each video.")
|
| 702 |
+
|
| 703 |
+
st.markdown("[TODO]: Add ne_spot histogram")
|
| 704 |
+
|
| 705 |
+
st.markdown("In general, easier videos require smaller vocabulary sizes to understand.")
|
| 706 |
+
|
| 707 |
+
st.markdown("## Word rareness")
|
| 708 |
+
|
| 709 |
+
st.markdown("More advanced videos tend to use rare/uncommon words more often than easier videos.")
|
| 710 |
+
|
| 711 |
+
st.markdown("[TODO]: Add that log rank histogram")
|
| 712 |
+
|
| 713 |
+
st.markdown("How common a word is, is known as its 'rank'. The most common word \
|
| 714 |
+
in a text would be rank 1 and the fifth most common would be rank 5. \
|
| 715 |
+
A word with a low rank is a commonly used word (e.g., 'it', 'walk', 'up') whereas a word with a high rank \
|
| 716 |
+
is an uncommon or 'rare' word (e.g., 'esoteric', 'gauche', 'gallant').")
|
| 717 |
+
|
| 718 |
+
st.markdown("The words in the videos were compared to the ranks of words generated from a frequency list made from over 4,000 Japanese Netflix \
|
| 719 |
+
TV episodes and movies. Duplicate ranks in the videos were removed, scaled with a log \
|
| 720 |
+
function then used to compute the 25th percentile. This was necessary due \
|
| 721 |
+
to the power-law nature of word frequency distributions.")
|
| 722 |
+
|
| 723 |
+
st.markdown("(It's okay if the above didn't quite make sense to you - just know that the above graph \
|
| 724 |
+
demonstrates that easier videos tend to use more common words whereas \
|
| 725 |
+
advanced videos tend to use more rare words!)")
|
| 726 |
+
|
| 727 |
# grammar table
|
| 728 |
|
| 729 |
+
st.markdown("## Grammar")
|
| 730 |
+
|
| 731 |
+
st.markdown("Easier videos tend to use fewer [subordinating conjunctions](https://universaldependencies.org/u/pos/SCONJ.html) than harder videos.")
|
| 732 |
+
|
| 733 |
+
st.markdown("[TODO]: Add sconj histogram")
|
| 734 |
+
|
| 735 |
+
st.markdown("We also notice differences in the use of other types of words.")
|
| 736 |
+
|
| 737 |
data = {
|
| 738 |
'Complete Beginner': [0.02638719922016275 ,0.0192492959834, 0.00476028625918155, 0.2503071253071253],
|
| 739 |
'Beginner': [0.0473047304730473, 0.0266429840142095, 0.005813953488372, 0.2454068241469816],
|
|
|
|
| 769 |
# Display the styled DataFrame
|
| 770 |
st.markdown(styled_df.to_html(), unsafe_allow_html=True)
|
| 771 |
|
| 772 |
+
st.markdown("## What type of word")
|
| 773 |
+
|
| 774 |
+
st.markdown("There are three main categories of words in Japanese:")
|
| 775 |
+
st.markdown("(1) Wago (和語), (2) Kango (漢語) and (3) Gairaigo (外来語)")
|
| 776 |
+
st.markdown("Wago are native Japanese words, Kango are Chinese words and Gairaigo are foreign words.")
|
| 777 |
+
|
| 778 |
+
st.markdown("Harder videos tend to use more Kango than easier videos")
|
| 779 |
+
|
| 780 |
+
st.markdown("[TODO]: Add kango histogram")
|
| 781 |
+
|
| 782 |
+
st.markdown("In Japanese, Kango are somewhat analogous to French words in English. \
|
| 783 |
+
These words tend to be more technical or sophisticated than other words.")
|
| 784 |
+
|
| 785 |
+
st.markdown("We also notice orderings when counting the percentage of Wago and Gairaigo as well.")
|
| 786 |
+
|
| 787 |
# word origin table
|
| 788 |
|
| 789 |
data = {
|
|
|
|
| 823 |
|
| 824 |
# heatmap
|
| 825 |
|
| 826 |
+
st.markdown("## Which factors matter the most?")
|
| 827 |
+
|
| 828 |
+
st.markdown("We've just found a number of statistics that lead to orderings in the data \
|
| 829 |
+
but which statistics matter the most?")
|
| 830 |
+
|
| 831 |
+
st.markdown("To answer this, we can look at a correlation heatmap between each of the variables \
|
| 832 |
+
and observe which statistics correlate the most strongly with the video's level.")
|
| 833 |
+
|
| 834 |
num_video_df = pd.read_csv('num_video_df.tsv', sep='\t')
|
| 835 |
|
| 836 |
def render_vanilla_heatmap():
|
|
|
|
| 857 |
|
| 858 |
render_vanilla_heatmap()
|
| 859 |
|
| 860 |
+
st.markdown("In case you're not familiar with stuff like this, numbers close to 1 or -1 \
|
| 861 |
+
represent a high level of correlation and numbers close to 0 represent a low level of correlation. \
|
| 862 |
+
Positive numbers represent a positive relationship between the variables and negative numbers represent a \
|
| 863 |
+
reverse relationship between the variables.")
|
| 864 |
+
|
| 865 |
+
st.markdown("Using a statistics rule of thumb and removing all variables that have correlations \
|
| 866 |
+
weaker than 0.3 (and more than -0.3), we can identify the variables with the strongest correlations.")
|
| 867 |
+
|
| 868 |
+
|
| 869 |
+
|
| 870 |
def render_level_row_unordered():
|
| 871 |
|
| 872 |
# Compute the correlation matrix
|
|
|
|
| 926 |
if st.checkbox('Flip and sort'):
|
| 927 |
render_level_col_ordered()
|
| 928 |
else:
|
| 929 |
+
render_level_row_unordered()
|
| 930 |
+
|
| 931 |
+
|
| 932 |
+
st.markdown("To summarize (and simplify), this suggests that the most important factors in comprehensibility are:")
|
| 933 |
+
|
| 934 |
+
st.markdown("1. Rate of Speech")
|
| 935 |
+
st.markdown("2. Sentence length")
|
| 936 |
+
st.markdown("3. Amount of repetition of words")
|
| 937 |
+
st.markdown("4. How common/rare the words are")
|
| 938 |
+
st.markdown("5. Amount of subordinating conjunctions")
|
| 939 |
+
st.markdown("6. Vocabulary size")
|
| 940 |
+
st.markdown("7. Amount of adverbs")
|
| 941 |
+
st.markdown("8. Amount of Chinese words")
|
| 942 |
+
|
| 943 |
+
st.markdown("### Thanks for reading ✌️")
|
| 944 |
+
|
| 945 |
+
st.markdown("---")
|
| 946 |
+
|
| 947 |
+
st.markdown("In the unlikely chance that you happen to be a CI instructor or a CI content creator, I want to talk to you! \
|
| 948 |
+
I can be reached at hamiltonjoshuadavid@gmail.com and I'm interested in learning \
|
| 949 |
+
more about what you do. Please also add a link to your work if you decide to reach out.")
|
| 950 |
+
|
| 951 |
+
st.markdown("Special thanks to [CIJ](https://cijapanese.com/). I'm a happy subscriber and I recommend you also pick up a \
|
| 952 |
+
membership if you're a Japanese learner!")
|
| 953 |
+
|
| 954 |
+
#st.markdown("---")
|
| 955 |
+
#st.markdown("**Some extra notes:**")
|
| 956 |
+
#st.markdown("1. No statistical tests of significance were conducted. This was just meant to be a light and unrigorous EDA.")
|
| 957 |
+
#st.markdown("2. It should be noted that the levels of the videos were determined by experts, and not by learners. They do not reflect objective difficulty.")
|
| 958 |
+
#st.markdown("3. While I stated that Japanese learners tend to speak at rates of over 200 wpm, I unfortunately haven't been able to find any good sources on this. \
|
| 959 |
+
# The actual average Japanese WPM is likely even higher than 200 wpm, but unfortunately I haven't found any good research on this.")
|
| 960 |
+
#st.markdown("4. Technically, I didn't actually compute syllables per second, but rather moras per second which served as an approximation for syllables. \
|
| 961 |
+
# I understand that this is linguistically incorrect, but I didn't want to confuse the reader who might not know any Japanese or linguistics.")
|
| 962 |
+
#st.markdown("5. More data cleaning could've been done to create better frequency lists, however, this was unnecessary in order to establish statistical patterns in a one-off analysis.")
|
| 963 |
+
#st.markdown("6. As a disclaimer, I do not think that CI instructors should base how they create their content off of the findings in this analysis. \
|
| 964 |
+
# They should only use these findings for inspiration and to get them thinking more analytically about what they're doing.")
|