joshdavham commited on
Commit
84629ad
·
1 Parent(s): 81a6566

add some text content

Browse files
Files changed (1) hide show
  1. app.py +354 -145
app.py CHANGED
@@ -16,167 +16,203 @@ import seaborn as sns
16
  #</style>
17
  #""", unsafe_allow_html=True)
18
 
19
- st.title("CIJ by the numbers")
20
 
21
- st.markdown("[Comprehensible Japanese (CIJ)](https://cijapanese.com/) is a \
 
 
 
 
 
 
 
 
 
 
 
22
  video platform for learning Japanese.")
23
 
24
  video_df = pd.read_csv("video_data.tsv", sep="\t")
25
 
26
  # Plot the WPM histogram
27
 
28
- # Data for vertical lines corresponding to each level
29
- line_data = pd.DataFrame({
30
- 'x': [75, 91, 124, 149],
31
- 'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
32
- 'text': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced']
33
- })
34
-
35
- selection = alt.selection_point(fields=['level'], bind='legend', on='click')
36
-
37
- highlight = alt.selection_point(name="highlight", fields=['level'], on='mouseover', empty=False)
38
-
39
- histogram = alt.Chart(video_df).mark_bar(
40
- opacity=0.5,
41
- binSpacing=3,
42
- stroke='black',
43
- strokeWidth=0,
44
- cornerRadius=5,
45
- cursor="pointer"
46
- ).encode(
47
- alt.X(
48
- 'wpm:Q',
49
- bin=alt.Bin(maxbins=20),
50
- title='Words per minute',
51
- axis=alt.Axis(
52
- labelFontSize=14,
53
- titleFontSize=18,
54
- #titleFont='Urbanist',
55
- titleColor='black',
56
- titleFontWeight='normal',
57
- #titleFontStyle='italic',
58
- titlePadding=20
59
- )
60
- ),
61
- alt.Y(
62
- 'count()',
63
- title="Num. videos",
64
- axis=alt.Axis(
65
- labelFontSize=14,
66
- titleFontSize=18,
67
- #titleFont='Urbanist',
68
- titleColor='black',
69
- titleFontWeight='normal',
70
- #titleFontStyle='italic',
71
- titlePadding=20,
72
- tickCount=5
73
  ),
74
- scale=alt.Scale(domain=[0,100])
75
- ).stack(None),
76
- alt.Color(
77
- 'level:N',
78
- scale=alt.Scale(range=['#a5bee4', '#9ad6d8', '#c7aecd', '#dd9e9e']),
79
- sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
80
- legend=alt.Legend(
81
- title='CIJ Level',
82
- #titleFont='Urbanist',
83
- titleFontSize=18,
84
- titleFontWeight='bolder',
85
- labelFontSize=16,
86
- #labelFont='Urbanist',
87
- symbolType='circle',
88
- symbolSize=200,
89
- symbolStrokeWidth=0,
90
- orient='right',
91
- direction='vertical',
92
- fillColor='white',
93
- padding=10,
94
- cornerRadius=5,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  )
96
- ),
97
- tooltip=[
98
- alt.Tooltip('wpm:Q', title='Words per minute:', bin=True), # Properly indicate that `wpm` is binned
99
- alt.Tooltip('level:N', title='Level:'),
100
- alt.Tooltip('count()', title='Video count:')
101
- ],
102
- opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
103
- strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
104
- ).properties(
105
- #width=750,
106
- #width='container',
107
- #height='container',
108
- height=500,
109
- #background='beige',
110
- #padding=50,
111
- title=alt.TitleParams(
112
- text='Rate of speech in words per minute (WPM)',
113
- offset=20,
114
- #subtitle='(clickable)',
115
- #font='Urbanist',
116
- fontSize=24,
117
- fontWeight='normal',
118
- anchor='middle',
119
- color='black',
120
- subtitleFontSize=15,
121
- subtitleColor='gray'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  )
123
- ).add_params(
124
- selection,
125
- highlight
126
- )
127
-
128
- # Vertical lines corresponding to each level
129
- vertical_lines = alt.Chart(line_data).mark_rule(
130
- color='red',
131
- strokeWidth=6,
132
- strokeDash = [10, 2], # first arg is length, second is gap
133
- ).encode(
134
- x='x:Q',
135
- tooltip=[
136
- alt.Tooltip('x:N', title='Median WPM:'),
137
- alt.Tooltip('level:N', title='Level:')
138
- ],
139
- #color=alt.condition(select, 'level:N', alt.value('gray')), # Link the color with the selection
140
- color=alt.Color(
141
- 'level:N',
142
- scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']), # Use the same color scale as the histogram
143
- sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
144
- legend=None # No legend for lines, it is already shown in the histogram
145
- ),
146
- opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)), # Link opacity with selection
147
- strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
148
- ).add_params(
149
- selection,
150
- highlight
151
- )
152
-
153
- text_labels = alt.Chart(line_data).mark_text(
154
- align='center', # Align text to the left of the line
155
- dx=0, # Offset the text to the right by 5 pixels
156
- dy=-10, # Adjust vertical positioning
157
- fontSize=16,
158
- fontWeight='bold'
159
- ).encode(
160
- x='x:Q',
161
- y=alt.value(0), # Positioning y at the top of the chart, can be adjusted as needed
162
- text=alt.Text('x:Q', format='.0f'), # Display the x value, formatted as an integer
163
- color=alt.Color(
164
- 'level:N',
165
- scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
166
- sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
167
- legend=None
168
- ),
169
- opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)), # Link opacity with selection
170
- )
171
 
172
 
173
  if st.checkbox('Show medians'):
174
- layered_chart = alt.layer(histogram, vertical_lines, text_labels, background='#f6f8fb')
 
 
175
  else:
176
- layered_chart = alt.layer(histogram, background='#f6f8fb')
 
177
 
178
  st.altair_chart(layered_chart, use_container_width=True)
179
 
 
 
 
 
180
  # wpm vs sps chart
181
 
182
  def get_wpm_vs_sps_chart(interactive=False):
@@ -279,6 +315,74 @@ else:
279
 
280
  st.altair_chart(wpm_vs_sps_chart, use_container_width=True)
281
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
  # word coverage chart
283
 
284
  def get_word_coverage_chart():
@@ -591,8 +695,45 @@ else:
591
  word_coverage_chart = get_word_coverage_chart()
592
 
593
  st.altair_chart(word_coverage_chart, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
594
  # grammar table
595
 
 
 
 
 
 
 
 
 
596
  data = {
597
  'Complete Beginner': [0.02638719922016275 ,0.0192492959834, 0.00476028625918155, 0.2503071253071253],
598
  'Beginner': [0.0473047304730473, 0.0266429840142095, 0.005813953488372, 0.2454068241469816],
@@ -628,6 +769,21 @@ styled_df = df.style.set_table_styles(
628
  # Display the styled DataFrame
629
  st.markdown(styled_df.to_html(), unsafe_allow_html=True)
630
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
631
  # word origin table
632
 
633
  data = {
@@ -667,6 +823,14 @@ st.markdown(styled_df.to_html(), unsafe_allow_html=True)
667
 
668
  # heatmap
669
 
 
 
 
 
 
 
 
 
670
  num_video_df = pd.read_csv('num_video_df.tsv', sep='\t')
671
 
672
  def render_vanilla_heatmap():
@@ -693,6 +857,16 @@ def render_vanilla_heatmap():
693
 
694
  render_vanilla_heatmap()
695
 
 
 
 
 
 
 
 
 
 
 
696
  def render_level_row_unordered():
697
 
698
  # Compute the correlation matrix
@@ -752,4 +926,39 @@ def render_level_col_ordered():
752
  if st.checkbox('Flip and sort'):
753
  render_level_col_ordered()
754
  else:
755
- render_level_row_unordered()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  #</style>
17
  #""", unsafe_allow_html=True)
18
 
19
+ st.markdown("Note: this analysis is meant to be viewed on a computer and not a phone (sorry!)")
20
 
21
+ st.markdown("# What makes comprehensible input *comprehensible*?")
22
+
23
+ st.markdown("**Comprehensible input** (or CI, for short) is a language teaching technique where teachers \
24
+ speak in a way that is understandable to their students. \
25
+ It is believed by many that CI is one of the most optimal and natural \
26
+ ways to acquire a foreign language \
27
+ ...but, what exactly is it about CI that makes it comprehensible?")
28
+
29
+
30
+
31
+ st.markdown("To answer this question, I'll be analyzing the videos on \
32
+ [cijapanese.com](https://cijapanese.com/) (CIJ), a \
33
  video platform for learning Japanese.")
34
 
35
  video_df = pd.read_csv("video_data.tsv", sep="\t")
36
 
37
  # Plot the WPM histogram
38
 
39
+ st.markdown("## How fast is CI?")
40
+
41
+ st.markdown("If we measure how fast the teachers speak on CIJ, we find that \
42
+ they speak more slowly in videos meant for beginners and more quickly \
43
+ for advanced learners.")
44
+
45
+ #st.markdown("### Rate of speech in words per minute (WPM)")
46
+
47
+ def get_wpm_chart(show_medians=False):
48
+
49
+ # Data for vertical lines corresponding to each level
50
+ line_data = pd.DataFrame({
51
+ 'x': [75, 91, 124, 149],
52
+ 'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
53
+ 'text': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced']
54
+ })
55
+
56
+ selection = alt.selection_point(fields=['level'], bind='legend', on='click')
57
+
58
+ highlight = alt.selection_point(name="highlight", fields=['level'], on='mouseover', empty=False)
59
+
60
+ histogram = alt.Chart(video_df).mark_bar(
61
+ opacity=0.5,
62
+ binSpacing=3,
63
+ stroke='black',
64
+ strokeWidth=0,
65
+ cornerRadius=5,
66
+ cursor="pointer"
67
+ ).encode(
68
+ alt.X(
69
+ 'wpm:Q',
70
+ bin=alt.Bin(maxbins=20),
71
+ title='Words per minute',
72
+ axis=alt.Axis(
73
+ labelFontSize=14,
74
+ titleFontSize=18,
75
+ #titleFont='Urbanist',
76
+ titleColor='black',
77
+ titleFontWeight='normal',
78
+ #titleFontStyle='italic',
79
+ titlePadding=20
80
+ )
 
 
 
81
  ),
82
+ alt.Y(
83
+ 'count()',
84
+ title="Num. videos",
85
+ axis=alt.Axis(
86
+ labelFontSize=14,
87
+ titleFontSize=18,
88
+ #titleFont='Urbanist',
89
+ titleColor='black',
90
+ titleFontWeight='normal',
91
+ #titleFontStyle='italic',
92
+ titlePadding=20,
93
+ tickCount=5
94
+ ),
95
+ scale=alt.Scale(domain=[0,100])
96
+ ).stack(None),
97
+ alt.Color(
98
+ 'level:N',
99
+ scale=alt.Scale(range=['#a5bee4', '#9ad6d8', '#c7aecd', '#dd9e9e']),
100
+ sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
101
+ legend=alt.Legend(
102
+ title='CIJ Level',
103
+ #titleFont='Urbanist',
104
+ titleFontSize=18,
105
+ titleFontWeight='bolder',
106
+ labelFontSize=16,
107
+ #labelFont='Urbanist',
108
+ symbolType='circle',
109
+ symbolSize=200,
110
+ symbolStrokeWidth=0,
111
+ orient='right',
112
+ direction='vertical',
113
+ fillColor='white',
114
+ padding=10,
115
+ cornerRadius=5,
116
+ )
117
+ ),
118
+ tooltip=[
119
+ alt.Tooltip('wpm:Q', title='Words per minute:', bin=True), # Properly indicate that `wpm` is binned
120
+ alt.Tooltip('level:N', title='Level:'),
121
+ alt.Tooltip('count()', title='Video count:')
122
+ ],
123
+ opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
124
+ strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
125
+ ).properties(
126
+ #width=750,
127
+ #width='container',
128
+ #height='container',
129
+ height=500,
130
+ #background='beige',
131
+ #padding=50,
132
+ title=alt.TitleParams(
133
+ text='Rate of speech in words per minute (WPM)',
134
+ offset=20,
135
+ #subtitle='(clickable)',
136
+ #font='Urbanist',
137
+ fontSize=24,
138
+ fontWeight='normal',
139
+ anchor='middle',
140
+ color='black',
141
+ subtitleFontSize=15,
142
+ subtitleColor='gray'
143
  )
144
+ ).add_params(
145
+ selection,
146
+ highlight
147
+ )
148
+
149
+ # Vertical lines corresponding to each level
150
+ vertical_lines = alt.Chart(line_data).mark_rule(
151
+ color='red',
152
+ strokeWidth=6,
153
+ strokeDash = [10, 2], # first arg is length, second is gap
154
+ ).encode(
155
+ x='x:Q',
156
+ tooltip=[
157
+ alt.Tooltip('x:N', title='Median WPM:'),
158
+ alt.Tooltip('level:N', title='Level:')
159
+ ],
160
+ #color=alt.condition(select, 'level:N', alt.value('gray')), # Link the color with the selection
161
+ color=alt.Color(
162
+ 'level:N',
163
+ scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']), # Use the same color scale as the histogram
164
+ sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
165
+ legend=None # No legend for lines, it is already shown in the histogram
166
+ ),
167
+ opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)), # Link opacity with selection
168
+ strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
169
+ ).add_params(
170
+ selection,
171
+ highlight
172
+ )
173
+
174
+ text_labels = alt.Chart(line_data).mark_text(
175
+ align='center', # Align text to the left of the line
176
+ dx=0, # Offset the text to the right by 5 pixels
177
+ dy=-10, # Adjust vertical positioning
178
+ fontSize=16,
179
+ fontWeight='bold'
180
+ ).encode(
181
+ x='x:Q',
182
+ y=alt.value(0), # Positioning y at the top of the chart, can be adjusted as needed
183
+ text=alt.Text('x:Q', format='.0f'), # Display the x value, formatted as an integer
184
+ color=alt.Color(
185
+ 'level:N',
186
+ scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
187
+ sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
188
+ legend=None
189
+ ),
190
+ opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)), # Link opacity with selection
191
  )
192
+
193
+
194
+ if show_medians:
195
+ layered_chart = alt.layer(histogram, vertical_lines, text_labels, background='#f6f8fb')
196
+ else:
197
+ layered_chart = alt.layer(histogram, background='#f6f8fb')
198
+
199
+ return layered_chart
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
 
201
 
202
  if st.checkbox('Show medians'):
203
+
204
+ layered_chart = get_wpm_chart(show_medians=True)
205
+
206
  else:
207
+
208
+ layered_chart = get_wpm_chart(show_medians=False)
209
 
210
  st.altair_chart(layered_chart, use_container_width=True)
211
 
212
+ st.markdown("To put this data into perspective, native Japanese speakers \
213
+ tend to speak at rates of over 200 wpm, meaning that most of the videos \
214
+ on CIJ have been adapted to be a lot slower than that!")
215
+
216
  # wpm vs sps chart
217
 
218
  def get_wpm_vs_sps_chart(interactive=False):
 
315
 
316
  st.altair_chart(wpm_vs_sps_chart, use_container_width=True)
317
 
318
+ st.markdown("We can also measure the rate of speech in syllables per second (SPS) \
319
+ and compare it to words per minute.")
320
+
321
+ st.markdown("(Also, FYI, most of these **graphs are \
322
+ interactive** so please click around.)")
323
+
324
+ st.markdown("## A quick statistics lesson")
325
+
326
+ st.markdown("Before we continue this analysis, there's some basic things you should know.")
327
+
328
+ st.markdown("### The data")
329
+
330
+ st.markdown("The dataset we'll be analyzing comprises just under 1,000 videos. \
331
+ In particular, we'll be analyzing the subtitles of the videos.")
332
+
333
+ st.markdown('Every video has a Level: **Complete Beginner**, **Beginner**, \
334
+ **Intermediate**, or **Advanced**.')
335
+
336
+ st.markdown("### The statistics")
337
+
338
+ st.markdown("The goal of this analysis is to find features in the video data that lead \
339
+ to a specific pattern called an \"ordering\".")
340
+
341
+ st.markdown("We're specifically looking for *any* statistic that can lead to an \
342
+ ordering of the levels in one of the two following orders:")
343
+
344
+ st.markdown("> Complete Beginner < Beginner < Intermediate < Advanced")
345
+ st.markdown("or")
346
+ st.markdown("> Complete Beginner > Beginner > Intermediate > Advanced")
347
+
348
+ st.markdown("For example: if a statistic is small for Complete Beginner videos, but gets bigger \
349
+ for Beginner, Intermediate, then Advanced videos, it suggests \
350
+ that this is a good statistic for determining what makes a video comprehensible. \
351
+ In fact, we already saw this above when measuring the **words per minute** statistic.")
352
+
353
+ st.markdown("Okay! Now we can continue.")
354
+
355
+ st.markdown("## Sentence length")
356
+
357
+ st.markdown("Videos meant for beginners tend to have shorter sentences on average.")
358
+
359
+ st.markdown("[TODO]: Add mean sentence length graph")
360
+
361
+ st.markdown("This makes sense because long sentences generally tend to be more complex and packed with information \
362
+ whereas short sentences are usually easier to understand.")
363
+
364
+ st.markdown("## Amount of repetition")
365
+
366
+ st.markdown("Words are repeated more often in easier videos.")
367
+
368
+ st.markdown("[TODO]: Add Average rel reps histogram")
369
+
370
+ st.markdown("If you don't catch a word the first time it's said, there's more opportunities \
371
+ in the easier videos to hear that word again.")
372
+
373
+ st.markdown("## How many words you need to know")
374
+
375
+ st.markdown("A popular statistic in language learning circles is that you generally \
376
+ need to know around 98% of words in a given piece of content to understand it well. \
377
+ This statistic is known as 'word coverage', the percentage of words you know in a given text.")
378
+
379
+ st.markdown("How many words do you need to know to understand 98% of the words in each level?")
380
+
381
+ st.markdown("If we take all the words in CIJ, count them then order them from most common, to least common, \
382
+ we can calculate the word coverage you get at different vocabulary sizes. \
383
+ For example, if we learn the top 500 words from CIJ, then we'll know around 80% of the words in the \
384
+ Complete Beginner videos. And if we learn the top 4,295 words, then we'll know 98% of the words in that category.")
385
+
386
  # word coverage chart
387
 
388
  def get_word_coverage_chart():
 
695
  word_coverage_chart = get_word_coverage_chart()
696
 
697
  st.altair_chart(word_coverage_chart, use_container_width=True)
698
+
699
+ st.markdown("Using the same method of calculating word coverage as before, \
700
+ we can also calculate how many of the top words you need to know \
701
+ to achieve 98% word coverage in each video.")
702
+
703
+ st.markdown("[TODO]: Add ne_spot histogram")
704
+
705
+ st.markdown("In general, easier videos require smaller vocabulary sizes to understand.")
706
+
707
+ st.markdown("## Word rareness")
708
+
709
+ st.markdown("More advanced videos tend to use rare/uncommon words more often than easier videos.")
710
+
711
+ st.markdown("[TODO]: Add that that log rank histogram")
712
+
713
+ st.markdown("How common a word is, is known as its 'rank'. The most common word \
714
+ in a text would be rank 1 and the fifth most common would be rank 5. \
715
+ A word with a low rank is a commonly used word (e.g., 'it', 'walk', 'up') whereas a word with a high rank \
716
+ is an uncommon or 'rare' word (e.g., 'esoteric', 'gauche', 'gallant').")
717
+
718
+ st.markdown("The words in the videos were compared to the ranks of words generated from a frequency list made from over 4,000 Japanese Netflix \
719
+ TV episodes and movies. Duplicate ranks in the videos were removed, scaled with a log \
720
+ function then used to compute the 25th percentile. This was necessary due \
721
+ to power-law nature of word frequency distributions.")
722
+
723
+ st.markdown("(It's okay if the above didn't quite make sense to you - just know that the above graph \
724
+ demonstrates that easier videos tend to use more common words whereas \
725
+ advanced videos tend to use more rare words!)")
726
+
727
  # grammar table
728
 
729
+ st.markdown("## Grammar")
730
+
731
+ st.markdown("Easier videos tend to use less [subordinating conjunctions](https://universaldependencies.org/u/pos/SCONJ.html) than harder videos.")
732
+
733
+ st.markdown("[TODO]: Add sconj histogram")
734
+
735
+ st.markdown("We also notice differences in the use of other types of words.")
736
+
737
  data = {
738
  'Complete Beginner': [0.02638719922016275 ,0.0192492959834, 0.00476028625918155, 0.2503071253071253],
739
  'Beginner': [0.0473047304730473, 0.0266429840142095, 0.005813953488372, 0.2454068241469816],
 
769
  # Display the styled DataFrame
770
  st.markdown(styled_df.to_html(), unsafe_allow_html=True)
771
 
772
+ st.markdown("## What type of word")
773
+
774
+ st.markdown("There are three main categories of words in Japanese:")
775
+ st.markdown("(1) Wago (和語), (2) Kango (漢語) and (3) Gairaigo (外来語)")
776
+ st.markdown("Wago are native Japanese words, Kango are Chinese words and Gairaigo are foreign words.")
777
+
778
+ st.markdown("Harder videos tend to use more Kango than easier videos")
779
+
780
+ st.markdown("[TODO]: Add kango histogram")
781
+
782
+ st.markdown("In Japanese, Kango are somewhat analogous to French words in English. \
783
+ These words tend to be more technical or sophisticated than other words.")
784
+
785
+ st.markdown("We also notice orderings when counting the percentage of Wago and Gairaigo as well.")
786
+
787
  # word origin table
788
 
789
  data = {
 
823
 
824
  # heatmap
825
 
826
+ st.markdown("## Which factors matter the most?")
827
+
828
+ st.markdown("We've just found a number of statistics that lead to orderings in the data \
829
+ but which statistics matter the most?")
830
+
831
+ st.markdown("To answer this, we can look at a correlation heatmap between each of the variables \
832
+ and observe which statistics correlate the most strongly with the video's level.")
833
+
834
  num_video_df = pd.read_csv('num_video_df.tsv', sep='\t')
835
 
836
  def render_vanilla_heatmap():
 
857
 
858
  render_vanilla_heatmap()
859
 
860
+ st.markdown("In case you're not familiar with stuff like this, numbers close to 1 or -1 \
861
+ represent a high level of correlation and numbers close to 0 represent a low level of correlation. \
862
+ Positive numbers represent a positive relationship between the variables and negative numbers represent a \
863
+ reverse relationship between the variables.")
864
+
865
+ st.markdown("Using a statistics rule of thumb and removing all variables that have correlations \
866
+ weaker than 0.3 (and more than -0.3), we can identify the variables with the strongest correlations.")
867
+
868
+
869
+
870
  def render_level_row_unordered():
871
 
872
  # Compute the correlation matrix
 
926
  if st.checkbox('Flip and sort'):
927
  render_level_col_ordered()
928
  else:
929
+ render_level_row_unordered()
930
+
931
+
932
+ st.markdown("To summarize (and simplify), this suggests that the most important factors in comprehensibility are:")
933
+
934
+ st.markdown("1. Rate of Speech")
935
+ st.markdown("2. Sentence length")
936
+ st.markdown("3. Amount of repetition of words")
937
+ st.markdown("4. How common/rare the words are")
938
+ st.markdown("5. Amount of subordinating conjunctions")
939
+ st.markdown("6. Vocabulary size")
940
+ st.markdown("7. Amount of adverbs")
941
+ st.markdown("8. Amount of Chinese words")
942
+
943
+ st.markdown("### Thanks for reading ✌️")
944
+
945
+ st.markdown("---")
946
+
947
+ st.markdown("In the unlikely chance that you happen to be a CI instructor or a CI content creator, I want to talk to you! \
948
+ I can be reached at hamiltonjoshuadavid@gmail.com and I'm interested in learning \
949
+ more about what you do. Please also add a link to your work if you decide to reach out.")
950
+
951
+ st.markdown("Special thanks to [CIJ](https://cijapanese.com/). I'm a happy subscriber and I recommend you also pick up a \
952
+ membership if you're a Japanese learner!")
953
+
954
+ #st.markdown("---")
955
+ #st.markdown("**Some extra notes:**")
956
+ #st.markdown("1. No statistical tests of significance were conducted. This was just meant to be a light and unrigorous EDA.")
957
+ #st.markdown("2. It should be noted that the levels of the videos were determined by experts, and not by learners. They do not reflect objective difficulty.")
958
+ #st.markdown("3. While I stated that native Japanese speakers tend to speak at rates of over 200 wpm, I unfortunately haven't been able to find any good sources on this. \
959
+ # The actual average Japanese WPM is likely even higher than 200 wpm, but unfortunately I haven't found any good research on this.")
960
+ #st.markdown("4. Technically, I didn't actually compute syllables per second, but rather moras per second which served as an approximation for syllables. \
961
+ # I understand that this is linguistically incorrect, but I didn't want to confuse the reader who might not know any Japanese or linguistics.")
962
+ #st.markdown("5. More data cleaning could've been done to create better frequency lists, however, this was unnecessary in order to establish statistical patterns in a one-off analysis.")
963
+ #st.markdown("6. As a disclaimer, I do not think that CI instructors should base how they create their content off of the findings in this analysis. \
964
+ # They should only use these findings for inspiration and to get them thinking more analytically about what they're doing.")