"""Streamlit app: 'What makes comprehensible input comprehensible?'

Analyzes video subtitle statistics from cijapanese.com (CIJ) and renders
interactive Altair charts plus seaborn correlation heatmaps. Expects three
TSV files next to the script: video_data.tsv, word_coverage_df_plot.tsv,
num_video_df.tsv.
"""
import streamlit as st
import pandas as pd
import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns

st.set_page_config(
    page_title='What makes comprehensible input comprehensible?',
    page_icon='favicon.svg'
)

# colors white the index columns of rendered dataframes
# NOTE(review): the CSS payload appears to have been stripped from this copy
# of the source — restore the original <style> block here.
st.markdown(
    """
    """,
    unsafe_allow_html=True
)

# Shared constants: CIJ difficulty levels and the color palettes used
# consistently across every chart in the app.
LEVELS = ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced']
LEVEL_COLORS = ['#a5bee4', '#9ad6d8', '#c7aecd', '#dd9e9e']
# Median rule lines and their text labels deliberately use different palettes
# (yellow rules, orange labels) — preserved from the original.
RULE_COLORS = ['red', 'green', 'blue', 'yellow']
LABEL_COLORS = ['red', 'green', 'blue', 'orange']


# functions for loading data
@st.cache_data
def load_dataframes():
    """Load the three TSV datasets used throughout the app."""
    video_df = pd.read_csv("video_data.tsv", sep="\t")
    word_coverage_df = pd.read_csv('word_coverage_df_plot.tsv', sep='\t')
    num_video_df = pd.read_csv('num_video_df.tsv', sep='\t')
    return video_df, word_coverage_df, num_video_df


def _styled_level_table(df):
    """Apply the shared per-level header colors, white cells, and percent
    formatting to a dataframe whose columns are the four CIJ levels."""
    header_colors = {
        'Complete Beginner': 'rgba(165, 190, 228, 0.45)',
        'Beginner': 'rgba(154, 214, 216, 0.45)',
        'Intermediate': 'rgba(199, 174, 205, 0.45)',
        'Advanced': 'rgba(221, 158, 158, 0.45)',
    }
    styles = {
        col: [
            {'selector': 'th.col_heading.level0',
             'props': [('background-color', color)]},
            {'selector': 'td:hover',
             'props': [('background-color', '#e0f7fa')]},
        ]
        for col, color in header_colors.items()
    }
    return (
        df.style
        .set_table_styles(styles)
        .set_properties(**{'background-color': 'white'})
        .format("{:.2%}")
    )


def get_grammar_table():
    """Return a styled table of median part-of-speech proportions per level."""
    data = {
        'Complete Beginner': [0.02638719922016275, 0.0192492959834,
                              0.00476028625918155, 0.2503071253071253,
                              0.18554386037363785, 0.01622086690206438,
                              0.04537920642893019, 0.1203097143691203],
        'Beginner': [0.0473047304730473, 0.0266429840142095,
                     0.005813953488372, 0.2454068241469816,
                     0.1773049645390071, 0.01384083044982699,
                     0.02676864244741874, 0.13333333333333333],
        'Intermediate': [0.06625719079578135, 0.03514773095199635,
                         0.0087719298245614, 0.23239271705403663,
                         0.1587691162151326, 0.010784997932175352,
                         0.022392603507910194, 0.13379268084136123],
        'Advanced': [0.0766787658802177, 0.0373056994818652,
                     0.0108588351431391, 0.2237101220953131,
                     0.14922184925236498, 0.009050978304272594,
                     0.020185708518368994, 0.1364369670430975],
    }
    df = pd.DataFrame(data)
    df.index = ['Median Perc. Subordinating Conjunctions',
                'Median Perc. Adverbs',
                'Median Perc. Determiners',
                'Median Perc. Nouns',
                'Median Perc. Auxiliaries',
                'Median Perc. Numerals',
                'Median Perc. Pronouns',
                'Median Perc. Verbs']
    return _styled_level_table(df)


def get_word_origin_table():
    """Return a styled table of median word-origin proportions per level."""
    data = {
        'Complete Beginner': [0.06999874574159035, 0.8578043261266064,
                              0.03301790801790795],
        'Beginner': [0.0955284552845528, 0.8399311531841652,
                     0.0279441117764471],
        'Intermediate': [0.1165702954621605, 0.8259877335615461,
                         0.0241447813837379],
        'Advanced': [0.1303328645100797, 0.8225274725274725,
                     0.0157535445475231],
    }
    df = pd.DataFrame(data)
    # Fixed typo in displayed label: 'Garaigo' -> 'Gairaigo' (外来語).
    df.index = ['Median Perc. Kango (漢語)',
                'Median Perc. Wago (和語)',
                'Median Perc. Gairaigo (外来語)']
    return _styled_level_table(df)


# ---------------------------------------------------------------------------
# Shared chart-building helpers (factored out of ~8 near-identical functions)
# ---------------------------------------------------------------------------

def _axis(padding=20, tick_count=None):
    """Standard axis styling used by every chart."""
    kwargs = dict(labelFontSize=14, titleFontSize=18, titleColor='black',
                  titleFontWeight='normal', titlePadding=padding)
    if tick_count is not None:
        kwargs['tickCount'] = tick_count
    return alt.Axis(**kwargs)


def _title(text):
    """Standard chart title styling."""
    return alt.TitleParams(
        text=text, offset=20, fontSize=24, fontWeight='normal',
        anchor='middle', color='black',
        subtitleFontSize=15, subtitleColor='gray'
    )


def _cij_legend(boxed=True):
    """Standard 'CIJ Level' legend. boxed=True adds the white fill and zero
    symbol stroke used by the histograms (the scatter/line charts omit them)."""
    kwargs = dict(title='CIJ Level', titleFontSize=18, titleFontWeight='bolder',
                  labelFontSize=16, symbolType='circle', symbolSize=200,
                  orient='right', direction='vertical',
                  padding=10, cornerRadius=5)
    if boxed:
        kwargs.update(symbolStrokeWidth=0, fillColor='white')
    return alt.Legend(**kwargs)


def _median_line_data(medians):
    """One row per level: x = that level's median value."""
    return pd.DataFrame({'x': medians, 'level': LEVELS, 'text': LEVELS})


def _median_layers(line_data, tooltip_title, fmt, rule_width=6):
    """Build the dashed median rule lines and their numeric text labels.

    Returns (rules, labels) layers; both dim when a legend selection is
    active elsewhere, and the rules thicken on hover.
    """
    rules = alt.Chart(line_data).mark_rule(
        color='red', strokeWidth=rule_width, strokeDash=[10, 2],
    ).encode(
        x='x:Q',
        tooltip=[
            alt.Tooltip('x:N', title=tooltip_title),
            alt.Tooltip('level:N', title='Level:'),
        ],
        color=alt.Color('level:N', scale=alt.Scale(range=RULE_COLORS),
                        sort=LEVELS, legend=None),
        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
        strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1)),
    ).add_params(selection, highlight)

    labels = alt.Chart(line_data).mark_text(
        align='center', dx=0, dy=-10, fontSize=16, fontWeight='bold',
    ).encode(
        x='x:Q',
        y=alt.value(0),  # pin labels to the top of the plot area
        text=alt.Text('x:Q', format=fmt),
        color=alt.Color('level:N', scale=alt.Scale(range=LABEL_COLORS),
                        sort=LEVELS, legend=None),
        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
    )
    return rules, labels


def _level_histogram(data, x_field, x_title, max_bins, y_domain,
                     tooltip_title, chart_title, x_pad=20, fill_width=True):
    """Build the standard per-level overlaid (unstacked) histogram."""
    props = dict(height=500, title=_title(chart_title))
    if fill_width:
        props['width'] = 'container'
    return alt.Chart(data).mark_bar(
        opacity=0.5, binSpacing=3, stroke='black', strokeWidth=0,
        cornerRadius=5, cursor="pointer"
    ).encode(
        alt.X(
            f'{x_field}:Q',
            bin=alt.Bin(maxbins=max_bins),
            title=x_title,
            axis=_axis(padding=x_pad)
        ),
        alt.Y(
            'count()',
            title="Num. videos",
            axis=_axis(tick_count=5),
            scale=alt.Scale(domain=y_domain)
        ).stack(None),  # overlay the four level distributions, don't stack
        alt.Color('level:N', scale=alt.Scale(range=LEVEL_COLORS),
                  sort=LEVELS, legend=_cij_legend()),
        tooltip=[
            alt.Tooltip(f'{x_field}:Q', title=tooltip_title, bin=True),
            alt.Tooltip('count()', title='Video count:'),
            alt.Tooltip('level:N', title='Level:'),
        ],
        opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
        strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1)),
    ).properties(**props).add_params(selection, highlight)


def _layer_with_medians(histogram, show_medians, line_data, tooltip_title, fmt):
    """Layer the histogram with (optionally) the median rules and labels."""
    if show_medians:
        rules, labels = _median_layers(line_data, tooltip_title, fmt)
        return alt.layer(histogram, rules, labels, background='white')
    return alt.layer(histogram, background='white')


# functions for loading data visualizations
@st.cache_data
def get_wpm_chart(show_medians=False):
    """Histogram of words-per-minute per level; optional median overlays."""
    line_data = _median_line_data([75, 91, 124, 149])
    histogram = _level_histogram(
        video_df, 'wpm', 'Words per minute', 20, [0, 100],
        'Words per minute:', 'Rate of speech in words per minute (WPM)',
        fill_width=False  # original WPM chart did not set width='container'
    )
    return _layer_with_medians(histogram, show_medians, line_data,
                               'Median WPM:', '.0f')


@st.cache_data
def get_wpm_vs_sps_chart(interactive=False):
    """Scatter of syllables-per-second vs. words-per-minute per video."""
    scatter_plot = alt.Chart(video_df).mark_circle(
        cursor='pointer',
        size=80,
    ).encode(
        x=alt.X('wpm:Q', scale=alt.Scale(domain=[30, 215]),
                title='Words per minute', axis=_axis()),
        y=alt.Y('sps:Q', title='Syllables per second', axis=_axis()),
        color=alt.Color('level:N', scale=alt.Scale(range=LEVEL_COLORS),
                        sort=LEVELS, legend=_cij_legend(boxed=False)),
        tooltip=[
            alt.Tooltip('video:N', title='Video number:'),
            alt.Tooltip('wpm:Q', title='WPM:'),
            alt.Tooltip('sps:Q', title='SPS:'),
            alt.Tooltip('level:N', title='Level:'),
        ],
        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.2)),
    ).properties(
        width='container',
        height=500,
        title=_title('Rate of speech: Syllables per second vs. words per minute')
    ).add_params(
        selection, highlight
    ).configure(
        background='white'
    )
    if interactive:
        return scatter_plot.interactive()
    return scatter_plot


@st.cache_data
def get_sentence_length_hist(show_medians=False):
    """Histogram of mean sentence length (words) per level."""
    line_data = _median_line_data([7.60, 10.45, 16.17, 19.39])
    histogram = _level_histogram(
        video_df, 'mean_sentence_length', 'Words per sentence', 30, [0, 100],
        'Average sentence length:',
        'Average sentence length (words per sentence)'
    )
    return _layer_with_medians(histogram, show_medians, line_data,
                               'Median avg. sentence length:', '.2f')


@st.cache_data
def get_repetition_hist(show_medians=False):
    """Histogram of average per-word repetition (%) per level.

    Videos above 2% repetition are excluded as outliers. Works on a local
    copy so the cached function doesn't mutate the global video_df.
    """
    df = video_df.assign(average_rel_reps_perc=100.0 * video_df['average_rel_reps'])
    sub_video_df = df[df['average_rel_reps_perc'] <= 2.0]
    line_data = _median_line_data([0.99, 0.62, 0.37, 0.23])
    # Bug fix: tooltip previously binned the raw fraction ('average_rel_reps')
    # while the x-axis binned the percentage — now both use the same field.
    histogram = _level_histogram(
        sub_video_df, 'average_rel_reps_perc', 'Word repetitions (%)',
        30, [0, 100], 'Average repetitions (%):',
        'Average amount of repetition per word'
    )
    return _layer_with_medians(histogram, show_medians, line_data,
                               'Median avg. repetitions (%):', '.2f')


@st.cache_data
def get_word_coverage_chart(zoom=False):
    """Word-coverage curves per level, with 98%-coverage rule lines.

    zoom=True restricts to the >=90% coverage region of the curves.
    """
    if zoom:
        word_coverage_df_sub = word_coverage_df.loc[word_coverage_df['coverage_perc'] >= 90]
    else:
        word_coverage_df_sub = word_coverage_df
    line_data = _median_line_data([4295, 5606, 6853, 9085])

    line_chart = alt.Chart(word_coverage_df_sub).mark_line(
        cursor='pointer',
        point=False,
    ).encode(
        x=alt.X(
            'rank:Q',
            scale=alt.Scale(domain=[1000, 16000]) if zoom else alt.Scale(domain=[-10, 16000]),
            title='Number of words known',
            axis=_axis()
        ),
        y=alt.Y(
            'coverage_perc:Q',
            scale=alt.Scale(domain=[90, 101]) if zoom else alt.Scale(domain=[0, 105]),
            title='% of words understood',
            axis=_axis(tick_count=5)
        ),
        color=alt.Color('level:N', scale=alt.Scale(range=LEVEL_COLORS),
                        sort=LEVELS, legend=_cij_legend(boxed=False)),
        tooltip=[
            alt.Tooltip('word:N', title='Word: '),
            alt.Tooltip('rank:Q', title="CIJ rank: "),
            alt.Tooltip('coverage_perc_str:N', title='Word coverage: '),
            alt.Tooltip('level:N', title='Curve: '),
        ],
        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.2)),
        strokeWidth=alt.condition(selection | highlight, alt.value(6), alt.value(2)),
    ).properties(
        width='container',
        height=500,
        title=_title('Word coverage curves')
    ).add_params(
        selection, highlight
    )

    rules, labels = _median_layers(line_data, 'Words needed to reach 98%:',
                                   '.0f', rule_width=4)
    return alt.layer(line_chart, rules, labels, background='white')


@st.cache_data
def get_ne_spot_hist(show_medians=False):
    """Histogram of vocab size needed for 98% coverage, per level."""
    line_data = _median_line_data([3859, 5229, 6698, 7925])
    histogram = _level_histogram(
        video_df, 'ne_spot', 'Number of words known', 30, [0, 40],
        'Vocab size for 98%.:',
        'Vocab size needed for 98% coverage (videos)'
    )
    return _layer_with_medians(histogram, show_medians, line_data,
                               'Median vocab size needed for 98% cov:', '.0f')


@st.cache_data
def get_tfplr_hist(show_medians=False):
    """Histogram of 25th-percentile word-frequency log ranks, per level."""
    line_data = _median_line_data([3.82, 4.30, 4.76, 5.21])
    histogram = _level_histogram(
        video_df, 'tfp_log_ranks_unique', 'Log ranks', 30, [0, 80],
        '25th perc. log rank:',
        '25th percentile word-frequency log ranks', x_pad=30
    )
    return _layer_with_medians(histogram, show_medians, line_data,
                               'Median 25th perc. log rank:', '.2f')


@st.cache_data
def get_sconj_hist(show_medians=False):
    """Histogram of subordinating-conjunction percentage, per level."""
    # Local copy: don't mutate the global video_df from a cached function.
    df = video_df.assign(sconj_props_perc=100.0 * video_df['sconj_props'])
    line_data = _median_line_data([2.64, 4.73, 6.63, 7.67])
    histogram = _level_histogram(
        df, 'sconj_props_perc', 'Percentage of sub. conj.', 30, [0, 50],
        'Perc. sub. conj:',
        'Frequency of subordinating conjunctions', x_pad=30
    )
    return _layer_with_medians(histogram, show_medians, line_data,
                               'Median perc. of sub. conj:', '.2f')


@st.cache_data
def get_kango_hist(show_medians=False):
    """Histogram of kango (Sino-Japanese word) percentage, per level."""
    # Local copy: don't mutate the global video_df from a cached function.
    df = video_df.assign(kan_props_perc=100.0 * video_df['kan_props'])
    line_data = _median_line_data([7.00, 9.55, 11.66, 13.03])
    histogram = _level_histogram(
        df, 'kan_props_perc', 'Percentage of kango', 30, [0, 40],
        'Percentage of kango:',
        'Frequency of kango', x_pad=30
    )
    # Bug fix: median labels were formatted '.0f', truncating decimal medians
    # like 9.55 — use '.2f' to match the data (and the other decimal charts).
    return _layer_with_medians(histogram, show_medians, line_data,
                               'Median perc. kango:', '.2f')


@st.cache_data
def render_vanilla_heatmap():
    """Render the full correlation heatmap, rows/cols sorted by corr. with Level."""
    corr_matrix = num_video_df.corr()
    variable_of_interest = 'Level'
    sorted_vars = corr_matrix[variable_of_interest].sort_values(ascending=False).index
    sorted_corr_matrix = corr_matrix.loc[sorted_vars, sorted_vars]
    plt.figure(figsize=(10, 8))
    sns.heatmap(sorted_corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
    st.pyplot(plt.gcf())


@st.cache_data
def render_level_row_unordered():
    """Render a single-row heatmap of correlations with Level (signed sort)."""
    corr_matrix = num_video_df.drop(
        ['Proportion of determiners', 'Proportion of nouns',
         'Proportion of wago', 'Proportion of gairaigo',
         'Proportion of verbs', 'Proportion of numerals'],
        axis=1
    ).corr()
    variable_of_interest = 'Level'
    sorted_vars = corr_matrix[variable_of_interest].sort_values(ascending=False).index
    sorted_vars = sorted_vars.drop(variable_of_interest)
    first_row_matrix = corr_matrix.loc[[variable_of_interest], sorted_vars]
    plt.figure(figsize=(10, 1))
    sns.heatmap(first_row_matrix, annot=True, cmap='coolwarm', fmt=".3f",
                cbar_kws={'label': 'Correlation'})
    st.pyplot(plt.gcf())


@st.cache_data
def render_level_col_ordered():
    """Render a single-column heatmap of correlations with Level,
    ordered by absolute correlation strength."""
    corr_matrix = num_video_df.drop(
        ['Proportion of determiners', 'Proportion of nouns',
         'Proportion of wago', 'Proportion of gairaigo',
         'Proportion of verbs', 'Proportion of numerals'],
        axis=1
    ).corr()
    variable_of_interest = 'Level'
    correlations = corr_matrix[variable_of_interest]
    sorted_vars = correlations.abs().sort_values(ascending=False).index
    sorted_vars = sorted_vars.drop(variable_of_interest)
    sorted_corr_matrix = corr_matrix.loc[[variable_of_interest], sorted_vars]
    transposed_corr_matrix = sorted_corr_matrix.T
    plt.figure(figsize=(2, 3))
    sns.heatmap(transposed_corr_matrix, annot=True, cmap='coolwarm', fmt=".3f",
                cbar_kws={'label': 'Correlation'})
    st.pyplot(plt.gcf())


# load the data
video_df, word_coverage_df, num_video_df = load_dataframes()
grammar_table = get_grammar_table()
word_origin_table = get_word_origin_table()

# allows interactivity in the vega altair plots
selection = alt.selection_point(fields=['level'], bind='legend', on='click')
highlight = alt.selection_point(name="highlight", fields=['level'],
                                on='mouseover', empty=False)

###
# INTRO
###

st.markdown("Note: this analysis is meant to be viewed on a computer and not a phone (sorry!)")
st.markdown("[Code and data can be found [here](https://github.com/joshdavham/cij-analysis)]")
st.markdown("# What makes comprehensible input *comprehensible*?")
st.markdown("**Comprehensible input** (or CI, for short) is a language learning "
            "method where teachers provide their students with lots of language "
            "“input” that has been adapted to a level that they can understand. "
            "It is believed by many that CI is one of the most natural and "
            "effective ways to acquire a foreign language.")
st.markdown("…but what exactly is it about comprehensible input that makes it so *comprehensible*?")
st.markdown("To answer this question, we'll be analyzing the videos on "
            "[cijapanese.com](https://cijapanese.com/) (CIJ), a "
            "CI platform for learning Japanese.")

###
# RATE OF SPEECH
###

st.markdown("## How fast is CI?")
st.markdown("If we measure how fast the teachers speak on CIJ, we find that "
            "they speak more slowly in videos meant for beginners and more quickly "
            "in videos meant for advanced learners.")
st.markdown("**(THESE GRAPHS ARE CLICKABLE)**")

if st.checkbox('Show medians', value=True, key='wpm'):
    layered_chart = get_wpm_chart(show_medians=True)
else:
    layered_chart = get_wpm_chart(show_medians=False)
st.altair_chart(layered_chart, use_container_width=True)

st.markdown("To put the above data into perspective, native Japanese speakers "
            "can speak at rates of over 200 wpm, meaning that most of the videos "
            "on CIJ have been adapted to be a lot slower than that!")
st.markdown("We can also measure the rate of speech in syllables per second (SPS) "
            "and compare it to words per minute.")

if st.checkbox('Enable zooming and panning ( ↕ / ↔️ )'):
    wpm_vs_sps_chart = get_wpm_vs_sps_chart(interactive=True)
else:
    wpm_vs_sps_chart = get_wpm_vs_sps_chart(interactive=False)
st.altair_chart(wpm_vs_sps_chart, use_container_width=True)

###
# STATISTICS LESSON
###

st.markdown("## A quick statistics lesson")
st.markdown("Before we continue the analysis, there's some basic things you should know.")
st.markdown("### The data")
st.markdown("The dataset we'll be analyzing comprises of just under 1,000 videos. "
            "In particular, we'll be analyzing the subtitles of the videos.")
st.markdown('Also, every video has a level: **Complete Beginner**, **Beginner**, '
            '**Intermediate**, or **Advanced**.')
st.markdown("### The statistics")
st.markdown("The goal of this analysis is to find features in the video data that lead "
            "to a specific pattern called an \"ordering\".")
st.markdown("We're specifically looking for *any* statistic that can lead to an "
            "ordering of the levels in either of the two following directions:")
st.markdown("> Complete Beginner < Beginner < Intermediate < Advanced")
st.markdown("or")
st.markdown("> Complete Beginner > Beginner > Intermediate > Advanced")
st.markdown("For example: if a statistic is small for Complete Beginner videos, but gets bigger "
            "for Beginner, Intermediate, then Advanced videos, it suggests "
            "that this is a good statistic for determining what makes a video comprehensible. "
            "In fact, we already saw this above when measuring the [words per minute statistic](#how-fast-is-ci).")
# NOTE(review): the source chunk is truncated mid-statement here ('st.markdown("Okay!');
# restore the remainder of the original text from the full file.
st.markdown("Okay!")
Now we can continue.") ### # SENTENCE LENGTH ### st.markdown("## Sentence length") st.markdown("Videos meant for beginners tend to have shorter sentences on average.") if st.checkbox('Show medians', value=True, key='sentence_length'): sentence_length_hist = get_sentence_length_hist(show_medians=True) else: sentence_length_hist = get_sentence_length_hist(show_medians=False) st.altair_chart(sentence_length_hist, use_container_width=True) st.markdown("This makes sense because long sentences can be more complex and packed with information \ whereas short sentences are usually simpler.") ### # AMOUNT OF REPETITION ### st.markdown("## Amount of repetition") st.markdown("Words are repeated more often in easier videos.") if st.checkbox('Show medians', value=True, key='repetition'): repetition_hist = get_repetition_hist(show_medians=True) else: repetition_hist = get_repetition_hist(show_medians=False) st.altair_chart(repetition_hist, use_container_width=True) st.markdown("If you don't catch a word the first time it's said, there's more opportunities \ in the easier videos to hear that word repeated again.") ### # HOW MANY WORDS ### st.markdown("## How many words you need to know") st.markdown("A popular statistic in language learning circles is that you generally \ need to know around 98% of the words in a given piece of content in order to be able to understand it well. \ This statistic is known as 'word coverage' - the percentage of words you know in a given text.") st.markdown("How many words do you need to know in order to understand 98% of the words in each level?") st.markdown("If we take all of the words from each of the CIJ videos, count them and then order them from most common to least common, \ we can calculate the word coverage you get at different vocabulary sizes. \ For example, if we learn the top 500 words from CIJ, then we'll know around 80% of the words in the \ Complete Beginner videos. 
And if we learn the top 4,295 words, then we'll know 98% of the words in the Complete Beginner videos.") if st.checkbox('Zoom in'): word_coverage_chart = get_word_coverage_chart(zoom=True) else: word_coverage_chart = get_word_coverage_chart(zoom=False) st.altair_chart(word_coverage_chart, use_container_width=True) st.markdown("Using this same method of calculating word coverage, \ we can also calculate how many of the top words from CIJ you need to know \ in order to achieve 98% word coverage in each video.") if st.checkbox('Show medians', value=True, key='ne_spot'): ne_spot_hist = get_ne_spot_hist(show_medians=True) else: ne_spot_hist = get_ne_spot_hist(show_medians=False) st.altair_chart(ne_spot_hist, use_container_width=True) st.markdown("In general, easier videos require smaller vocabulary sizes to understand.") ### # WORD RARENESS ### st.markdown("## Word rareness") st.markdown("Harder videos use rarer words.") if st.checkbox('Show medians', value=True, key='tfplr'): # tfplr stands for "twenty fifth percentile log rank" tfplr_hist = get_tfplr_hist(show_medians=True) else: tfplr_hist = get_tfplr_hist(show_medians=False) st.altair_chart(tfplr_hist, use_container_width=True) st.markdown("How common a word is, is known as its 'rank'. The most common word \ in a text would be rank 1 and the fifth most common would be rank 5. \ A word with a low rank is a commonly used word (e.g., 'and', 'work', 'that') whereas a word with a high rank \ is an uncommon or 'rare' word (e.g., 'esoteric', 'gauche', 'opprobrium'). Furthermore, \ a list of word ranks is known as a 'frequency list'.") st.markdown("The ranks of the words in the videos were compared with a larger, independent frequency list and then scaled with a log function \ before computing the twenty fifth percentile. 
This was done to make for a better visualization.") st.markdown("Note: it's okay if the above values don't quite make sense to you - just know that the graph \ demonstrates that easier videos tend to use common words more often whereas \ advanced videos tend to use rarer words more often.") ### # GRAMMAR ### st.markdown("## Grammar") st.markdown("Easier videos use less [subordinating conjunctions](https://universaldependencies.org/ja/pos/SCONJ.html) than harder videos.") if st.checkbox('Show medians', value=True, key='sconj'): sconj_hist = get_sconj_hist(show_medians=True) else: sconj_hist = get_sconj_hist(show_medians=False) st.altair_chart(sconj_hist, use_container_width=True) st.markdown("We also notice differences in the use of other types of words.") st.markdown( '