Spaces:

joshdavham
/

Comprehensible-Input-Analysis

Sleeping

App Files Files Community

joshdavham commited on Oct 9, 2024

Commit

84629ad

1 Parent(s): 81a6566

add some text content

Browse files

Files changed (1) hide show

app.py +354 -145

app.py CHANGED Viewed

@@ -16,167 +16,203 @@ import seaborn as sns
 #</style>
 #""", unsafe_allow_html=True)
-st.title("CIJ by the numbers")
-st.markdown("[Comprehensible Japanese (CIJ)](https://cijapanese.com/) is a \
             video platform for learning Japanese.")
 video_df = pd.read_csv("video_data.tsv", sep="\t")
 # Plot the WPM histogram
-# Data for vertical lines corresponding to each level
-line_data = pd.DataFrame({
-    'x': [75, 91, 124, 149],
-    'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
-    'text': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced']
-})
-selection = alt.selection_point(fields=['level'], bind='legend', on='click')
-highlight = alt.selection_point(name="highlight", fields=['level'], on='mouseover', empty=False)
-histogram = alt.Chart(video_df).mark_bar(
-    opacity=0.5,
-    binSpacing=3,
-    stroke='black',
-    strokeWidth=0,
-    cornerRadius=5,
-    cursor="pointer"
-).encode(
-    alt.X(
-        'wpm:Q',
-        bin=alt.Bin(maxbins=20),
-        title='Words per minute',
-        axis=alt.Axis(
-            labelFontSize=14,
-            titleFontSize=18,
-            #titleFont='Urbanist',
-            titleColor='black',
-            titleFontWeight='normal',
-            #titleFontStyle='italic',
-            titlePadding=20
-        )
-    ),
-    alt.Y(
-        'count()',
-        title="Num. videos",
-        axis=alt.Axis(
-            labelFontSize=14,
-            titleFontSize=18,
-            #titleFont='Urbanist',
-            titleColor='black',
-            titleFontWeight='normal',
-            #titleFontStyle='italic',
-            titlePadding=20,
-            tickCount=5
         ),
-        scale=alt.Scale(domain=[0,100])
-    ).stack(None),
-    alt.Color(
-        'level:N',
-        scale=alt.Scale(range=['#a5bee4', '#9ad6d8', '#c7aecd', '#dd9e9e']),
-        sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
-        legend=alt.Legend(
-            title='CIJ Level',
-            #titleFont='Urbanist',
-            titleFontSize=18,
-            titleFontWeight='bolder',
-            labelFontSize=16,
-            #labelFont='Urbanist',
-            symbolType='circle',
-            symbolSize=200,
-            symbolStrokeWidth=0,
-            orient='right',
-            direction='vertical',
-            fillColor='white',
-            padding=10,
-            cornerRadius=5,
         )
-    ),
-    tooltip=[
-        alt.Tooltip('wpm:Q', title='Words per minute:', bin=True),  # Properly indicate that `wpm` is binned
-        alt.Tooltip('level:N', title='Level:'),
-        alt.Tooltip('count()', title='Video count:')
-    ],
-    opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
-    strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
-).properties(
-    #width=750,
-    #width='container',
-    #height='container',
-    height=500,
-    #background='beige',
-    #padding=50,
-    title=alt.TitleParams(
-        text='Rate of speech in words per minute (WPM)',
-        offset=20,
-        #subtitle='(clickable)',
-        #font='Urbanist',
-        fontSize=24,
-        fontWeight='normal',
-        anchor='middle',
-        color='black',
-        subtitleFontSize=15,
-        subtitleColor='gray'
     )
-).add_params(
-    selection,
-    highlight
-)
-# Vertical lines corresponding to each level
-vertical_lines = alt.Chart(line_data).mark_rule(
-    color='red',
-    strokeWidth=6,
-    strokeDash = [10, 2], # first arg is length, second is gap
-).encode(
-    x='x:Q',
-    tooltip=[
-        alt.Tooltip('x:N', title='Median WPM:'),
-        alt.Tooltip('level:N', title='Level:')
-    ],
-    #color=alt.condition(select, 'level:N', alt.value('gray')),  # Link the color with the selection
-    color=alt.Color(
-        'level:N',
-        scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']),  # Use the same color scale as the histogram
-        sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
-        legend=None  # No legend for lines, it is already shown in the histogram
-    ),
-    opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),  # Link opacity with selection
-    strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
-).add_params(
-    selection,
-    highlight
-)
-text_labels = alt.Chart(line_data).mark_text(
-    align='center',  # Align text to the left of the line
-    dx=0,  # Offset the text to the right by 5 pixels
-    dy=-10, # Adjust vertical positioning
-    fontSize=16,
-    fontWeight='bold'
-).encode(
-    x='x:Q',
-    y=alt.value(0),  # Positioning y at the top of the chart, can be adjusted as needed
-    text=alt.Text('x:Q', format='.0f'),  # Display the x value, formatted as an integer
-    color=alt.Color(
-        'level:N',
-        scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
-        sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
-        legend=None
-    ),
-    opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),  # Link opacity with selection
-)
 if st.checkbox('Show medians'):
-    layered_chart = alt.layer(histogram, vertical_lines, text_labels, background='#f6f8fb')
 else:
-    layered_chart = alt.layer(histogram, background='#f6f8fb')
 st.altair_chart(layered_chart, use_container_width=True)
 # wpm vs sps chart
 def get_wpm_vs_sps_chart(interactive=False):
@@ -279,6 +315,74 @@ else:
 st.altair_chart(wpm_vs_sps_chart, use_container_width=True)
 # word coverage chart
 def get_word_coverage_chart():
@@ -591,8 +695,45 @@ else:
     word_coverage_chart = get_word_coverage_chart()
 st.altair_chart(word_coverage_chart, use_container_width=True)
 # grammar table
 data = {
     'Complete Beginner': [0.02638719922016275 ,0.0192492959834, 0.00476028625918155, 0.2503071253071253],
     'Beginner': [0.0473047304730473, 0.0266429840142095, 0.005813953488372, 0.2454068241469816],
@@ -628,6 +769,21 @@ styled_df = df.style.set_table_styles(
 # Display the styled DataFrame
 st.markdown(styled_df.to_html(), unsafe_allow_html=True)
 # word origin table
 data = {
@@ -667,6 +823,14 @@ st.markdown(styled_df.to_html(), unsafe_allow_html=True)
 # heatmap
 num_video_df = pd.read_csv('num_video_df.tsv', sep='\t')
 def render_vanilla_heatmap():
@@ -693,6 +857,16 @@ def render_vanilla_heatmap():
 render_vanilla_heatmap()
 def render_level_row_unordered():
     # Compute the correlation matrix
@@ -752,4 +926,39 @@ def render_level_col_ordered():
 if st.checkbox('Flip and sort'):
     render_level_col_ordered()
 else:
-    render_level_row_unordered()

 #</style>
 #""", unsafe_allow_html=True)
+st.markdown("Note: this analysis is meant to be viewed on a computer and not a phone (sorry!)")
+st.markdown("# What makes comprehensible input *comprehensible*?")
+st.markdown("**Comprehensible input** (or CI, for short) is a language teaching technique where teachers \
+            speak in a way that is understandable to their students. \
+            It is believed by many that CI is one of the most optimal and natural \
+             ways to acquire a foreign language \
+            ...but, what exactly is it about CI that makes it comprehensible?")
+st.markdown("To answer this question, I'll be analyzing the videos on \
+            [cijapanese.com](https://cijapanese.com/) (CIJ), a \
             video platform for learning Japanese.")
 video_df = pd.read_csv("video_data.tsv", sep="\t")
 # Plot the WPM histogram
+st.markdown("## How fast is CI?")
+st.markdown("If we measure how fast the teachers speak on CIJ, we find that \
+            they speak more slowly in videos meant for beginners and more quickly \
+            for advanced learners.")
+#st.markdown("### Rate of speech in words per minute (WPM)")
+def get_wpm_chart(show_medians=False):
+    # Build the layered words-per-minute (WPM) chart from the module-level
+    # `video_df`: a histogram of `wpm`, colored by `level`. When
+    # `show_medians` is true, one vertical rule and one numeric label per
+    # level are layered on top. Returns the layered Altair chart.
+
+    # Data for vertical lines corresponding to each level
+    # (`x` is shown as "Median WPM" in the rule tooltip below).
+    # NOTE(review): the `text` column is not referenced by any encoding in
+    # this function -- confirm whether it is still needed.
+    line_data = pd.DataFrame({
+        'x': [75, 91, 124, 149],
+        'level': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
+        'text': ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced']
+    })
+    # Legend-bound click selection on `level`; it drives the opacity
+    # conditions of the bars, rules and labels below.
+    selection = alt.selection_point(fields=['level'], bind='legend', on='click')
+    # Mouseover selection on `level`; it drives the strokeWidth conditions.
+    # empty=False makes an empty selection match no data, so nothing is
+    # treated as highlighted until a mark is hovered.
+    highlight = alt.selection_point(name="highlight", fields=['level'], on='mouseover', empty=False)
+    histogram = alt.Chart(video_df).mark_bar(
+        opacity=0.5,
+        binSpacing=3,
+        stroke='black',
+        strokeWidth=0,
+        cornerRadius=5,
+        cursor="pointer"
+    ).encode(
+        alt.X(
+            'wpm:Q',
+            bin=alt.Bin(maxbins=20),
+            title='Words per minute',
+            axis=alt.Axis(
+                labelFontSize=14,
+                titleFontSize=18,
+                #titleFont='Urbanist',
+                titleColor='black',
+                titleFontWeight='normal',
+                #titleFontStyle='italic',
+                titlePadding=20
+            )
         ),
+        alt.Y(
+            'count()',
+            title="Num. videos",
+            axis=alt.Axis(
+                labelFontSize=14,
+                titleFontSize=18,
+                #titleFont='Urbanist',
+                titleColor='black',
+                titleFontWeight='normal',
+                #titleFontStyle='italic',
+                titlePadding=20,
+                tickCount=5
+            ),
+            scale=alt.Scale(domain=[0,100])  # fixed y-axis domain of 0-100
+        ).stack(None),  # stacking disabled: bars of different levels overlap
+        alt.Color(
+            'level:N',
+            scale=alt.Scale(range=['#a5bee4', '#9ad6d8', '#c7aecd', '#dd9e9e']),
+            sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
+            legend=alt.Legend(
+                title='CIJ Level',
+                #titleFont='Urbanist',
+                titleFontSize=18,
+                titleFontWeight='bolder',
+                labelFontSize=16,
+                #labelFont='Urbanist',
+                symbolType='circle',
+                symbolSize=200,
+                symbolStrokeWidth=0,
+                orient='right',
+                direction='vertical',
+                fillColor='white',
+                padding=10,
+                cornerRadius=5,
+            )
+        ),
+        tooltip=[
+            alt.Tooltip('wpm:Q', title='Words per minute:', bin=True),  # Properly indicate that `wpm` is binned
+            alt.Tooltip('level:N', title='Level:'),
+            alt.Tooltip('count()', title='Video count:')
+        ],
+        # Levels outside the legend selection are faded to 0.1 opacity.
+        opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
+        # Bars of the hovered level get a thicker outline.
+        strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
+    ).properties(
+        #width=750,
+        #width='container',
+        #height='container',
+        height=500,
+        #background='beige',
+        #padding=50,
+        title=alt.TitleParams(
+            text='Rate of speech in words per minute (WPM)',
+            offset=20,
+            #subtitle='(clickable)',
+            #font='Urbanist',
+            fontSize=24,
+            fontWeight='normal',
+            anchor='middle',
+            color='black',
+            subtitleFontSize=15,
+            subtitleColor='gray'
         )
+    ).add_params(
+        selection,
+        highlight
+    )
+    # Vertical lines corresponding to each level
+    vertical_lines = alt.Chart(line_data).mark_rule(
+        color='red',
+        strokeWidth=6,
+        strokeDash = [10, 2], # first arg is length, second is gap
+    ).encode(
+        x='x:Q',
+        tooltip=[
+            alt.Tooltip('x:N', title='Median WPM:'),
+            alt.Tooltip('level:N', title='Level:')
+        ],
+        #color=alt.condition(select, 'level:N', alt.value('gray')),  # Link the color with the selection
+        # NOTE(review): this palette is NOT the histogram's pastel palette,
+        # and its last color is 'yellow' while text_labels below uses
+        # 'orange' -- confirm which is intended.
+        color=alt.Color(
+            'level:N',
+            scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']),
+            sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
+            legend=None  # No legend for lines, it is already shown in the histogram
+        ),
+        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),  # Link opacity with selection
+        strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
+    ).add_params(
+        selection,
+        highlight
+    )
+    # Numeric labels showing each rule's x value.
+    text_labels = alt.Chart(line_data).mark_text(
+        align='center',  # Center the label horizontally on its x position
+        dx=0,  # No horizontal offset
+        dy=-10, # Adjust vertical positioning
+        fontSize=16,
+        fontWeight='bold'
+    ).encode(
+        x='x:Q',
+        y=alt.value(0),  # Positioning y at the top of the chart, can be adjusted as needed
+        text=alt.Text('x:Q', format='.0f'),  # Display the x value, formatted as an integer
+        color=alt.Color(
+            'level:N',
+            scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
+            sort=['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced'],
+            legend=None
+        ),
+        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),  # Link opacity with selection
     )
+    # The rules and labels are layered over the histogram only on request;
+    # both variants share the same background color.
+    if show_medians:
+        layered_chart = alt.layer(histogram, vertical_lines, text_labels, background='#f6f8fb')
+    else:
+        layered_chart = alt.layer(histogram, background='#f6f8fb')
+    return layered_chart
 if st.checkbox('Show medians'):
+    layered_chart = get_wpm_chart(show_medians=True)
 else:
+    layered_chart = get_wpm_chart(show_medians=False)
 st.altair_chart(layered_chart, use_container_width=True)
+st.markdown("To put this data into perspective, native Japanese speakers \
+            tend to speak at rates of over 200 wpm, meaning that most of the videos \
+            on CIJ have been adapted to be a lot slower than that!")
 # wpm vs sps chart
 def get_wpm_vs_sps_chart(interactive=False):
 st.altair_chart(wpm_vs_sps_chart, use_container_width=True)
+st.markdown("We can also measure the rate of speech in syllables per second (SPS) \
+            and compare it to words per minute.")
+st.markdown("(Also, FYI, most of these **graphs are \
+            interactive** so please click around.)")
+st.markdown("## A quick statistics lesson")
+st.markdown("Before we continue this analysis, there's some basic things you should know.")
+st.markdown("### The data")
+st.markdown("The dataset we'll be analyzing comprises just under 1,000 videos. \
+            In particular, we'll be analyzing the subtitles of the videos.")
+st.markdown('Every video has a Level: **Complete Beginner**, **Beginner**, \
+            **Intermediate**, or **Advanced**.')
+st.markdown("### The statistics")
+st.markdown("The goal of this analysis is to find features in the video data that lead \
+            to a specific pattern called an \"ordering\".")
+st.markdown("We're specifically looking for *any* statistic that can lead to an \
+            ordering of the levels in one of the two following orders:")
+st.markdown("> Complete Beginner < Beginner < Intermediate < Advanced")
+st.markdown("or")
+st.markdown("> Complete Beginner > Beginner > Intermediate > Advanced")
+st.markdown("For example: if a statistic is small for Complete Beginner videos, but gets bigger \
+            for Beginner, Intermediate, then Advanced videos, it suggests \
+            that this is a good statistic for determining what makes a video comprehensible. \
+            In fact, we already saw this above when measuring the **words per minute** statistic.")
+st.markdown("Okay! Now we can continue.")
+st.markdown("## Sentence length")
+st.markdown("Videos meant for beginners tend to have shorter sentences on average.")
+st.markdown("[TODO]: Add mean sentence length graph")
+st.markdown("This makes sense because long sentences generally tend to be more complex and packed with information \
+            whereas short sentences are usually easier to understand.")
+st.markdown("## Amount of repetition")
+st.markdown("Words are repeated more often in easier videos.")
+st.markdown("[TODO]: Add Average rel reps histogram")
+st.markdown("If you don't catch a word the first time it's said, there's more opportunities \
+            in the easier videos to hear that word again.")
+st.markdown("## How many words you need to know")
+st.markdown("A popular statistic in language learning circles is that you generally \
+            need to know around 98% of words in a given piece of content to understand it well. \
+            This statistic is known as 'word coverage', the percentage of words you know in a given text.")
+st.markdown("How many words do you need to know to understand 98% of the words in each level?")
+st.markdown("If we take all the words in CIJ, count them then order them from most common, to least common, \
+             we can calculate the word coverage you get at different vocabulary sizes. \
+            For example, if we learn the top 500 words from CIJ, then we'll know around 80% of the words in the \
+            Complete Beginner videos. And if we learn the top 4,295 words, then we'll know 98% of the words in that category.")
 # word coverage chart
 def get_word_coverage_chart():
     word_coverage_chart = get_word_coverage_chart()
 st.altair_chart(word_coverage_chart, use_container_width=True)
+st.markdown("Using the same method of calculating word coverage as before, \
+            we can also calculate how many of the top words you need to know \
+            to achieve 98% word coverage in each video.")
+st.markdown("[TODO]: Add ne_spot histogram")
+st.markdown("In general, easier videos require smaller vocabulary sizes to understand.")
+st.markdown("## Word rareness")
+st.markdown("More advanced videos tend to use rare/uncommon words more often than easier videos.")
+st.markdown("[TODO]: Add the log rank histogram")
+st.markdown("How common a word is, is known as its 'rank'. The most common word \
+            in a text would be rank 1 and the fifth most common would be rank 5. \
+            A word with a low rank is a commonly used word (e.g., 'it', 'walk', 'up') whereas a word with a high rank \
+            is an uncommon or 'rare' word (e.g., 'esoteric', 'gauche', 'gallant').")
+st.markdown("The words in the videos were compared to the ranks of words generated from a frequency list made from over 4,000 Japanese Netflix \
+            TV episodes and movies. Duplicate ranks in the videos were removed, scaled with a log \
+            function then used to compute the 25th percentile. This was necessary due \
+            to the power-law nature of word frequency distributions.")
+st.markdown("(It's okay if the above didn't quite make sense to you - just know that the above graph \
+            demonstrates that easier videos tend to use more common words whereas \
+            advanced videos tend to use more rare words!)")
 # grammar table
+st.markdown("## Grammar")
+st.markdown("Easier videos tend to use fewer [subordinating conjunctions](https://universaldependencies.org/u/pos/SCONJ.html) than harder videos.")
+st.markdown("[TODO]: Add sconj histogram")
+st.markdown("We also notice differences in the use of other types of words.")
 data = {
     'Complete Beginner': [0.02638719922016275 ,0.0192492959834, 0.00476028625918155, 0.2503071253071253],
     'Beginner': [0.0473047304730473, 0.0266429840142095, 0.005813953488372, 0.2454068241469816],
 # Display the styled DataFrame
 st.markdown(styled_df.to_html(), unsafe_allow_html=True)
+st.markdown("## What type of word")
+st.markdown("There are three main categories of words in Japanese:")
+st.markdown("(1) Wago (和語), (2) Kango (漢語) and (3) Gairaigo (外来語)")
+st.markdown("Wago are native Japanese words, Kango are Chinese words and Gairaigo are foreign words.")
+st.markdown("Harder videos tend to use more Kango than easier videos")
+st.markdown("[TODO]: Add kango histogram")
+st.markdown("In Japanese, Kango are somewhat analogous to French words in English. \
+            These words tend to be more technical or sophisticated than other words.")
+st.markdown("We also notice orderings when counting the percentage of Wago and Gairaigo as well.")
 # word origin table
 data = {
 # heatmap
+st.markdown("## Which factors matter the most?")
+st.markdown("We've just found a number of statistics that lead to orderings in the data \
+            but which statistics matter the most?")
+st.markdown("To answer this, we can look at a correlation heatmap between each of the variables \
+            and observe which statistics correlate the most strongly with the video's level.")
 num_video_df = pd.read_csv('num_video_df.tsv', sep='\t')
 def render_vanilla_heatmap():
 render_vanilla_heatmap()
+st.markdown("In case you're not familiar with stuff like this, numbers close to 1 or -1 \
+            represent a high level of correlation and numbers close to 0 represent a low level of correlation. \
+            Positive numbers represent a positive relationship between the variables and negative numbers represent a \
+            reverse relationship between the variables.")
+st.markdown("Using a statistics rule of thumb and removing all variables that have correlations \
+            weaker than 0.3 (and more than -0.3), we can identify the variables with the strongest correlations.")
 def render_level_row_unordered():
     # Compute the correlation matrix
 if st.checkbox('Flip and sort'):
     render_level_col_ordered()
 else:
+    render_level_row_unordered()
+st.markdown("To summarize (and simplify), this suggests that the most important factors in comprehensibility are:")
+st.markdown("1. Rate of Speech")
+st.markdown("2. Sentence length")
+st.markdown("3. Amount of repetition of words")
+st.markdown("4. How common/rare the words are")
+st.markdown("5. Amount of subordinating conjunctions")
+st.markdown("6. Vocabulary size")
+st.markdown("7. Amount of adverbs")
+st.markdown("8. Amount of Chinese words")
+st.markdown("### Thanks for reading ✌️")
+st.markdown("---")
+st.markdown("In the unlikely chance that you happen to be a CI instructor or a CI content creator, I want to talk to you! \
+            I can be reached at hamiltonjoshuadavid@gmail.com and I'm interested in learning \
+            more about what you do. Please also add a link to your work if you decide to reach out.")
+st.markdown("Special thanks to [CIJ](https://cijapanese.com/). I'm a happy subscriber and I recommend you also pick up \
+             a membership if you're a Japanese learner!")
+#st.markdown("---")
+#st.markdown("**Some extra notes:**")
+#st.markdown("1. No statistical tests of significance were conducted. This was just meant to be a light and unrigorous EDA.")
+#st.markdown("2. It should be noted that the levels of the videos were determined by experts, and not by learners. They do not reflect objective difficulty.")
+#st.markdown("3. While I stated that native Japanese speakers tend to speak at rates of over 200 wpm, I unfortunately haven't been able to find any good sources on this. \
+#            The actual average Japanese WPM is likely even higher than 200 wpm, but unfortunately I haven't found any good research on this.")
+#st.markdown("4. Technically, I didn't actually compute syllables per second, but rather moras per second which served as an approximation for syllables. \
+#            I understand that this is linguistically incorrect, but I didn't want to confuse the reader who might not know any Japanese or linguistics.")
+#st.markdown("5. More data cleaning could've been done to create better frequency lists, however, this was unnecessary in order to establish statistical patterns in a one-off analysis.")
+#st.markdown("6. As a disclaimer, I do not think that CI instructors should base how they create their content off of the findings in this analysis. \
+#            They should only use these findings for inspiration and to get them thinking more analytically about what they're doing.")