joshdavham committed on
Commit
e864e66
·
1 Parent(s): e496ce2

add repetition and ne spot hists

Browse files
Files changed (1) hide show
  1. app.py +350 -2
app.py CHANGED
@@ -533,15 +533,200 @@ st.altair_chart(sentence_length_hist, use_container_width=True)
533
  st.markdown("This makes sense because long sentences generally tend to be more complex and packed with information \
534
  whereas short sentences are usually easier to understand.")
535
 
 
 
 
536
  st.markdown("## Amount of repetition")
537
 
538
  st.markdown("Words are repeated more often in easier videos.")
539
 
540
- st.markdown("[TODO]: Add Average rel reps histogram")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
541
 
542
  st.markdown("If you don't catch a word the first time it's said, there's more opportunities \
543
  in the easier videos to hear that word again.")
544
 
 
 
 
545
  st.markdown("## How many words you need to know")
546
 
547
  st.markdown("A popular statistic in language learning circles is that you generally \
@@ -872,7 +1057,170 @@ st.markdown("Using the same method of calculating word coverage as before, \
872
  we can also calculate how many of the top words you need to know \
873
  to achieve 98% word coverage in each video.")
874
 
875
- st.markdown("[TODO]: Add ne_spot histogram")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
876
 
877
  st.markdown("In general, easier videos require smaller vocabulary sizes to understand.")
878
 
 
533
  st.markdown("This makes sense because long sentences generally tend to be more complex and packed with information \
534
  whereas short sentences are usually easier to understand.")
535
 
536
+ ###
537
+ # AMOUNT OF REPETITION
538
+ ###
539
  st.markdown("## Amount of repetition")
540
 
541
  st.markdown("Words are repeated more often in easier videos.")
542
 
543
def get_repetition_hist(show_medians=False):
    """Build the per-level histogram of average relative word repetitions.

    Reads the enclosing-scope ``video_df`` (columns ``average_rel_reps`` and
    ``level``), stores a percentage version of the repetition rate on it as
    ``average_rel_reps_perc``, and bins the videos at or below 2% into a
    histogram coloured by level. Clicking a legend entry selects a level;
    hovering highlights it.

    Args:
        show_medians: When True, overlay one dashed vertical rule and one
            numeric label per level at the hard-coded per-level values that
            the tooltip labels as medians.

    Returns:
        An Altair layered chart with a white background.
    """
    level_order = ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced']

    # The percentage column is written onto the shared frame (as before), so it
    # stays available to any later code that reads it.
    video_df['average_rel_reps_perc'] = 100.0 * video_df['average_rel_reps']

    # Restrict to videos at or below 2% so the bulk of the data is easier to view.
    sub_video_df = video_df[video_df['average_rel_reps_perc'] <= 2.0]

    # Legend click selects a level; mouse-over highlights one (nothing is
    # highlighted while the pointer is not over a mark).
    selection = alt.selection_point(fields=['level'], bind='legend', on='click')
    highlight = alt.selection_point(name="highlight", fields=['level'], on='mouseover', empty=False)

    # A single bin definition shared by the x-axis and the tooltip, so the
    # tooltip reports exactly the bin (and units) the hovered bar is drawn for
    # and no second, differently-binned grouping key enters the aggregate.
    reps_bin = alt.Bin(maxbins=30)

    histogram = alt.Chart(sub_video_df).mark_bar(
        opacity=0.5,
        binSpacing=3,
        stroke='black',
        strokeWidth=0,
        cornerRadius=5,
        cursor="pointer"
    ).encode(
        alt.X(
            'average_rel_reps_perc:Q',
            bin=reps_bin,
            title='Average relative repetitions (%)',
            axis=alt.Axis(
                labelFontSize=14,
                titleFontSize=18,
                titleColor='black',
                titleFontWeight='normal',
                titlePadding=20,
            ),
        ),
        alt.Y(
            'count()',
            title="Num. videos",
            axis=alt.Axis(
                labelFontSize=14,
                titleFontSize=18,
                titleColor='black',
                titleFontWeight='normal',
                titlePadding=20,
                tickCount=5
            ),
            # Fixed y-range, and no stacking: the levels overlap instead.
            scale=alt.Scale(domain=[0, 100])
        ).stack(None),
        alt.Color(
            'level:N',
            scale=alt.Scale(range=['#a5bee4', '#9ad6d8', '#c7aecd', '#dd9e9e']),
            sort=level_order,
            legend=alt.Legend(
                title='CIJ Level',
                titleFontSize=18,
                titleFontWeight='bolder',
                labelFontSize=16,
                symbolType='circle',
                symbolSize=200,
                symbolStrokeWidth=0,
                orient='right',
                direction='vertical',
                fillColor='white',
                padding=10,
                cornerRadius=5,
            )
        ),
        tooltip=[
            alt.Tooltip(
                'average_rel_reps_perc:Q',
                title='Average relative repetitions (%):',
                bin=reps_bin,
            ),
            alt.Tooltip('level:N', title='Level:'),
            alt.Tooltip('count()', title='Video count:')
        ],
        opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
        strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
    ).properties(
        width='container',
        height=500,
        title=alt.TitleParams(
            text='Relative repetitions of words',
            offset=20,
            fontSize=24,
            fontWeight='normal',
            anchor='middle',
            color='black',
            subtitleFontSize=15,
            subtitleColor='gray'
        )
    ).add_params(
        selection,
        highlight
    )

    if not show_medians:
        return alt.layer(histogram, background='white')

    # Hard-coded per-level positions for the vertical rules (in the same
    # percentage units as the x-axis).
    line_data = pd.DataFrame({
        'x': [0.99, 0.62, 0.37, 0.23],
        'level': level_order,
        'text': level_order
    })

    vertical_lines = alt.Chart(line_data).mark_rule(
        color='red',
        strokeWidth=6,
        strokeDash=[10, 2],  # dash length, then gap length
    ).encode(
        alt.X(
            'x:Q'
        ),
        tooltip=[
            alt.Tooltip('x:N', title='Median average relative repetitions:'),
            alt.Tooltip('level:N', title='Level:')
        ],
        # NOTE(review): this range differs from the histogram palette and from
        # the label palette below ('yellow' vs 'orange'). Layers share a color
        # scale unless resolved independently - confirm which range is intended.
        color=alt.Color(
            'level:N',
            scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']),
            sort=level_order,
            legend=None  # the histogram layer already provides the legend
        ),
        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),  # fade unselected levels
        strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1)),
    ).add_params(
        selection,
        highlight
    )

    text_labels = alt.Chart(line_data).mark_text(
        align='center',  # centre the label on its rule
        dx=0,            # no horizontal offset
        dy=-10,          # nudge the label upwards
        fontSize=16,
        fontWeight='bold'
    ).encode(
        alt.X(
            'x:Q'
        ),
        y=alt.value(0),  # pixel 0, i.e. the top edge of the plot area
        text=alt.Text('x:Q', format='.2f'),  # show the value with two decimals
        color=alt.Color(
            'level:N',
            scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
            sort=level_order,
            legend=None
        ),
        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),  # fade unselected levels
    )

    return alt.layer(histogram, vertical_lines, text_labels, background='white')
712
+
713
# The checkbox state decides whether the median overlay is drawn.
repetition_hist = get_repetition_hist(
    show_medians=st.checkbox('Show medians', key='repetition')
)

st.altair_chart(repetition_hist, use_container_width=True)
722
+
723
 
724
  st.markdown("If you don't catch a word the first time it's said, there's more opportunities \
725
  in the easier videos to hear that word again.")
726
 
727
+ ###
728
+ # HOW MANY WORDS
729
+ ###
730
  st.markdown("## How many words you need to know")
731
 
732
  st.markdown("A popular statistic in language learning circles is that you generally \
 
1057
  we can also calculate how many of the top words you need to know \
1058
  to achieve 98% word coverage in each video.")
1059
 
1060
def get_ne_spot_hist(show_medians=False):
    """Build the per-level histogram of vocabulary size needed for 98% coverage.

    Reads the enclosing-scope ``video_df`` (columns ``ne_spot`` and ``level``)
    and bins the videos by ``ne_spot`` into a histogram coloured by level.
    Clicking a legend entry selects a level; hovering highlights it.

    Args:
        show_medians: When True, overlay one dashed vertical rule and one
            numeric label per level at the hard-coded per-level values that
            the tooltip labels as medians.

    Returns:
        An Altair layered chart with a white background.
    """
    level_order = ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced']

    # Legend click selects a level; mouse-over highlights one (nothing is
    # highlighted while the pointer is not over a mark).
    selection = alt.selection_point(fields=['level'], bind='legend', on='click')
    highlight = alt.selection_point(name="highlight", fields=['level'], on='mouseover', empty=False)

    # A single bin definition shared by the x-axis and the tooltip, so the
    # tooltip reports exactly the bin the hovered bar is drawn for and no
    # second, differently-binned grouping key enters the aggregate.
    ne_spot_bin = alt.Bin(maxbins=30)

    histogram = alt.Chart(video_df).mark_bar(
        opacity=0.5,
        binSpacing=3,
        stroke='black',
        strokeWidth=0,
        cornerRadius=5,
        cursor="pointer"
    ).encode(
        alt.X(
            'ne_spot:Q',
            bin=ne_spot_bin,
            title='Number of most common CIJ words known',
            axis=alt.Axis(
                labelFontSize=14,
                titleFontSize=18,
                titleColor='black',
                titleFontWeight='normal',
                titlePadding=20,
            )
        ),
        alt.Y(
            'count()',
            title="Num. videos",
            axis=alt.Axis(
                labelFontSize=14,
                titleFontSize=18,
                titleColor='black',
                titleFontWeight='normal',
                titlePadding=20,
                tickCount=5
            ),
            # Fixed y-range, and no stacking: the levels overlap instead.
            scale=alt.Scale(domain=[0, 40])
        ).stack(None),
        alt.Color(
            'level:N',
            scale=alt.Scale(range=['#a5bee4', '#9ad6d8', '#c7aecd', '#dd9e9e']),
            sort=level_order,
            legend=alt.Legend(
                title='CIJ Level',
                titleFontSize=18,
                titleFontWeight='bolder',
                labelFontSize=16,
                symbolType='circle',
                symbolSize=200,
                symbolStrokeWidth=0,
                orient='right',
                direction='vertical',
                fillColor='white',
                padding=10,
                cornerRadius=5,
            )
        ),
        tooltip=[
            alt.Tooltip('ne_spot:Q', title='Vocab size needed for 98% cov:', bin=ne_spot_bin),
            alt.Tooltip('level:N', title='Level:'),
            alt.Tooltip('count()', title='Video count:')
        ],
        opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
        strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
    ).properties(
        width='container',
        height=500,
        title=alt.TitleParams(
            text='Vocab size needed for 98% coverage',
            offset=20,
            fontSize=24,
            fontWeight='normal',
            anchor='middle',
            color='black',
            subtitleFontSize=15,
            subtitleColor='gray'
        )
    ).add_params(
        selection,
        highlight
    )

    if not show_medians:
        return alt.layer(histogram, background='white')

    # Hard-coded per-level positions for the vertical rules (same units as the x-axis).
    line_data = pd.DataFrame({
        'x': [3859, 5229, 6698, 7925],
        'level': level_order,
        'text': level_order
    })

    vertical_lines = alt.Chart(line_data).mark_rule(
        color='red',
        strokeWidth=6,
        strokeDash=[10, 2],  # dash length, then gap length
    ).encode(
        x='x:Q',
        tooltip=[
            alt.Tooltip('x:N', title='Median vocab size needed for 98% cov:'),
            alt.Tooltip('level:N', title='Level:')
        ],
        # NOTE(review): this range differs from the histogram palette and from
        # the label palette below ('yellow' vs 'orange'). Layers share a color
        # scale unless resolved independently - confirm which range is intended.
        color=alt.Color(
            'level:N',
            scale=alt.Scale(range=['red', 'green', 'blue', 'yellow']),
            sort=level_order,
            legend=None  # the histogram layer already provides the legend
        ),
        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),  # fade unselected levels
        strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
    ).add_params(
        selection,
        highlight
    )

    text_labels = alt.Chart(line_data).mark_text(
        align='center',  # centre the label on its rule
        dx=0,            # no horizontal offset
        dy=-10,          # nudge the label upwards
        fontSize=16,
        fontWeight='bold'
    ).encode(
        x='x:Q',
        y=alt.value(0),  # pixel 0, i.e. the top edge of the plot area
        text=alt.Text('x:Q', format='.0f'),  # show the value without decimals
        color=alt.Color(
            'level:N',
            scale=alt.Scale(range=['red', 'green', 'blue', 'orange']),
            sort=level_order,
            legend=None
        ),
        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),  # fade unselected levels
    )

    return alt.layer(histogram, vertical_lines, text_labels, background='white')
1214
+
1215
# The checkbox state decides whether the median overlay is drawn.
ne_spot_hist = get_ne_spot_hist(
    show_medians=st.checkbox('Show medians', key='ne_spot')
)

st.altair_chart(ne_spot_hist, use_container_width=True)
1224
 
1225
  st.markdown("In general, easier videos require smaller vocabulary sizes to understand.")
1226