Commit ·
4b92973
1
Parent(s): e864e66
add word rareness hist
Browse files
app.py
CHANGED
|
@@ -1224,11 +1224,177 @@ st.altair_chart(ne_spot_hist, use_container_width=True)
|
|
| 1224 |
|
| 1225 |
st.markdown("In general, easier videos require smaller vocabulary sizes to understand.")
|
| 1226 |
|
|
|
|
|
|
|
|
|
|
| 1227 |
st.markdown("## Word rareness")
|
| 1228 |
|
| 1229 |
st.markdown("More advanced videos tend to use rare/uncommon words more often than easier videos.")
|
| 1230 |
|
| 1231 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1232 |
|
| 1233 |
st.markdown("How common a word is, is known as its 'rank'. The most common word \
|
| 1234 |
in a text would be rank 1 and the fifth most common would be rank 5. \
|
|
|
|
| 1224 |
|
| 1225 |
st.markdown("In general, easier videos require smaller vocabulary sizes to understand.")
|
| 1226 |
|
| 1227 |
+
###
|
| 1228 |
+
# WORD RARENESS
|
| 1229 |
+
###
|
| 1230 |
st.markdown("## Word rareness")
|
| 1231 |
|
| 1232 |
st.markdown("More advanced videos tend to use rare/uncommon words more often than easier videos.")
|
| 1233 |
|
| 1234 |
+
def get_tfplr_hist(show_medians=False):
    """Build the layered histogram of ``tfp_log_ranks_unique`` by level.

    Bins the ``tfp_log_ranks_unique`` column of the module-level ``video_df``
    (at most 30 bins) and draws one overlaid, unstacked bar series per
    ``level``. Two point selections on ``level`` are attached: one bound to
    the legend and to clicks (dims everything that is not selected), and one
    driven by mouseover (thickens the stroke of the hovered level).

    Args:
        show_medians: When true, add a dashed vertical rule and a two-decimal
            numeric label for each level at the hard-coded positions in
            ``line_data``.

    Returns:
        The ``alt.layer(...)`` chart with a white background, containing the
        histogram and, if requested, the rule and label layers.
    """
    level_order = ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced']

    # Hard-coded x position per level for the optional markers; the tooltip
    # below presents these as the per-level medians.
    line_data = pd.DataFrame({
        'x': [3.82, 4.30, 4.76, 5.21],
        'level': level_order,
    })

    # Palette declared by both marker layers (rules and labels) so the two
    # stay in agreement.
    # NOTE(review): layers share one colour scale by default in Vega-Lite, so
    # the histogram's palette may take precedence over this one - confirm
    # whether `.resolve_scale(color='independent')` is wanted.
    median_palette = ['red', 'green', 'blue', 'orange']

    selection = alt.selection_point(fields=['level'], bind='legend', on='click')
    highlight = alt.selection_point(
        name="highlight", fields=['level'], on='mouseover', empty=False
    )

    # Axis styling common to both axes; only padding / tick count differ.
    axis_style = dict(
        labelFontSize=14,
        titleFontSize=18,
        titleColor='black',
        titleFontWeight='normal',
    )

    # NOTE(review): `opacity` and `strokeWidth` are set both here on the mark
    # and again as conditional encodings below; presumably the encodings win.
    histogram = alt.Chart(video_df).mark_bar(
        opacity=0.5,
        binSpacing=3,
        stroke='black',
        strokeWidth=0,
        cornerRadius=5,
        cursor="pointer",
    ).encode(
        alt.X(
            'tfp_log_ranks_unique:Q',
            bin=alt.Bin(maxbins=30),
            title='Log ranks',
            axis=alt.Axis(titlePadding=30, **axis_style),
        ),
        alt.Y(
            'count()',
            title="Num. videos",
            axis=alt.Axis(titlePadding=20, tickCount=5, **axis_style),
            scale=alt.Scale(domain=[0, 80]),
        ).stack(None),  # overlay the per-level series instead of stacking them
        alt.Color(
            'level:N',
            scale=alt.Scale(range=['#a5bee4', '#9ad6d8', '#c7aecd', '#dd9e9e']),
            sort=level_order,
            legend=alt.Legend(
                title='CIJ Level',
                titleFontSize=18,
                titleFontWeight='bolder',
                labelFontSize=16,
                symbolType='circle',
                symbolSize=200,
                symbolStrokeWidth=0,
                orient='right',
                direction='vertical',
                fillColor='white',
                padding=10,
                cornerRadius=5,
            ),
        ),
        tooltip=[
            # bin=True marks this tooltip field as the binned
            # `tfp_log_ranks_unique` value used on the x axis.
            alt.Tooltip(
                'tfp_log_ranks_unique:Q',
                title='25th percentile word-frequency log rank:',
                bin=True,
            ),
            alt.Tooltip('level:N', title='Level:'),
            alt.Tooltip('count()', title='Video count:'),
        ],
        opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
        strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1)),
    ).properties(
        width='container',
        height=500,
        title=alt.TitleParams(
            text='25th percentile word-frequency log ranks',
            offset=20,
            fontSize=24,
            fontWeight='normal',
            anchor='middle',
            color='black',
            subtitleFontSize=15,
            subtitleColor='gray',
        ),
    ).add_params(
        selection,
        highlight,
    )

    # One dashed vertical rule per level at its `x` position.
    vertical_lines = alt.Chart(line_data).mark_rule(
        color='red',
        strokeWidth=6,
        strokeDash=[10, 2],  # [dash length, gap length]
    ).encode(
        x='x:Q',
        tooltip=[
            # Quantitative with two decimals so the tooltip matches the
            # on-chart label (e.g. "4.30" rather than "4.3").
            alt.Tooltip(
                'x:Q',
                title='Median 25th percentile word-frequency log rank:',
                format='.2f',
            ),
            alt.Tooltip('level:N', title='Level:'),
        ],
        color=alt.Color(
            'level:N',
            scale=alt.Scale(range=median_palette),
            sort=level_order,
            legend=None,  # the histogram layer already provides the legend
        ),
        # Dim the rules of levels that are not selected.
        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
        strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1)),
    ).add_params(
        selection,
        highlight,
    )

    # Numeric label for each rule.
    text_labels = alt.Chart(line_data).mark_text(
        align='center',  # centre the label horizontally on the rule
        dx=0,            # no horizontal offset
        dy=-10,          # shift the label 10 px upwards
        fontSize=16,
        fontWeight='bold',
    ).encode(
        x='x:Q',
        y=alt.value(0),  # pixel 0, i.e. the top edge of the plot area
        text=alt.Text('x:Q', format='.2f'),  # x value with two decimals
        color=alt.Color(
            'level:N',
            scale=alt.Scale(range=median_palette),
            sort=level_order,
            legend=None,
        ),
        # Dim the labels of levels that are not selected.
        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
    )

    layers = [histogram]
    if show_medians:
        layers += [vertical_lines, text_labels]

    return alt.layer(*layers, background='white')
|
| 1388 |
+
|
| 1389 |
+
# Draw the word-rareness histogram; the checkbox state decides whether the
# per-level median markers are layered on top.
tfplr_hist = get_tfplr_hist(show_medians=st.checkbox('Show medians', key='tfplr'))
st.altair_chart(tfplr_hist, use_container_width=True)
|
| 1398 |
|
| 1399 |
st.markdown("How common a word is, is known as its 'rank'. The most common word \
|
| 1400 |
in a text would be rank 1 and the fifth most common would be rank 5. \
|