joshdavham committed on
Commit
4b92973
·
1 Parent(s): e864e66

add word rareness hist

Browse files
Files changed (1) hide show
  1. app.py +167 -1
app.py CHANGED
@@ -1224,11 +1224,177 @@ st.altair_chart(ne_spot_hist, use_container_width=True)
1224
 
1225
  st.markdown("In general, easier videos require smaller vocabulary sizes to understand.")
1226
 
 
 
 
1227
  st.markdown("## Word rareness")
1228
 
1229
  st.markdown("More advanced videos tend to use rare/uncommon words more often than easier videos.")
1230
 
1231
- st.markdown("[TODO]: Add that that log rank histogram")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1232
 
1233
  st.markdown("How common a word is, is known as its 'rank'. The most common word \
1234
  in a text would be rank 1 and the fifth most common would be rank 5. \
 
1224
 
1225
  st.markdown("In general, easier videos require smaller vocabulary sizes to understand.")
1226
 
1227
+ ###
1228
+ # WORD RARENESS
1229
+ ###
1230
  st.markdown("## Word rareness")
1231
 
1232
  st.markdown("More advanced videos tend to use rare/uncommon words more often than easier videos.")
1233
 
1234
def get_tfplr_hist(show_medians=False):
    """Build the histogram of 25th-percentile word-frequency log ranks.

    The histogram bins the ``tfp_log_ranks_unique`` column of the module-level
    ``video_df`` and colors the (unstacked, overlapping) bars by ``level``.
    Clicking a legend entry selects a level and fades the others; hovering
    drives the ``highlight`` selection used for stroke width.

    Args:
        show_medians: When true, overlay one dashed vertical rule per level at
            the hard-coded median value, plus a text label showing that value.

    Returns:
        An Altair layered chart with a white background.

    Note:
        Relies on ``alt``, ``pd`` and ``video_df`` being defined at module
        level elsewhere in the file.
    """
    # Single source of truth for the level order used by every color encoding.
    levels = ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced']
    bar_colors = ['#a5bee4', '#9ad6d8', '#c7aecd', '#dd9e9e']
    # Shared by the median rules and their labels so the two layers agree.
    median_colors = ['red', 'green', 'blue', 'orange']

    # Axis styling common to both axes; only padding / tick count differ.
    axis_style = dict(
        labelFontSize=14,
        titleFontSize=18,
        titleColor='black',
        titleFontWeight='normal',
    )

    # Legend-bound click selection: picks which level stays fully opaque.
    selection = alt.selection_point(fields=['level'], bind='legend', on='click')

    # Hover selection; empty=False means nothing is highlighted by default.
    highlight = alt.selection_point(
        name="highlight", fields=['level'], on='mouseover', empty=False
    )

    histogram = alt.Chart(video_df).mark_bar(
        opacity=0.5,
        binSpacing=3,
        stroke='black',
        strokeWidth=0,
        cornerRadius=5,
        cursor="pointer",
    ).encode(
        alt.X(
            'tfp_log_ranks_unique:Q',
            bin=alt.Bin(maxbins=30),
            title='Log ranks',
            axis=alt.Axis(titlePadding=30, **axis_style),
        ),
        # stack(None) overlays the per-level bars instead of stacking them.
        alt.Y(
            'count()',
            title="Num. videos",
            axis=alt.Axis(titlePadding=20, tickCount=5, **axis_style),
            scale=alt.Scale(domain=[0, 80]),
        ).stack(None),
        alt.Color(
            'level:N',
            scale=alt.Scale(range=bar_colors),
            sort=levels,
            legend=alt.Legend(
                title='CIJ Level',
                titleFontSize=18,
                titleFontWeight='bolder',
                labelFontSize=16,
                symbolType='circle',
                symbolSize=200,
                symbolStrokeWidth=0,
                orient='right',
                direction='vertical',
                fillColor='white',
                padding=10,
                cornerRadius=5,
            ),
        ),
        tooltip=[
            # bin=True so the tooltip reports the bin range, not a raw value.
            alt.Tooltip(
                'tfp_log_ranks_unique:Q',
                title='25th percentile word-frequency log rank:',
                bin=True,
            ),
            alt.Tooltip('level:N', title='Level:'),
            alt.Tooltip('count()', title='Video count:'),
        ],
        opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
        strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1)),
    ).properties(
        width='container',
        height=500,
        title=alt.TitleParams(
            text='25th percentile word-frequency log ranks',
            offset=20,
            fontSize=24,
            fontWeight='normal',
            anchor='middle',
            color='black',
            subtitleFontSize=15,
            subtitleColor='gray',
        ),
    ).add_params(
        selection,
        highlight,
    )

    if not show_medians:
        return alt.layer(histogram, background='white')

    # Hard-coded per-level values drawn as the median markers.
    line_data = pd.DataFrame({
        'x': [3.82, 4.30, 4.76, 5.21],
        'level': levels,
    })

    # One dashed vertical rule per level.
    vertical_lines = alt.Chart(line_data).mark_rule(
        color='red',
        strokeWidth=6,
        strokeDash=[10, 2],  # dash length, gap length
    ).encode(
        x='x:Q',
        tooltip=[
            # Quantitative with the same 2-decimal format as the text label.
            alt.Tooltip(
                'x:Q',
                title='Median 25th percentile word-frequency log rank:',
                format='.2f',
            ),
            alt.Tooltip('level:N', title='Level:'),
        ],
        color=alt.Color(
            'level:N',
            scale=alt.Scale(range=median_colors),
            sort=levels,
            legend=None,  # the histogram layer already provides the legend
        ),
        # Fade rules for levels that are not selected in the legend.
        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
        strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1)),
    ).add_params(
        selection,
        highlight,
    )

    # Numeric label for each rule.
    text_labels = alt.Chart(line_data).mark_text(
        align='center',  # centered horizontally on the rule
        dx=0,            # no horizontal offset
        dy=-10,          # nudge 10px upward
        fontSize=16,
        fontWeight='bold',
    ).encode(
        x='x:Q',
        y=alt.value(0),  # fixed pixel position at the top edge of the plot
        text=alt.Text('x:Q', format='.2f'),  # value shown with two decimals
        color=alt.Color(
            'level:N',
            scale=alt.Scale(range=median_colors),
            sort=levels,
            legend=None,
        ),
        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),
    )

    return alt.layer(histogram, vertical_lines, text_labels, background='white')
1388
+
1389
# The checkbox state decides whether the per-level median markers are drawn;
# its boolean value is forwarded directly to the chart builder.
tfplr_hist = get_tfplr_hist(show_medians=st.checkbox('Show medians', key='tfplr'))

st.altair_chart(tfplr_hist, use_container_width=True)
1398
 
1399
  st.markdown("How common a word is, is known as its 'rank'. The most common word \
1400
  in a text would be rank 1 and the fifth most common would be rank 5. \