joshdavham committed on
Commit
e496ce2
·
1 Parent(s): 1cd252a

add sentence length histogram

Browse files
Files changed (1) hide show
  1. app.py +168 -1
app.py CHANGED
@@ -357,11 +357,178 @@ st.markdown("For example: if a statistic is small for Complete Beginnner videos,
357
 
358
  st.markdown("Okay! Now we can continue.")
359
 
 
 
 
360
  st.markdown("## Sentence length")
361
 
362
  st.markdown("Videos meant for beginners tend to have shorter sentences on average.")
363
 
364
- st.markdown("[TODO]: Add mean sentence length graph")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
365
 
366
  st.markdown("This makes sense because long sentences generally tend to be more complex and packed with information \
367
  whereas short sentences are usually easier to understand.")
 
357
 
358
  st.markdown("Okay! Now we can continue.")
359
 
360
+ ###
361
+ # SENTENCE LENGTH
362
+ ###
363
  st.markdown("## Sentence length")
364
 
365
  st.markdown("Videos meant for beginners tend to have shorter sentences on average.")
366
 
367
def get_sentence_length_hist(show_medians=False):
    """Build the layered Altair histogram of mean sentence length per video.

    The bars bin ``mean_sentence_length`` from the module-level ``video_df``
    and are coloured by ``level``. Clicking a legend entry dims the other
    levels; hovering a mark thickens its stroke.

    Args:
        show_medians: When true, overlay one dashed vertical rule and one
            numeric label per level, positioned at the hard-coded values in
            ``line_data`` below.

    Returns:
        The chart produced by ``alt.layer(...)`` with a white background.
    """
    # Single source of truth for the level order (used for sorting every
    # colour scale and for the per-level reference-line data).
    levels = ['Complete Beginner', 'Beginner', 'Intermediate', 'Advanced']

    # One colour range shared by the rules and their labels so the two always
    # agree for a given level.
    # NOTE(review): layers share a colour scale by default, so the histogram's
    # range may take precedence over this one when rendered - confirm visually.
    median_colors = ['red', 'green', 'blue', 'orange']

    # Data for vertical lines corresponding to each level.
    # NOTE(review): values are hard-coded; the tooltip title below describes
    # them as per-level medians - confirm they still match video_df.
    line_data = pd.DataFrame({
        'x': [7.60, 10.45, 16.17, 19.39],
        'level': levels,
        'text': levels,
    })

    # Legend-bound click selection: drives the opacity of every layer.
    selection = alt.selection_point(fields=['level'], bind='legend', on='click')

    # Hover selection: drives the stroke width; empty=False means nothing is
    # highlighted until the pointer is over a mark.
    highlight = alt.selection_point(
        name="highlight", fields=['level'], on='mouseover', empty=False
    )

    # Styling common to both axes; the y axis only adds a tick count.
    axis_style = dict(
        labelFontSize=14,
        titleFontSize=18,
        titleColor='black',
        titleFontWeight='normal',
        titlePadding=20,
    )

    histogram = alt.Chart(video_df).mark_bar(
        opacity=0.5,
        binSpacing=3,
        stroke='black',
        strokeWidth=0,
        cornerRadius=5,
        cursor="pointer"
    ).encode(
        alt.X(
            'mean_sentence_length:Q',
            bin=alt.Bin(maxbins=30),
            title='Average sentence length',
            axis=alt.Axis(**axis_style)
        ),
        alt.Y(
            'count()',
            title="Num. videos",
            axis=alt.Axis(tickCount=5, **axis_style),
            scale=alt.Scale(domain=[0, 100])
        ).stack(None),  # overlay the levels instead of stacking them
        alt.Color(
            'level:N',
            scale=alt.Scale(range=['#a5bee4', '#9ad6d8', '#c7aecd', '#dd9e9e']),
            sort=levels,
            legend=alt.Legend(
                title='CIJ Level',
                titleFontSize=18,
                titleFontWeight='bolder',
                labelFontSize=16,
                symbolType='circle',
                symbolSize=200,
                symbolStrokeWidth=0,
                orient='right',
                direction='vertical',
                fillColor='white',
                padding=10,
                cornerRadius=5,
            )
        ),
        tooltip=[
            # bin=True so the tooltip reports the bin of mean_sentence_length
            alt.Tooltip('mean_sentence_length:Q', title='Average sentence length:', bin=True),
            alt.Tooltip('level:N', title='Level:'),
            alt.Tooltip('count()', title='Video count:')
        ],
        opacity=alt.condition(selection, alt.value(0.75), alt.value(0.1)),
        strokeWidth=alt.condition(highlight, alt.value(2), alt.value(1))
    ).properties(
        width='container',
        height=500,
        title=alt.TitleParams(
            text='Average number of words per sentence (sentence length)',
            offset=20,
            fontSize=24,
            fontWeight='normal',
            anchor='middle',
            color='black',
            subtitleFontSize=15,
            subtitleColor='gray'
        )
    ).add_params(
        selection,
        highlight
    )

    # Vertical lines corresponding to each level
    vertical_lines = alt.Chart(line_data).mark_rule(
        color='red',
        strokeWidth=6,
        strokeDash=[10, 2],  # first value is dash length, second is gap
    ).encode(
        x='x:Q',
        tooltip=[
            # Quantitative with two decimals, matching the text label format
            alt.Tooltip('x:Q', title='Median average sentence length:', format='.2f'),
            alt.Tooltip('level:N', title='Level:')
        ],
        color=alt.Color(
            'level:N',
            scale=alt.Scale(range=median_colors),
            sort=levels,
            legend=None  # the histogram layer already provides the legend
        ),
        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),  # link opacity with selection
        strokeWidth=alt.condition(highlight, alt.value(20), alt.value(1))
    ).add_params(
        selection,
        highlight
    )

    text_labels = alt.Chart(line_data).mark_text(
        align='center',  # centre the label horizontally on its line
        dx=0,            # no horizontal offset
        dy=-10,          # lift the label 10 px above its anchor
        fontSize=16,
        fontWeight='bold'
    ).encode(
        x='x:Q',
        y=alt.value(0),  # pixel 0, i.e. the top edge of the plot area
        text=alt.Text('x:Q', format='.2f'),  # show the x value with two decimals
        color=alt.Color(
            'level:N',
            scale=alt.Scale(range=median_colors),
            sort=levels,
            legend=None
        ),
        opacity=alt.condition(selection, alt.value(1.0), alt.value(0.1)),  # link opacity with selection
    )

    # The histogram is always drawn; rules and labels are opt-in.
    layers = [histogram]
    if show_medians:
        layers += [vertical_lines, text_labels]

    return alt.layer(*layers, background='white')
522
+
523
# The checkbox already yields a bool, so hand it straight to the chart builder
# and render the result at the full width of its container.
sentence_length_hist = get_sentence_length_hist(
    show_medians=st.checkbox('Show medians', key='sentence_length')
)
st.altair_chart(sentence_length_hist, use_container_width=True)
532
 
533
  st.markdown("This makes sense because long sentences generally tend to be more complex and packed with information \
534
  whereas short sentences are usually easier to understand.")