wanchichen committed on
Commit
b9c9746
·
verified ·
1 Parent(s): 66ba863

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +275 -273
app.py CHANGED
@@ -1,4 +1,4 @@
1
-
2
  import os
3
  import shutil
4
  import time
@@ -446,285 +446,287 @@ def transcribe(
446
  # ------------------------
447
  # Executable Script
448
  # ------------------------
449
- api = HfApi()
450
- nltk.download("averaged_perceptron_tagger_eng")
451
- start_warmup()
452
- default_instruct=(
453
- "You are a helpful and friendly AI "
454
- "assistant. "
455
- "You are polite, respectful, and aim to "
456
- "provide concise and complete responses of "
457
- "less than 15 words."
458
- )
459
- import pandas as pd
460
- examples = pd.DataFrame([
461
- ["General Purpose Conversation", default_instruct],
462
- ["Translation", "You are a translator. Translate user text into English."],
463
- ["General Purpose Conversation with Disfluencies", "Please reply to user with lot of filler words like ummm, so"],
464
- ["Summarization", "You are summarizer. Summarize user's utterance."]
465
- ], columns=["Task", "LLM Prompt"])
466
- with gr.Blocks(
467
- title="E2E Spoken Dialog System",
468
- ) as demo:
469
- with gr.Row():
470
- gr.Markdown(
 
 
 
 
 
 
 
 
 
 
 
471
  """
472
- ## ESPnet-SDS
473
- Welcome to our unified web interface for various cascaded and
474
- E2E spoken dialogue systems built using ESPnet-SDS toolkit,
475
- supporting real-time automated evaluation metrics, and
476
- human-in-the-loop feedback collection.
477
-
478
- For more details on how to use the app, refer to the [README]
479
- (https://github.com/siddhu001/espnet/tree/sds_demo_recipe/egs2/TEMPLATE/sds1#how-to-use).
480
- """
481
- )
482
- with gr.Row():
483
- with gr.Column(scale=1):
484
- user_audio = gr.Audio(
485
- sources=["microphone"],
486
- streaming=True,
487
- waveform_options=gr.WaveformOptions(sample_rate=16000),
488
- )
489
- input_text=gr.Textbox(
490
- label="LLM prompt",
491
- visible=True,
492
- interactive=True,
493
- value=default_instruct
494
  )
495
- with gr.Row():
496
- type_radio = gr.Radio(
497
- choices=["Cascaded", "E2E"],
498
- label="Choose type of Spoken Dialog:",
499
- value="Cascaded",
 
500
  )
501
- with gr.Row():
502
- ASR_radio = gr.Radio(
503
- choices=ASR_options,
504
- label="Choose ASR:",
505
- value=ASR_name,
506
- )
507
- with gr.Row():
508
- LLM_radio = gr.Radio(
509
- choices=LLM_options,
510
- label="Choose LLM:",
511
- value=LLM_name,
512
- )
513
- with gr.Row():
514
- radio = gr.Radio(
515
- choices=TTS_options,
516
- label="Choose TTS:",
517
- value=TTS_name,
518
- )
519
- with gr.Row():
520
- E2Eradio = gr.Radio(
521
- choices=["mini-omni"],
522
- label="Choose E2E model:",
523
- value="mini-omni",
524
- visible=False,
525
- )
526
- with gr.Row():
527
- feedback_btn = gr.Button(
528
- value=(
529
- "Please provide your feedback "
530
- "after each system response below."
531
- ),
532
  visible=True,
533
- interactive=False,
534
- elem_id="button",
535
- )
536
- with gr.Row():
537
- natural_btn1 = gr.Button(
538
- value="Very Natural", visible=False, interactive=False, scale=1
539
  )
540
- natural_btn2 = gr.Button(
541
- value="Somewhat Awkward", visible=False, interactive=False, scale=1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
542
  )
543
- natural_btn3 = gr.Button(
544
- value="Very Awkward", visible=False, interactive=False, scale=1
545
- )
546
- natural_btn4 = gr.Button(
547
- value="Unnatural", visible=False, interactive=False, scale=1
548
- )
549
- with gr.Row():
550
- relevant_btn1 = gr.Button(
551
- value="Highly Relevant", visible=False, interactive=False, scale=1
552
- )
553
- relevant_btn2 = gr.Button(
554
- value="Partially Relevant",
555
  visible=False,
556
- interactive=False,
557
- scale=1,
558
  )
559
- relevant_btn3 = gr.Button(
560
- value="Slightly Irrelevant",
561
- visible=False,
562
- interactive=False,
563
- scale=1,
564
- )
565
- relevant_btn4 = gr.Button(
566
- value="Completely Irrelevant",
567
- visible=False,
568
- interactive=False,
569
- scale=1,
570
- )
571
- with gr.Column(scale=1):
572
- output_audio = gr.Audio(label="Output", autoplay=True, visible=True, interactive=False)
573
- output_audio1 = gr.Audio(label="Output1", autoplay=False, visible=False, interactive=False)
574
- output_asr_text = gr.Textbox(label="ASR output", interactive=False)
575
- output_text = gr.Textbox(label="LLM output", interactive=False)
576
- eval_radio = gr.Radio(
577
- choices=[
578
- "Latency",
579
- "TTS Intelligibility",
580
- "TTS Speech Quality",
581
- "ASR WER",
582
- "Text Dialog Metrics",
583
- ],
584
- label="Choose Evaluation metrics:",
585
- )
586
- eval_radio_E2E = gr.Radio(
587
- choices=[
588
- "Latency",
589
- "TTS Intelligibility",
590
- "TTS Speech Quality",
591
- "Text Dialog Metrics",
592
- ],
593
- label="Choose Evaluation metrics:",
594
- visible=False,
595
  )
596
- output_eval_text = gr.Textbox(label="Evaluation Results")
597
- state = gr.State()
598
- gr.Markdown("### Example Prompts & Responses")
599
- gr.DataFrame(value=examples, headers=["Task", "LLM Prompt"], interactive=False)
600
- with gr.Row():
601
- privacy_text = gr.Textbox(
602
- label="Privacy Notice",
603
- interactive=False,
604
- value=(
605
- "By using this demo, you acknowledge that"
606
- "interactions with this dialog system are collected "
607
- "for research and improvement purposes. The data "
608
- "will only be used to enhance the performance and "
609
- "understanding of the system. If you have any "
610
- "concerns about data collection, please discontinue "
611
- "use."
612
- ),
613
- )
614
-
615
- btn_list = [
616
- natural_btn1,
617
- natural_btn2,
618
- natural_btn3,
619
- natural_btn4,
620
- relevant_btn1,
621
- relevant_btn2,
622
- relevant_btn3,
623
- relevant_btn4,
624
- ]
625
- natural_btn_list = [
626
- natural_btn1,
627
- natural_btn2,
628
- natural_btn3,
629
- natural_btn4,
630
- ]
631
- relevant_btn_list = [
632
- relevant_btn1,
633
- relevant_btn2,
634
- relevant_btn3,
635
- relevant_btn4,
636
- ]
637
- natural_response = gr.Textbox(
638
- label="natural_response", visible=False, interactive=False
639
- )
640
- diversity_response = gr.Textbox(
641
- label="diversity_response", visible=False, interactive=False
642
- )
643
- ip_address = gr.Textbox(label="ip_address", visible=False, interactive=False)
644
- callback.setup(
645
- [
646
- user_audio,
647
- output_asr_text,
648
- output_text,
649
- output_audio,
650
- output_audio1,
651
- type_radio,
652
- ASR_radio,
653
- LLM_radio,
654
- radio,
655
- E2Eradio,
656
- natural_response,
657
- diversity_response,
658
- ip_address,
659
- ],
660
- "flagged_data_points",
661
- )
662
- user_audio.stream(
663
- transcribe,
664
- inputs=[state, user_audio, radio, ASR_radio, LLM_radio, type_radio, input_text],
665
- outputs=[state, output_asr_text, output_text, output_audio, output_audio1],
666
- ).then(
667
- lambda *args: callback.flag(list(args)), [user_audio], None, preprocess=False
668
- )
669
- radio.change(
670
- fn=dialogue_model.handle_TTS_selection,
671
- inputs=[radio],
672
- outputs=[output_asr_text, output_text, output_audio],
673
- )
674
- LLM_radio.change(
675
- fn=dialogue_model.handle_LLM_selection,
676
- inputs=[LLM_radio],
677
- outputs=[output_asr_text, output_text, output_audio],
678
- )
679
- ASR_radio.change(
680
- fn=dialogue_model.handle_ASR_selection,
681
- inputs=[ASR_radio],
682
- outputs=[output_asr_text, output_text, output_audio],
683
- )
684
- eval_radio.change(
685
- fn=handle_eval_selection,
686
- inputs=[eval_radio, output_audio, output_text, output_audio1, output_asr_text],
687
- outputs=[eval_radio, output_eval_text],
688
- )
689
- eval_radio_E2E.change(
690
- fn=handle_eval_selection_E2E,
691
- inputs=[eval_radio_E2E, output_audio, output_text],
692
- outputs=[eval_radio_E2E, output_eval_text],
693
- )
694
- type_radio.change(
695
- fn=dialogue_model.handle_type_selection,
696
- inputs=[type_radio, radio, ASR_radio, LLM_radio],
697
- outputs=[
698
- radio,
699
- ASR_radio,
700
- LLM_radio,
701
- E2Eradio,
702
- output_asr_text,
703
- output_text,
704
- output_audio,
705
- eval_radio,
706
- eval_radio_E2E,
707
- ],
708
- )
709
- output_audio.play(
710
- flash_buttons, [], [natural_response, diversity_response] + btn_list
711
- ).then(
712
- lambda *args: callback.flag(list(args)),
713
- [
714
- user_audio,
715
- output_asr_text,
716
- output_text,
717
- output_audio,
718
- output_audio1,
719
- type_radio,
720
- ASR_radio,
721
- LLM_radio,
722
- radio,
723
- E2Eradio,
724
- ],
725
- None,
726
- preprocess=False,
727
- )
728
 
729
- demo.queue(max_size=10, default_concurrency_limit=1)
730
- demo.launch(share=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
  import os
3
  import shutil
4
  import time
 
446
  # ------------------------
447
  # Executable Script
448
  # ------------------------
449
+ @spaces.GPU
450
+ def start():
451
+ api = HfApi()
452
+ nltk.download("averaged_perceptron_tagger_eng")
453
+ start_warmup()
454
+ default_instruct=(
455
+ "You are a helpful and friendly AI "
456
+ "assistant. "
457
+ "You are polite, respectful, and aim to "
458
+ "provide concise and complete responses of "
459
+ "less than 15 words."
460
+ )
461
+ import pandas as pd
462
+ examples = pd.DataFrame([
463
+ ["General Purpose Conversation", default_instruct],
464
+ ["Translation", "You are a translator. Translate user text into English."],
465
+ ["General Purpose Conversation with Disfluencies", "Please reply to user with lot of filler words like ummm, so"],
466
+ ["Summarization", "You are summarizer. Summarize user's utterance."]
467
+ ], columns=["Task", "LLM Prompt"])
468
+ with gr.Blocks(
469
+ title="E2E Spoken Dialog System",
470
+ ) as demo:
471
+ with gr.Row():
472
+ gr.Markdown(
473
+ """
474
+ ## ESPnet-SDS
475
+ Welcome to our unified web interface for various cascaded and
476
+ E2E spoken dialogue systems built using ESPnet-SDS toolkit,
477
+ supporting real-time automated evaluation metrics, and
478
+ human-in-the-loop feedback collection.
479
+
480
+ For more details on how to use the app, refer to the [README]
481
+ (https://github.com/siddhu001/espnet/tree/sds_demo_recipe/egs2/TEMPLATE/sds1#how-to-use).
482
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
483
  )
484
+ with gr.Row():
485
+ with gr.Column(scale=1):
486
+ user_audio = gr.Audio(
487
+ sources=["microphone"],
488
+ streaming=True,
489
+ waveform_options=gr.WaveformOptions(sample_rate=16000),
490
  )
491
+ input_text=gr.Textbox(
492
+ label="LLM prompt",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
493
  visible=True,
494
+ interactive=True,
495
+ value=default_instruct
 
 
 
 
496
  )
497
+ with gr.Row():
498
+ type_radio = gr.Radio(
499
+ choices=["Cascaded", "E2E"],
500
+ label="Choose type of Spoken Dialog:",
501
+ value="Cascaded",
502
+ )
503
+ with gr.Row():
504
+ ASR_radio = gr.Radio(
505
+ choices=ASR_options,
506
+ label="Choose ASR:",
507
+ value=ASR_name,
508
+ )
509
+ with gr.Row():
510
+ LLM_radio = gr.Radio(
511
+ choices=LLM_options,
512
+ label="Choose LLM:",
513
+ value=LLM_name,
514
+ )
515
+ with gr.Row():
516
+ radio = gr.Radio(
517
+ choices=TTS_options,
518
+ label="Choose TTS:",
519
+ value=TTS_name,
520
+ )
521
+ with gr.Row():
522
+ E2Eradio = gr.Radio(
523
+ choices=["mini-omni"],
524
+ label="Choose E2E model:",
525
+ value="mini-omni",
526
+ visible=False,
527
+ )
528
+ with gr.Row():
529
+ feedback_btn = gr.Button(
530
+ value=(
531
+ "Please provide your feedback "
532
+ "after each system response below."
533
+ ),
534
+ visible=True,
535
+ interactive=False,
536
+ elem_id="button",
537
+ )
538
+ with gr.Row():
539
+ natural_btn1 = gr.Button(
540
+ value="Very Natural", visible=False, interactive=False, scale=1
541
+ )
542
+ natural_btn2 = gr.Button(
543
+ value="Somewhat Awkward", visible=False, interactive=False, scale=1
544
+ )
545
+ natural_btn3 = gr.Button(
546
+ value="Very Awkward", visible=False, interactive=False, scale=1
547
+ )
548
+ natural_btn4 = gr.Button(
549
+ value="Unnatural", visible=False, interactive=False, scale=1
550
+ )
551
+ with gr.Row():
552
+ relevant_btn1 = gr.Button(
553
+ value="Highly Relevant", visible=False, interactive=False, scale=1
554
+ )
555
+ relevant_btn2 = gr.Button(
556
+ value="Partially Relevant",
557
+ visible=False,
558
+ interactive=False,
559
+ scale=1,
560
+ )
561
+ relevant_btn3 = gr.Button(
562
+ value="Slightly Irrelevant",
563
+ visible=False,
564
+ interactive=False,
565
+ scale=1,
566
+ )
567
+ relevant_btn4 = gr.Button(
568
+ value="Completely Irrelevant",
569
+ visible=False,
570
+ interactive=False,
571
+ scale=1,
572
+ )
573
+ with gr.Column(scale=1):
574
+ output_audio = gr.Audio(label="Output", autoplay=True, visible=True, interactive=False)
575
+ output_audio1 = gr.Audio(label="Output1", autoplay=False, visible=False, interactive=False)
576
+ output_asr_text = gr.Textbox(label="ASR output", interactive=False)
577
+ output_text = gr.Textbox(label="LLM output", interactive=False)
578
+ eval_radio = gr.Radio(
579
+ choices=[
580
+ "Latency",
581
+ "TTS Intelligibility",
582
+ "TTS Speech Quality",
583
+ "ASR WER",
584
+ "Text Dialog Metrics",
585
+ ],
586
+ label="Choose Evaluation metrics:",
587
  )
588
+ eval_radio_E2E = gr.Radio(
589
+ choices=[
590
+ "Latency",
591
+ "TTS Intelligibility",
592
+ "TTS Speech Quality",
593
+ "Text Dialog Metrics",
594
+ ],
595
+ label="Choose Evaluation metrics:",
 
 
 
 
596
  visible=False,
 
 
597
  )
598
+ output_eval_text = gr.Textbox(label="Evaluation Results")
599
+ state = gr.State()
600
+ gr.Markdown("### Example Prompts & Responses")
601
+ gr.DataFrame(value=examples, headers=["Task", "LLM Prompt"], interactive=False)
602
+ with gr.Row():
603
+ privacy_text = gr.Textbox(
604
+ label="Privacy Notice",
605
+ interactive=False,
606
+ value=(
607
+ "By using this demo, you acknowledge that"
608
+ "interactions with this dialog system are collected "
609
+ "for research and improvement purposes. The data "
610
+ "will only be used to enhance the performance and "
611
+ "understanding of the system. If you have any "
612
+ "concerns about data collection, please discontinue "
613
+ "use."
614
+ ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
615
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
616
 
617
+ btn_list = [
618
+ natural_btn1,
619
+ natural_btn2,
620
+ natural_btn3,
621
+ natural_btn4,
622
+ relevant_btn1,
623
+ relevant_btn2,
624
+ relevant_btn3,
625
+ relevant_btn4,
626
+ ]
627
+ natural_btn_list = [
628
+ natural_btn1,
629
+ natural_btn2,
630
+ natural_btn3,
631
+ natural_btn4,
632
+ ]
633
+ relevant_btn_list = [
634
+ relevant_btn1,
635
+ relevant_btn2,
636
+ relevant_btn3,
637
+ relevant_btn4,
638
+ ]
639
+ natural_response = gr.Textbox(
640
+ label="natural_response", visible=False, interactive=False
641
+ )
642
+ diversity_response = gr.Textbox(
643
+ label="diversity_response", visible=False, interactive=False
644
+ )
645
+ ip_address = gr.Textbox(label="ip_address", visible=False, interactive=False)
646
+ callback.setup(
647
+ [
648
+ user_audio,
649
+ output_asr_text,
650
+ output_text,
651
+ output_audio,
652
+ output_audio1,
653
+ type_radio,
654
+ ASR_radio,
655
+ LLM_radio,
656
+ radio,
657
+ E2Eradio,
658
+ natural_response,
659
+ diversity_response,
660
+ ip_address,
661
+ ],
662
+ "flagged_data_points",
663
+ )
664
+ user_audio.stream(
665
+ transcribe,
666
+ inputs=[state, user_audio, radio, ASR_radio, LLM_radio, type_radio, input_text],
667
+ outputs=[state, output_asr_text, output_text, output_audio, output_audio1],
668
+ ).then(
669
+ lambda *args: callback.flag(list(args)), [user_audio], None, preprocess=False
670
+ )
671
+ radio.change(
672
+ fn=dialogue_model.handle_TTS_selection,
673
+ inputs=[radio],
674
+ outputs=[output_asr_text, output_text, output_audio],
675
+ )
676
+ LLM_radio.change(
677
+ fn=dialogue_model.handle_LLM_selection,
678
+ inputs=[LLM_radio],
679
+ outputs=[output_asr_text, output_text, output_audio],
680
+ )
681
+ ASR_radio.change(
682
+ fn=dialogue_model.handle_ASR_selection,
683
+ inputs=[ASR_radio],
684
+ outputs=[output_asr_text, output_text, output_audio],
685
+ )
686
+ eval_radio.change(
687
+ fn=handle_eval_selection,
688
+ inputs=[eval_radio, output_audio, output_text, output_audio1, output_asr_text],
689
+ outputs=[eval_radio, output_eval_text],
690
+ )
691
+ eval_radio_E2E.change(
692
+ fn=handle_eval_selection_E2E,
693
+ inputs=[eval_radio_E2E, output_audio, output_text],
694
+ outputs=[eval_radio_E2E, output_eval_text],
695
+ )
696
+ type_radio.change(
697
+ fn=dialogue_model.handle_type_selection,
698
+ inputs=[type_radio, radio, ASR_radio, LLM_radio],
699
+ outputs=[
700
+ radio,
701
+ ASR_radio,
702
+ LLM_radio,
703
+ E2Eradio,
704
+ output_asr_text,
705
+ output_text,
706
+ output_audio,
707
+ eval_radio,
708
+ eval_radio_E2E,
709
+ ],
710
+ )
711
+ output_audio.play(
712
+ flash_buttons, [], [natural_response, diversity_response] + btn_list
713
+ ).then(
714
+ lambda *args: callback.flag(list(args)),
715
+ [
716
+ user_audio,
717
+ output_asr_text,
718
+ output_text,
719
+ output_audio,
720
+ output_audio1,
721
+ type_radio,
722
+ ASR_radio,
723
+ LLM_radio,
724
+ radio,
725
+ E2Eradio,
726
+ ],
727
+ None,
728
+ preprocess=False,
729
+ )
730
+
731
+ demo.queue(max_size=10, default_concurrency_limit=1)
732
+ demo.launch(share=True)