wanchichen committed on
Commit
1efc72a
·
1 Parent(s): 2e3ab02
Files changed (1) hide show
  1. app.py +235 -240
app.py CHANGED
@@ -403,7 +403,6 @@ def transcribe(
403
  latency_TTS,
404
  )
405
  text_str1 = text_str
406
- print(text_str1, asr_output_str, flush=True)
407
  if change:
408
  print("Output changed")
409
  if asr_output_str != "":
@@ -446,253 +445,249 @@ def transcribe(
446
  # ------------------------
447
  # Executable Script
448
  # ------------------------
449
- @spaces.GPU(duration=500)
450
- def start():
451
- api = HfApi()
452
- nltk.download("averaged_perceptron_tagger_eng")
453
- start_warmup()
454
- default_instruct=(
455
- "You are a helpful and friendly AI "
456
- "assistant. "
457
- "You are polite, respectful, and aim to "
458
- "provide concise and complete responses of "
459
- "less than 15 words."
460
- )
461
- import pandas as pd
462
- examples = pd.DataFrame([
463
- ["General Purpose Conversation", default_instruct],
464
- ["Translation", "You are a translator. Translate user text into English."],
465
- ["General Purpose Conversation with Disfluencies", "Please reply to user with lot of filler words like ummm, so"],
466
- ["Summarization", "You are summarizer. Summarize user's utterance."]
467
- ], columns=["Task", "LLM Prompt"])
468
- with gr.Blocks(
469
- title="E2E Spoken Dialog System",
470
- ) as demo:
471
- with gr.Row():
472
- gr.Markdown(
473
- """
474
- ## ESPnet-SDS
475
- Welcome to our unified web interface for various cascaded and
476
- E2E spoken dialogue systems built using ESPnet-SDS toolkit,
477
- supporting real-time automated evaluation metrics, and
478
- human-in-the-loop feedback collection.
479
-
480
- For more details on how to use the app, refer to the [README]
481
- (https://github.com/siddhu001/espnet/tree/sds_demo_recipe/egs2/TEMPLATE/sds1#how-to-use).
482
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
483
  )
484
- with gr.Row():
485
- with gr.Column(scale=1):
486
- user_audio = gr.Audio(
487
- sources=["microphone"],
488
- streaming=True,
489
- waveform_options=gr.WaveformOptions(sample_rate=16000),
 
 
 
 
 
490
  )
491
- input_text=gr.Textbox(
492
- label="LLM prompt",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
493
  visible=True,
494
- interactive=True,
495
- value=default_instruct
 
 
 
 
496
  )
497
- with gr.Row():
498
- type_radio = gr.Radio(
499
- choices=["Cascaded", "E2E"],
500
- label="Choose type of Spoken Dialog:",
501
- value="Cascaded",
502
- )
503
- with gr.Row():
504
- ASR_radio = gr.Radio(
505
- choices=ASR_options,
506
- label="Choose ASR:",
507
- value=ASR_name,
508
- )
509
- with gr.Row():
510
- LLM_radio = gr.Radio(
511
- choices=LLM_options,
512
- label="Choose LLM:",
513
- value=LLM_name,
514
- )
515
- with gr.Row():
516
- radio = gr.Radio(
517
- choices=TTS_options,
518
- label="Choose TTS:",
519
- value=TTS_name,
520
- )
521
- with gr.Row():
522
- E2Eradio = gr.Radio(
523
- choices=["mini-omni"],
524
- label="Choose E2E model:",
525
- value="mini-omni",
526
- visible=False,
527
- )
528
- with gr.Row():
529
- feedback_btn = gr.Button(
530
- value=(
531
- "Please provide your feedback "
532
- "after each system response below."
533
- ),
534
- visible=True,
535
- interactive=False,
536
- elem_id="button",
537
- )
538
- with gr.Row():
539
- natural_btn1 = gr.Button(
540
- value="Very Natural", visible=False, interactive=False, scale=1
541
- )
542
- natural_btn2 = gr.Button(
543
- value="Somewhat Awkward", visible=False, interactive=False, scale=1
544
- )
545
- natural_btn3 = gr.Button(
546
- value="Very Awkward", visible=False, interactive=False, scale=1
547
- )
548
- natural_btn4 = gr.Button(
549
- value="Unnatural", visible=False, interactive=False, scale=1
550
- )
551
- with gr.Row():
552
- relevant_btn1 = gr.Button(
553
- value="Highly Relevant", visible=False, interactive=False, scale=1
554
- )
555
- relevant_btn2 = gr.Button(
556
- value="Partially Relevant",
557
- visible=False,
558
- interactive=False,
559
- scale=1,
560
- )
561
- relevant_btn3 = gr.Button(
562
- value="Slightly Irrelevant",
563
- visible=False,
564
- interactive=False,
565
- scale=1,
566
- )
567
- relevant_btn4 = gr.Button(
568
- value="Completely Irrelevant",
569
- visible=False,
570
- interactive=False,
571
- scale=1,
572
- )
573
- with gr.Column(scale=1):
574
- output_audio = gr.Audio(label="Output", autoplay=True, visible=True, interactive=False)
575
- output_audio1 = gr.Audio(label="Output1", autoplay=False, visible=False, interactive=False)
576
- output_asr_text = gr.Textbox(label="ASR output", interactive=False)
577
- output_text = gr.Textbox(label="LLM output", interactive=False)
578
- eval_radio = gr.Radio(
579
- choices=[
580
- "Latency",
581
- "TTS Intelligibility",
582
- "TTS Speech Quality",
583
- "ASR WER",
584
- "Text Dialog Metrics",
585
- ],
586
- label="Choose Evaluation metrics:",
587
  )
588
- eval_radio_E2E = gr.Radio(
589
- choices=[
590
- "Latency",
591
- "TTS Intelligibility",
592
- "TTS Speech Quality",
593
- "Text Dialog Metrics",
594
- ],
595
- label="Choose Evaluation metrics:",
 
 
 
 
 
 
 
 
 
 
596
  visible=False,
 
 
597
  )
598
- output_eval_text = gr.Textbox(label="Evaluation Results")
599
- state = gr.State(value=None)
600
- #gr.Markdown("### Example Prompts & Responses")
601
- #gr.DataFrame(value=examples, headers=["Task", "LLM Prompt"], interactive=False)
602
- with gr.Row():
603
- privacy_text = gr.Textbox(
604
- label="Privacy Notice",
605
- interactive=False,
606
- value=(
607
- "By using this demo, you acknowledge that"
608
- "interactions with this dialog system are collected "
609
- "for research and improvement purposes. The data "
610
- "will only be used to enhance the performance and "
611
- "understanding of the system. If you have any "
612
- "concerns about data collection, please discontinue "
613
- "use."
614
- ),
 
 
 
615
  )
616
-
617
- btn_list = [
618
- natural_btn1,
619
- natural_btn2,
620
- natural_btn3,
621
- natural_btn4,
622
- relevant_btn1,
623
- relevant_btn2,
624
- relevant_btn3,
625
- relevant_btn4,
626
- ]
627
- natural_btn_list = [
628
- natural_btn1,
629
- natural_btn2,
630
- natural_btn3,
631
- natural_btn4,
632
- ]
633
- relevant_btn_list = [
634
- relevant_btn1,
635
- relevant_btn2,
636
- relevant_btn3,
637
- relevant_btn4,
638
- ]
639
- natural_response = gr.Textbox(
640
- label="natural_response", visible=False, interactive=False
641
- )
642
- diversity_response = gr.Textbox(
643
- label="diversity_response", visible=False, interactive=False
644
- )
645
- ip_address = gr.Textbox(label="ip_address", visible=False, interactive=False)
646
- user_audio.stream(
647
- transcribe,
648
- inputs=[state, user_audio, radio, ASR_radio, LLM_radio, type_radio, input_text],
649
- outputs=[state, output_asr_text, output_text, output_audio, output_audio1],
650
- )
651
- radio.change(
652
- fn=dialogue_model.handle_TTS_selection,
653
- inputs=[radio],
654
- outputs=[output_asr_text, output_text, output_audio],
655
- )
656
- LLM_radio.change(
657
- fn=dialogue_model.handle_LLM_selection,
658
- inputs=[LLM_radio],
659
- outputs=[output_asr_text, output_text, output_audio],
660
- )
661
- ASR_radio.change(
662
- fn=dialogue_model.handle_ASR_selection,
663
- inputs=[ASR_radio],
664
- outputs=[output_asr_text, output_text, output_audio],
665
- )
666
- eval_radio.change(
667
- fn=handle_eval_selection,
668
- inputs=[eval_radio, output_audio, output_text, output_audio1, output_asr_text],
669
- outputs=[eval_radio, output_eval_text],
670
- )
671
- eval_radio_E2E.change(
672
- fn=handle_eval_selection_E2E,
673
- inputs=[eval_radio_E2E, output_audio, output_text],
674
- outputs=[eval_radio_E2E, output_eval_text],
675
- )
676
- type_radio.change(
677
- fn=dialogue_model.handle_type_selection,
678
- inputs=[type_radio, radio, ASR_radio, LLM_radio],
679
- outputs=[
680
- radio,
681
- ASR_radio,
682
- LLM_radio,
683
- E2Eradio,
684
- output_asr_text,
685
- output_text,
686
- output_audio,
687
- eval_radio,
688
- eval_radio_E2E,
689
- ],
690
- )
691
- output_audio.play(
692
- flash_buttons, [], [natural_response, diversity_response] + btn_list
693
  )
694
-
695
- demo.queue(max_size=10, default_concurrency_limit=1)
696
- demo.launch(debug=True)
697
 
698
- start()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
403
  latency_TTS,
404
  )
405
  text_str1 = text_str
 
406
  if change:
407
  print("Output changed")
408
  if asr_output_str != "":
 
445
  # ------------------------
446
  # Executable Script
447
  # ------------------------
448
+ api = HfApi()
449
+ nltk.download("averaged_perceptron_tagger_eng")
450
+ start_warmup()
451
+ default_instruct=(
452
+ "You are a helpful and friendly AI "
453
+ "assistant. "
454
+ "You are polite, respectful, and aim to "
455
+ "provide concise and complete responses of "
456
+ "less than 15 words."
457
+ )
458
+ import pandas as pd
459
+ examples = pd.DataFrame([
460
+ ["General Purpose Conversation", default_instruct],
461
+ ["Translation", "You are a translator. Translate user text into English."],
462
+ ["General Purpose Conversation with Disfluencies", "Please reply to user with lot of filler words like ummm, so"],
463
+ ["Summarization", "You are summarizer. Summarize user's utterance."]
464
+ ], columns=["Task", "LLM Prompt"])
465
+ with gr.Blocks(
466
+ title="E2E Spoken Dialog System",
467
+ ) as demo:
468
+ with gr.Row():
469
+ gr.Markdown(
 
 
 
 
 
 
 
 
 
 
 
470
  """
471
+ ## ESPnet-SDS
472
+ Welcome to our unified web interface for various cascaded and
473
+ E2E spoken dialogue systems built using ESPnet-SDS toolkit,
474
+ supporting real-time automated evaluation metrics, and
475
+ human-in-the-loop feedback collection.
476
+
477
+ For more details on how to use the app, refer to the [README]
478
+ (https://github.com/siddhu001/espnet/tree/sds_demo_recipe/egs2/TEMPLATE/sds1#how-to-use).
479
+ """
480
+ )
481
+ with gr.Row():
482
+ with gr.Column(scale=1):
483
+ user_audio = gr.Audio(
484
+ sources=["microphone"],
485
+ streaming=True,
486
+ waveform_options=gr.WaveformOptions(sample_rate=16000),
487
+ )
488
+ input_text=gr.Textbox(
489
+ label="LLM prompt",
490
+ visible=True,
491
+ interactive=True,
492
+ value=default_instruct
493
  )
494
+ with gr.Row():
495
+ type_radio = gr.Radio(
496
+ choices=["Cascaded", "E2E"],
497
+ label="Choose type of Spoken Dialog:",
498
+ value="Cascaded",
499
+ )
500
+ with gr.Row():
501
+ ASR_radio = gr.Radio(
502
+ choices=ASR_options,
503
+ label="Choose ASR:",
504
+ value=ASR_name,
505
  )
506
+ with gr.Row():
507
+ LLM_radio = gr.Radio(
508
+ choices=LLM_options,
509
+ label="Choose LLM:",
510
+ value=LLM_name,
511
+ )
512
+ with gr.Row():
513
+ radio = gr.Radio(
514
+ choices=TTS_options,
515
+ label="Choose TTS:",
516
+ value=TTS_name,
517
+ )
518
+ with gr.Row():
519
+ E2Eradio = gr.Radio(
520
+ choices=["mini-omni"],
521
+ label="Choose E2E model:",
522
+ value="mini-omni",
523
+ visible=False,
524
+ )
525
+ with gr.Row():
526
+ feedback_btn = gr.Button(
527
+ value=(
528
+ "Please provide your feedback "
529
+ "after each system response below."
530
+ ),
531
  visible=True,
532
+ interactive=False,
533
+ elem_id="button",
534
+ )
535
+ with gr.Row():
536
+ natural_btn1 = gr.Button(
537
+ value="Very Natural", visible=False, interactive=False, scale=1
538
  )
539
+ natural_btn2 = gr.Button(
540
+ value="Somewhat Awkward", visible=False, interactive=False, scale=1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
541
  )
542
+ natural_btn3 = gr.Button(
543
+ value="Very Awkward", visible=False, interactive=False, scale=1
544
+ )
545
+ natural_btn4 = gr.Button(
546
+ value="Unnatural", visible=False, interactive=False, scale=1
547
+ )
548
+ with gr.Row():
549
+ relevant_btn1 = gr.Button(
550
+ value="Highly Relevant", visible=False, interactive=False, scale=1
551
+ )
552
+ relevant_btn2 = gr.Button(
553
+ value="Partially Relevant",
554
+ visible=False,
555
+ interactive=False,
556
+ scale=1,
557
+ )
558
+ relevant_btn3 = gr.Button(
559
+ value="Slightly Irrelevant",
560
  visible=False,
561
+ interactive=False,
562
+ scale=1,
563
  )
564
+ relevant_btn4 = gr.Button(
565
+ value="Completely Irrelevant",
566
+ visible=False,
567
+ interactive=False,
568
+ scale=1,
569
+ )
570
+ with gr.Column(scale=1):
571
+ output_audio = gr.Audio(label="Output", autoplay=True, visible=True, interactive=False)
572
+ output_audio1 = gr.Audio(label="Output1", autoplay=False, visible=False, interactive=False)
573
+ output_asr_text = gr.Textbox(label="ASR output", interactive=False)
574
+ output_text = gr.Textbox(label="LLM output", interactive=False)
575
+ eval_radio = gr.Radio(
576
+ choices=[
577
+ "Latency",
578
+ "TTS Intelligibility",
579
+ "TTS Speech Quality",
580
+ "ASR WER",
581
+ "Text Dialog Metrics",
582
+ ],
583
+ label="Choose Evaluation metrics:",
584
  )
585
+ eval_radio_E2E = gr.Radio(
586
+ choices=[
587
+ "Latency",
588
+ "TTS Intelligibility",
589
+ "TTS Speech Quality",
590
+ "Text Dialog Metrics",
591
+ ],
592
+ label="Choose Evaluation metrics:",
593
+ visible=False,
594
+ )
595
+ output_eval_text = gr.Textbox(label="Evaluation Results")
596
+ state = gr.State(value=None)
597
+ #gr.Markdown("### Example Prompts & Responses")
598
+ #gr.DataFrame(value=examples, headers=["Task", "LLM Prompt"], interactive=False)
599
+ with gr.Row():
600
+ privacy_text = gr.Textbox(
601
+ label="Privacy Notice",
602
+ interactive=False,
603
+ value=(
604
+ "By using this demo, you acknowledge that"
605
+ "interactions with this dialog system are collected "
606
+ "for research and improvement purposes. The data "
607
+ "will only be used to enhance the performance and "
608
+ "understanding of the system. If you have any "
609
+ "concerns about data collection, please discontinue "
610
+ "use."
611
+ ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
612
  )
 
 
 
613
 
614
+ btn_list = [
615
+ natural_btn1,
616
+ natural_btn2,
617
+ natural_btn3,
618
+ natural_btn4,
619
+ relevant_btn1,
620
+ relevant_btn2,
621
+ relevant_btn3,
622
+ relevant_btn4,
623
+ ]
624
+ natural_btn_list = [
625
+ natural_btn1,
626
+ natural_btn2,
627
+ natural_btn3,
628
+ natural_btn4,
629
+ ]
630
+ relevant_btn_list = [
631
+ relevant_btn1,
632
+ relevant_btn2,
633
+ relevant_btn3,
634
+ relevant_btn4,
635
+ ]
636
+ natural_response = gr.Textbox(
637
+ label="natural_response", visible=False, interactive=False
638
+ )
639
+ diversity_response = gr.Textbox(
640
+ label="diversity_response", visible=False, interactive=False
641
+ )
642
+ ip_address = gr.Textbox(label="ip_address", visible=False, interactive=False)
643
+ user_audio.stream(
644
+ transcribe,
645
+ inputs=[state, user_audio, radio, ASR_radio, LLM_radio, type_radio, input_text],
646
+ outputs=[state, output_asr_text, output_text, output_audio, output_audio1],
647
+ )
648
+ radio.change(
649
+ fn=dialogue_model.handle_TTS_selection,
650
+ inputs=[radio],
651
+ outputs=[output_asr_text, output_text, output_audio],
652
+ )
653
+ LLM_radio.change(
654
+ fn=dialogue_model.handle_LLM_selection,
655
+ inputs=[LLM_radio],
656
+ outputs=[output_asr_text, output_text, output_audio],
657
+ )
658
+ ASR_radio.change(
659
+ fn=dialogue_model.handle_ASR_selection,
660
+ inputs=[ASR_radio],
661
+ outputs=[output_asr_text, output_text, output_audio],
662
+ )
663
+ eval_radio.change(
664
+ fn=handle_eval_selection,
665
+ inputs=[eval_radio, output_audio, output_text, output_audio1, output_asr_text],
666
+ outputs=[eval_radio, output_eval_text],
667
+ )
668
+ eval_radio_E2E.change(
669
+ fn=handle_eval_selection_E2E,
670
+ inputs=[eval_radio_E2E, output_audio, output_text],
671
+ outputs=[eval_radio_E2E, output_eval_text],
672
+ )
673
+ type_radio.change(
674
+ fn=dialogue_model.handle_type_selection,
675
+ inputs=[type_radio, radio, ASR_radio, LLM_radio],
676
+ outputs=[
677
+ radio,
678
+ ASR_radio,
679
+ LLM_radio,
680
+ E2Eradio,
681
+ output_asr_text,
682
+ output_text,
683
+ output_audio,
684
+ eval_radio,
685
+ eval_radio_E2E,
686
+ ],
687
+ )
688
+ output_audio.play(
689
+ flash_buttons, [], [natural_response, diversity_response] + btn_list
690
+ )
691
+
692
+ demo.queue(max_size=10, default_concurrency_limit=1)
693
+ demo.launch(debug=True)