wanchichen committed on
Commit
b2af2e4
·
1 Parent(s): 1efc72a
Files changed (1) hide show
  1. app.py +10 -101
app.py CHANGED
@@ -20,10 +20,10 @@ from espnet2.sds.espnet_model import ESPnetSDSModelInterface
20
  access_token = os.environ.get("HF_TOKEN")
21
  ASR_name="pyf98/owsm_ctc_v3.1_1B"
22
  LLM_name="meta-llama/Llama-3.2-1B-Instruct"
23
- TTS_name="kan-bayashi/ljspeech_vits"
24
- ASR_options="pyf98/owsm_ctc_v3.1_1B,espnet/owsm_ctc_v3.2_ft_1B,espnet/owsm_v3.1_ebf,librispeech_asr".split(",")
25
- LLM_options="meta-llama/Llama-3.2-1B-Instruct,HuggingFaceTB/SmolLM2-1.7B-Instruct".split(",")
26
- TTS_options="kan-bayashi/ljspeech_vits,kan-bayashi/libritts_xvector_vits,kan-bayashi/vctk_multi_spk_vits,ChatTTS".split(",")
27
  Eval_options="Latency,TTS Intelligibility,TTS Speech Quality,ASR WER,Text Dialog Metrics"
28
  upload_to_hub=None
29
  dialogue_model = ESPnetSDSModelInterface(
@@ -241,10 +241,10 @@ def start_warmup():
241
  if opt == LLM_name:
242
  LLM_name = LLM_options[0]
243
  for opt_count in range(len(TTS_options)):
244
- opt = TTS_options[opt_count]
245
  opt_count-=remove
246
  if opt_count>=len(TTS_options):
247
  break
 
248
  try:
249
  for _ in dialogue_model.handle_TTS_selection(opt):
250
  continue
@@ -493,7 +493,7 @@ with gr.Blocks(
493
  )
494
  with gr.Row():
495
  type_radio = gr.Radio(
496
- choices=["Cascaded", "E2E"],
497
  label="Choose type of Spoken Dialog:",
498
  value="Cascaded",
499
  )
@@ -522,51 +522,6 @@ with gr.Blocks(
522
  value="mini-omni",
523
  visible=False,
524
  )
525
- with gr.Row():
526
- feedback_btn = gr.Button(
527
- value=(
528
- "Please provide your feedback "
529
- "after each system response below."
530
- ),
531
- visible=True,
532
- interactive=False,
533
- elem_id="button",
534
- )
535
- with gr.Row():
536
- natural_btn1 = gr.Button(
537
- value="Very Natural", visible=False, interactive=False, scale=1
538
- )
539
- natural_btn2 = gr.Button(
540
- value="Somewhat Awkward", visible=False, interactive=False, scale=1
541
- )
542
- natural_btn3 = gr.Button(
543
- value="Very Awkward", visible=False, interactive=False, scale=1
544
- )
545
- natural_btn4 = gr.Button(
546
- value="Unnatural", visible=False, interactive=False, scale=1
547
- )
548
- with gr.Row():
549
- relevant_btn1 = gr.Button(
550
- value="Highly Relevant", visible=False, interactive=False, scale=1
551
- )
552
- relevant_btn2 = gr.Button(
553
- value="Partially Relevant",
554
- visible=False,
555
- interactive=False,
556
- scale=1,
557
- )
558
- relevant_btn3 = gr.Button(
559
- value="Slightly Irrelevant",
560
- visible=False,
561
- interactive=False,
562
- scale=1,
563
- )
564
- relevant_btn4 = gr.Button(
565
- value="Completely Irrelevant",
566
- visible=False,
567
- interactive=False,
568
- scale=1,
569
- )
570
  with gr.Column(scale=1):
571
  output_audio = gr.Audio(label="Output", autoplay=True, visible=True, interactive=False)
572
  output_audio1 = gr.Audio(label="Output1", autoplay=False, visible=False, interactive=False)
@@ -581,6 +536,7 @@ with gr.Blocks(
581
  "Text Dialog Metrics",
582
  ],
583
  label="Choose Evaluation metrics:",
 
584
  )
585
  eval_radio_E2E = gr.Radio(
586
  choices=[
@@ -592,47 +548,10 @@ with gr.Blocks(
592
  label="Choose Evaluation metrics:",
593
  visible=False,
594
  )
595
- output_eval_text = gr.Textbox(label="Evaluation Results")
596
  state = gr.State(value=None)
597
- #gr.Markdown("### Example Prompts & Responses")
598
- #gr.DataFrame(value=examples, headers=["Task", "LLM Prompt"], interactive=False)
599
- with gr.Row():
600
- privacy_text = gr.Textbox(
601
- label="Privacy Notice",
602
- interactive=False,
603
- value=(
604
- "By using this demo, you acknowledge that"
605
- "interactions with this dialog system are collected "
606
- "for research and improvement purposes. The data "
607
- "will only be used to enhance the performance and "
608
- "understanding of the system. If you have any "
609
- "concerns about data collection, please discontinue "
610
- "use."
611
- ),
612
- )
613
 
614
- btn_list = [
615
- natural_btn1,
616
- natural_btn2,
617
- natural_btn3,
618
- natural_btn4,
619
- relevant_btn1,
620
- relevant_btn2,
621
- relevant_btn3,
622
- relevant_btn4,
623
- ]
624
- natural_btn_list = [
625
- natural_btn1,
626
- natural_btn2,
627
- natural_btn3,
628
- natural_btn4,
629
- ]
630
- relevant_btn_list = [
631
- relevant_btn1,
632
- relevant_btn2,
633
- relevant_btn3,
634
- relevant_btn4,
635
- ]
636
  natural_response = gr.Textbox(
637
  label="natural_response", visible=False, interactive=False
638
  )
@@ -660,16 +579,6 @@ with gr.Blocks(
660
  inputs=[ASR_radio],
661
  outputs=[output_asr_text, output_text, output_audio],
662
  )
663
- eval_radio.change(
664
- fn=handle_eval_selection,
665
- inputs=[eval_radio, output_audio, output_text, output_audio1, output_asr_text],
666
- outputs=[eval_radio, output_eval_text],
667
- )
668
- eval_radio_E2E.change(
669
- fn=handle_eval_selection_E2E,
670
- inputs=[eval_radio_E2E, output_audio, output_text],
671
- outputs=[eval_radio_E2E, output_eval_text],
672
- )
673
  type_radio.change(
674
  fn=dialogue_model.handle_type_selection,
675
  inputs=[type_radio, radio, ASR_radio, LLM_radio],
@@ -686,7 +595,7 @@ with gr.Blocks(
686
  ],
687
  )
688
  output_audio.play(
689
- flash_buttons, [], [natural_response, diversity_response] + btn_list
690
  )
691
 
692
  demo.queue(max_size=10, default_concurrency_limit=1)
 
20
  access_token = os.environ.get("HF_TOKEN")
21
  ASR_name="pyf98/owsm_ctc_v3.1_1B"
22
  LLM_name="meta-llama/Llama-3.2-1B-Instruct"
23
+ TTS_name="espnet/kan-bayashi_ljspeech_vits"
24
+ ASR_options="pyf98/owsm_ctc_v3.1_1B,espnet/owsm_ctc_v3.2_ft_1B,espnet/owsm_v3.1_ebf".split(",")
25
+ LLM_options="meta-llama/Llama-3.2-1B-Instruct".split(",")
26
+ TTS_options="espnet/kan-bayashi_ljspeech_vits,espnet/kan-bayashi_libritts_xvector_vits,espnet/kan-bayashi_vctk_multi_spk_vits,ChatTTS".split(",")
27
  Eval_options="Latency,TTS Intelligibility,TTS Speech Quality,ASR WER,Text Dialog Metrics"
28
  upload_to_hub=None
29
  dialogue_model = ESPnetSDSModelInterface(
 
241
  if opt == LLM_name:
242
  LLM_name = LLM_options[0]
243
  for opt_count in range(len(TTS_options)):
 
244
  opt_count-=remove
245
  if opt_count>=len(TTS_options):
246
  break
247
+ opt = TTS_options[opt_count]
248
  try:
249
  for _ in dialogue_model.handle_TTS_selection(opt):
250
  continue
 
493
  )
494
  with gr.Row():
495
  type_radio = gr.Radio(
496
+ choices=["Cascaded"],
497
  label="Choose type of Spoken Dialog:",
498
  value="Cascaded",
499
  )
 
522
  value="mini-omni",
523
  visible=False,
524
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
525
  with gr.Column(scale=1):
526
  output_audio = gr.Audio(label="Output", autoplay=True, visible=True, interactive=False)
527
  output_audio1 = gr.Audio(label="Output1", autoplay=False, visible=False, interactive=False)
 
536
  "Text Dialog Metrics",
537
  ],
538
  label="Choose Evaluation metrics:",
539
+ visible=False,
540
  )
541
  eval_radio_E2E = gr.Radio(
542
  choices=[
 
548
  label="Choose Evaluation metrics:",
549
  visible=False,
550
  )
551
+ output_eval_text = gr.Textbox(label="Evaluation Results", visible=False)
552
  state = gr.State(value=None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
553
 
554
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
555
  natural_response = gr.Textbox(
556
  label="natural_response", visible=False, interactive=False
557
  )
 
579
  inputs=[ASR_radio],
580
  outputs=[output_asr_text, output_text, output_audio],
581
  )
 
 
 
 
 
 
 
 
 
 
582
  type_radio.change(
583
  fn=dialogue_model.handle_type_selection,
584
  inputs=[type_radio, radio, ASR_radio, LLM_radio],
 
595
  ],
596
  )
597
  output_audio.play(
598
+ flash_buttons, [], [natural_response, diversity_response]
599
  )
600
 
601
  demo.queue(max_size=10, default_concurrency_limit=1)