Voice_Assistant_Demo

Runtime error

App Files Community

wanchichen committed on Jan 20

Commit

1efc72a

1 Parent(s): 2e3ab02

fixes

Browse files

Files changed (1) hide show

app.py +235 -240

app.py CHANGED Viewed

@@ -403,7 +403,6 @@ def transcribe(
         latency_TTS,
     )
     text_str1 = text_str
-    print(text_str1, asr_output_str, flush=True)
     if change:
         print("Output changed")
         if asr_output_str != "":
@@ -446,253 +445,249 @@ def transcribe(
 # ------------------------
 # Executable Script
 # ------------------------
-@spaces.GPU(duration=500)
-def start():
-    api = HfApi()
-    nltk.download("averaged_perceptron_tagger_eng")
-    start_warmup()
-    default_instruct=(
-        "You are a helpful and friendly AI "
-        "assistant. "
-        "You are polite, respectful, and aim to "
-        "provide concise and complete responses of "
-        "less than 15 words."
-    )
-    import pandas as pd
-    examples = pd.DataFrame([
-        ["General Purpose Conversation", default_instruct],
-        ["Translation", "You are a translator. Translate user text into English."],
-        ["General Purpose Conversation with Disfluencies", "Please reply to user with lot of filler words like ummm, so"],
-        ["Summarization", "You are summarizer. Summarize user's utterance."]
-    ], columns=["Task", "LLM Prompt"])
-    with gr.Blocks(
-        title="E2E Spoken Dialog System",
-    ) as demo:
-        with gr.Row():
-            gr.Markdown(
-                """
-                ## ESPnet-SDS
-                Welcome to our unified web interface for various cascaded and
-                E2E spoken dialogue systems built using ESPnet-SDS  toolkit,
-                supporting real-time automated evaluation metrics, and
-                human-in-the-loop feedback collection.
-                For more details on how to use the app, refer to the [README]
-                (https://github.com/siddhu001/espnet/tree/sds_demo_recipe/egs2/TEMPLATE/sds1#how-to-use).
             """
             )
-        with gr.Row():
-            with gr.Column(scale=1):
-                user_audio = gr.Audio(
-                    sources=["microphone"],
-                    streaming=True,
-                    waveform_options=gr.WaveformOptions(sample_rate=16000),
                 )
-                input_text=gr.Textbox(
-                    label="LLM prompt",
                     visible=True,
-                    interactive=True,
-                    value=default_instruct
                 )
-                with gr.Row():
-                    type_radio = gr.Radio(
-                        choices=["Cascaded", "E2E"],
-                        label="Choose type of Spoken Dialog:",
-                        value="Cascaded",
-                    )
-                with gr.Row():
-                    ASR_radio = gr.Radio(
-                        choices=ASR_options,
-                        label="Choose ASR:",
-                        value=ASR_name,
-                    )
-                with gr.Row():
-                    LLM_radio = gr.Radio(
-                        choices=LLM_options,
-                        label="Choose LLM:",
-                        value=LLM_name,
-                    )
-                with gr.Row():
-                    radio = gr.Radio(
-                        choices=TTS_options,
-                        label="Choose TTS:",
-                        value=TTS_name,
-                    )
-                with gr.Row():
-                    E2Eradio = gr.Radio(
-                        choices=["mini-omni"],
-                        label="Choose E2E model:",
-                        value="mini-omni",
-                        visible=False,
-                    )
-                with gr.Row():
-                    feedback_btn = gr.Button(
-                        value=(
-                            "Please provide your feedback "
-                            "after each system response below."
-                        ),
-                        visible=True,
-                        interactive=False,
-                        elem_id="button",
-                    )
-                with gr.Row():
-                    natural_btn1 = gr.Button(
-                        value="Very Natural", visible=False, interactive=False, scale=1
-                    )
-                    natural_btn2 = gr.Button(
-                        value="Somewhat Awkward", visible=False, interactive=False, scale=1
-                    )
-                    natural_btn3 = gr.Button(
-                        value="Very Awkward", visible=False, interactive=False, scale=1
-                    )
-                    natural_btn4 = gr.Button(
-                        value="Unnatural", visible=False, interactive=False, scale=1
-                    )
-                with gr.Row():
-                    relevant_btn1 = gr.Button(
-                        value="Highly Relevant", visible=False, interactive=False, scale=1
-                    )
-                    relevant_btn2 = gr.Button(
-                        value="Partially Relevant",
-                        visible=False,
-                        interactive=False,
-                        scale=1,
-                    )
-                    relevant_btn3 = gr.Button(
-                        value="Slightly Irrelevant",
-                        visible=False,
-                        interactive=False,
-                        scale=1,
-                    )
-                    relevant_btn4 = gr.Button(
-                        value="Completely Irrelevant",
-                        visible=False,
-                        interactive=False,
-                        scale=1,
-                    )
-            with gr.Column(scale=1):
-                output_audio = gr.Audio(label="Output", autoplay=True, visible=True, interactive=False)
-                output_audio1 = gr.Audio(label="Output1", autoplay=False, visible=False, interactive=False)
-                output_asr_text = gr.Textbox(label="ASR output", interactive=False)
-                output_text = gr.Textbox(label="LLM output", interactive=False)
-                eval_radio = gr.Radio(
-                    choices=[
-                        "Latency",
-                        "TTS Intelligibility",
-                        "TTS Speech Quality",
-                        "ASR WER",
-                        "Text Dialog Metrics",
-                    ],
-                    label="Choose Evaluation metrics:",
                 )
-                eval_radio_E2E = gr.Radio(
-                    choices=[
-                        "Latency",
-                        "TTS Intelligibility",
-                        "TTS Speech Quality",
-                        "Text Dialog Metrics",
-                    ],
-                    label="Choose Evaluation metrics:",
                     visible=False,
                 )
-                output_eval_text = gr.Textbox(label="Evaluation Results")
-                state = gr.State(value=None)
-        #gr.Markdown("### Example Prompts & Responses")
-        #gr.DataFrame(value=examples, headers=["Task", "LLM Prompt"], interactive=False)
-        with gr.Row():
-            privacy_text = gr.Textbox(
-                label="Privacy Notice",
-                interactive=False,
-                value=(
-                    "By using this demo, you acknowledge that"
-                    "interactions with this dialog system are collected "
-                    "for research and improvement purposes. The data "
-                    "will only be used to enhance the performance and "
-                    "understanding of the system. If you have any "
-                    "concerns about data collection, please discontinue "
-                    "use."
-                ),
             )
-        btn_list = [
-            natural_btn1,
-            natural_btn2,
-            natural_btn3,
-            natural_btn4,
-            relevant_btn1,
-            relevant_btn2,
-            relevant_btn3,
-            relevant_btn4,
-        ]
-        natural_btn_list = [
-            natural_btn1,
-            natural_btn2,
-            natural_btn3,
-            natural_btn4,
-        ]
-        relevant_btn_list = [
-            relevant_btn1,
-            relevant_btn2,
-            relevant_btn3,
-            relevant_btn4,
-        ]
-        natural_response = gr.Textbox(
-            label="natural_response", visible=False, interactive=False
-        )
-        diversity_response = gr.Textbox(
-            label="diversity_response", visible=False, interactive=False
-        )
-        ip_address = gr.Textbox(label="ip_address", visible=False, interactive=False)
-        user_audio.stream(
-            transcribe,
-            inputs=[state, user_audio, radio, ASR_radio, LLM_radio, type_radio, input_text],
-            outputs=[state, output_asr_text, output_text, output_audio, output_audio1],
-        )
-        radio.change(
-            fn=dialogue_model.handle_TTS_selection,
-            inputs=[radio],
-            outputs=[output_asr_text, output_text, output_audio],
-        )
-        LLM_radio.change(
-            fn=dialogue_model.handle_LLM_selection,
-            inputs=[LLM_radio],
-            outputs=[output_asr_text, output_text, output_audio],
-        )
-        ASR_radio.change(
-            fn=dialogue_model.handle_ASR_selection,
-            inputs=[ASR_radio],
-            outputs=[output_asr_text, output_text, output_audio],
-        )
-        eval_radio.change(
-            fn=handle_eval_selection,
-            inputs=[eval_radio, output_audio, output_text, output_audio1, output_asr_text],
-            outputs=[eval_radio, output_eval_text],
-        )
-        eval_radio_E2E.change(
-            fn=handle_eval_selection_E2E,
-            inputs=[eval_radio_E2E, output_audio, output_text],
-            outputs=[eval_radio_E2E, output_eval_text],
-        )
-        type_radio.change(
-            fn=dialogue_model.handle_type_selection,
-            inputs=[type_radio, radio, ASR_radio, LLM_radio],
-            outputs=[
-                radio,
-                ASR_radio,
-                LLM_radio,
-                E2Eradio,
-                output_asr_text,
-                output_text,
-                output_audio,
-                eval_radio,
-                eval_radio_E2E,
-            ],
-        )
-        output_audio.play(
-            flash_buttons, [], [natural_response, diversity_response] + btn_list
         )
-    demo.queue(max_size=10, default_concurrency_limit=1)
-    demo.launch(debug=True)
-start()

         latency_TTS,
     )
     text_str1 = text_str
     if change:
         print("Output changed")
         if asr_output_str != "":
 # ------------------------
 # Executable Script
 # ------------------------
+api = HfApi()
+nltk.download("averaged_perceptron_tagger_eng")
+start_warmup()
+default_instruct=(
+    "You are a helpful and friendly AI "
+    "assistant. "
+    "You are polite, respectful, and aim to "
+    "provide concise and complete responses of "
+    "less than 15 words."
+)
+import pandas as pd
+examples = pd.DataFrame([
+    ["General Purpose Conversation", default_instruct],
+    ["Translation", "You are a translator. Translate user text into English."],
+    ["General Purpose Conversation with Disfluencies", "Please reply to user with lot of filler words like ummm, so"],
+    ["Summarization", "You are summarizer. Summarize user's utterance."]
+], columns=["Task", "LLM Prompt"])
+with gr.Blocks(
+    title="E2E Spoken Dialog System",
+) as demo:
+    with gr.Row():
+        gr.Markdown(
             """
+            ## ESPnet-SDS
+            Welcome to our unified web interface for various cascaded and
+            E2E spoken dialogue systems built using ESPnet-SDS  toolkit,
+            supporting real-time automated evaluation metrics, and
+            human-in-the-loop feedback collection.
+            For more details on how to use the app, refer to the [README]
+            (https://github.com/siddhu001/espnet/tree/sds_demo_recipe/egs2/TEMPLATE/sds1#how-to-use).
+        """
+        )
+    with gr.Row():
+        with gr.Column(scale=1):
+            user_audio = gr.Audio(
+                sources=["microphone"],
+                streaming=True,
+                waveform_options=gr.WaveformOptions(sample_rate=16000),
+            )
+            input_text=gr.Textbox(
+                label="LLM prompt",
+                visible=True,
+                interactive=True,
+                value=default_instruct
             )
+            with gr.Row():
+                type_radio = gr.Radio(
+                    choices=["Cascaded", "E2E"],
+                    label="Choose type of Spoken Dialog:",
+                    value="Cascaded",
+                )
+            with gr.Row():
+                ASR_radio = gr.Radio(
+                    choices=ASR_options,
+                    label="Choose ASR:",
+                    value=ASR_name,
                 )
+            with gr.Row():
+                LLM_radio = gr.Radio(
+                    choices=LLM_options,
+                    label="Choose LLM:",
+                    value=LLM_name,
+                )
+            with gr.Row():
+                radio = gr.Radio(
+                    choices=TTS_options,
+                    label="Choose TTS:",
+                    value=TTS_name,
+                )
+            with gr.Row():
+                E2Eradio = gr.Radio(
+                    choices=["mini-omni"],
+                    label="Choose E2E model:",
+                    value="mini-omni",
+                    visible=False,
+                )
+            with gr.Row():
+                feedback_btn = gr.Button(
+                    value=(
+                        "Please provide your feedback "
+                        "after each system response below."
+                    ),
                     visible=True,
+                    interactive=False,
+                    elem_id="button",
+                )
+            with gr.Row():
+                natural_btn1 = gr.Button(
+                    value="Very Natural", visible=False, interactive=False, scale=1
                 )
+                natural_btn2 = gr.Button(
+                    value="Somewhat Awkward", visible=False, interactive=False, scale=1
                 )
+                natural_btn3 = gr.Button(
+                    value="Very Awkward", visible=False, interactive=False, scale=1
+                )
+                natural_btn4 = gr.Button(
+                    value="Unnatural", visible=False, interactive=False, scale=1
+                )
+            with gr.Row():
+                relevant_btn1 = gr.Button(
+                    value="Highly Relevant", visible=False, interactive=False, scale=1
+                )
+                relevant_btn2 = gr.Button(
+                    value="Partially Relevant",
+                    visible=False,
+                    interactive=False,
+                    scale=1,
+                )
+                relevant_btn3 = gr.Button(
+                    value="Slightly Irrelevant",
                     visible=False,
+                    interactive=False,
+                    scale=1,
                 )
+                relevant_btn4 = gr.Button(
+                    value="Completely Irrelevant",
+                    visible=False,
+                    interactive=False,
+                    scale=1,
+                )
+        with gr.Column(scale=1):
+            output_audio = gr.Audio(label="Output", autoplay=True, visible=True, interactive=False)
+            output_audio1 = gr.Audio(label="Output1", autoplay=False, visible=False, interactive=False)
+            output_asr_text = gr.Textbox(label="ASR output", interactive=False)
+            output_text = gr.Textbox(label="LLM output", interactive=False)
+            eval_radio = gr.Radio(
+                choices=[
+                    "Latency",
+                    "TTS Intelligibility",
+                    "TTS Speech Quality",
+                    "ASR WER",
+                    "Text Dialog Metrics",
+                ],
+                label="Choose Evaluation metrics:",
             )
+            eval_radio_E2E = gr.Radio(
+                choices=[
+                    "Latency",
+                    "TTS Intelligibility",
+                    "TTS Speech Quality",
+                    "Text Dialog Metrics",
+                ],
+                label="Choose Evaluation metrics:",
+                visible=False,
+            )
+            output_eval_text = gr.Textbox(label="Evaluation Results")
+            state = gr.State(value=None)
+    #gr.Markdown("### Example Prompts & Responses")
+    #gr.DataFrame(value=examples, headers=["Task", "LLM Prompt"], interactive=False)
+    with gr.Row():
+        privacy_text = gr.Textbox(
+            label="Privacy Notice",
+            interactive=False,
+            value=(
+                "By using this demo, you acknowledge that"
+                "interactions with this dialog system are collected "
+                "for research and improvement purposes. The data "
+                "will only be used to enhance the performance and "
+                "understanding of the system. If you have any "
+                "concerns about data collection, please discontinue "
+                "use."
+            ),
         )
+    btn_list = [
+        natural_btn1,
+        natural_btn2,
+        natural_btn3,
+        natural_btn4,
+        relevant_btn1,
+        relevant_btn2,
+        relevant_btn3,
+        relevant_btn4,
+    ]
+    natural_btn_list = [
+        natural_btn1,
+        natural_btn2,
+        natural_btn3,
+        natural_btn4,
+    ]
+    relevant_btn_list = [
+        relevant_btn1,
+        relevant_btn2,
+        relevant_btn3,
+        relevant_btn4,
+    ]
+    natural_response = gr.Textbox(
+        label="natural_response", visible=False, interactive=False
+    )
+    diversity_response = gr.Textbox(
+        label="diversity_response", visible=False, interactive=False
+    )
+    ip_address = gr.Textbox(label="ip_address", visible=False, interactive=False)
+    user_audio.stream(
+        transcribe,
+        inputs=[state, user_audio, radio, ASR_radio, LLM_radio, type_radio, input_text],
+        outputs=[state, output_asr_text, output_text, output_audio, output_audio1],
+    )
+    radio.change(
+        fn=dialogue_model.handle_TTS_selection,
+        inputs=[radio],
+        outputs=[output_asr_text, output_text, output_audio],
+    )
+    LLM_radio.change(
+        fn=dialogue_model.handle_LLM_selection,
+        inputs=[LLM_radio],
+        outputs=[output_asr_text, output_text, output_audio],
+    )
+    ASR_radio.change(
+        fn=dialogue_model.handle_ASR_selection,
+        inputs=[ASR_radio],
+        outputs=[output_asr_text, output_text, output_audio],
+    )
+    eval_radio.change(
+        fn=handle_eval_selection,
+        inputs=[eval_radio, output_audio, output_text, output_audio1, output_asr_text],
+        outputs=[eval_radio, output_eval_text],
+    )
+    eval_radio_E2E.change(
+        fn=handle_eval_selection_E2E,
+        inputs=[eval_radio_E2E, output_audio, output_text],
+        outputs=[eval_radio_E2E, output_eval_text],
+    )
+    type_radio.change(
+        fn=dialogue_model.handle_type_selection,
+        inputs=[type_radio, radio, ASR_radio, LLM_radio],
+        outputs=[
+            radio,
+            ASR_radio,
+            LLM_radio,
+            E2Eradio,
+            output_asr_text,
+            output_text,
+            output_audio,
+            eval_radio,
+            eval_radio_E2E,
+        ],
+    )
+    output_audio.play(
+        flash_buttons, [], [natural_response, diversity_response] + btn_list
+    )
+demo.queue(max_size=10, default_concurrency_limit=1)
+demo.launch(debug=True)