wanchichen committed on
Commit
b9c9746
·
verified ·
1 Parent(s): 66ba863

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +275 -273
app.py CHANGED
@@ -1,4 +1,4 @@
1
-
2
  import os
3
  import shutil
4
  import time
@@ -446,285 +446,287 @@ def transcribe(
446
  # ------------------------
447
  # Executable Script
448
  # ------------------------
449
- api = HfApi()
450
- nltk.download("averaged_perceptron_tagger_eng")
451
- start_warmup()
452
- default_instruct=(
453
- "You are a helpful and friendly AI "
454
- "assistant. "
455
- "You are polite, respectful, and aim to "
456
- "provide concise and complete responses of "
457
- "less than 15 words."
458
- )
459
- import pandas as pd
460
- examples = pd.DataFrame([
461
- ["General Purpose Conversation", default_instruct],
462
- ["Translation", "You are a translator. Translate user text into English."],
463
- ["General Purpose Conversation with Disfluencies", "Please reply to user with lot of filler words like ummm, so"],
464
- ["Summarization", "You are summarizer. Summarize user's utterance."]
465
- ], columns=["Task", "LLM Prompt"])
466
- with gr.Blocks(
467
- title="E2E Spoken Dialog System",
468
- ) as demo:
469
- with gr.Row():
470
- gr.Markdown(
 
 
 
 
 
 
 
 
 
 
 
471
  """
472
- ## ESPnet-SDS
473
- Welcome to our unified web interface for various cascaded and
474
- E2E spoken dialogue systems built using ESPnet-SDS toolkit,
475
- supporting real-time automated evaluation metrics, and
476
- human-in-the-loop feedback collection.
477
-
478
- For more details on how to use the app, refer to the [README]
479
- (https://github.com/siddhu001/espnet/tree/sds_demo_recipe/egs2/TEMPLATE/sds1#how-to-use).
480
- """
481
- )
482
- with gr.Row():
483
- with gr.Column(scale=1):
484
- user_audio = gr.Audio(
485
- sources=["microphone"],
486
- streaming=True,
487
- waveform_options=gr.WaveformOptions(sample_rate=16000),
488
- )
489
- input_text=gr.Textbox(
490
- label="LLM prompt",
491
- visible=True,
492
- interactive=True,
493
- value=default_instruct
494
  )
495
- with gr.Row():
496
- type_radio = gr.Radio(
497
- choices=["Cascaded", "E2E"],
498
- label="Choose type of Spoken Dialog:",
499
- value="Cascaded",
 
500
  )
501
- with gr.Row():
502
- ASR_radio = gr.Radio(
503
- choices=ASR_options,
504
- label="Choose ASR:",
505
- value=ASR_name,
506
- )
507
- with gr.Row():
508
- LLM_radio = gr.Radio(
509
- choices=LLM_options,
510
- label="Choose LLM:",
511
- value=LLM_name,
512
- )
513
- with gr.Row():
514
- radio = gr.Radio(
515
- choices=TTS_options,
516
- label="Choose TTS:",
517
- value=TTS_name,
518
- )
519
- with gr.Row():
520
- E2Eradio = gr.Radio(
521
- choices=["mini-omni"],
522
- label="Choose E2E model:",
523
- value="mini-omni",
524
- visible=False,
525
- )
526
- with gr.Row():
527
- feedback_btn = gr.Button(
528
- value=(
529
- "Please provide your feedback "
530
- "after each system response below."
531
- ),
532
  visible=True,
533
- interactive=False,
534
- elem_id="button",
535
- )
536
- with gr.Row():
537
- natural_btn1 = gr.Button(
538
- value="Very Natural", visible=False, interactive=False, scale=1
539
  )
540
- natural_btn2 = gr.Button(
541
- value="Somewhat Awkward", visible=False, interactive=False, scale=1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
542
  )
543
- natural_btn3 = gr.Button(
544
- value="Very Awkward", visible=False, interactive=False, scale=1
545
- )
546
- natural_btn4 = gr.Button(
547
- value="Unnatural", visible=False, interactive=False, scale=1
548
- )
549
- with gr.Row():
550
- relevant_btn1 = gr.Button(
551
- value="Highly Relevant", visible=False, interactive=False, scale=1
552
- )
553
- relevant_btn2 = gr.Button(
554
- value="Partially Relevant",
555
  visible=False,
556
- interactive=False,
557
- scale=1,
558
  )
559
- relevant_btn3 = gr.Button(
560
- value="Slightly Irrelevant",
561
- visible=False,
562
- interactive=False,
563
- scale=1,
564
- )
565
- relevant_btn4 = gr.Button(
566
- value="Completely Irrelevant",
567
- visible=False,
568
- interactive=False,
569
- scale=1,
570
- )
571
- with gr.Column(scale=1):
572
- output_audio = gr.Audio(label="Output", autoplay=True, visible=True, interactive=False)
573
- output_audio1 = gr.Audio(label="Output1", autoplay=False, visible=False, interactive=False)
574
- output_asr_text = gr.Textbox(label="ASR output", interactive=False)
575
- output_text = gr.Textbox(label="LLM output", interactive=False)
576
- eval_radio = gr.Radio(
577
- choices=[
578
- "Latency",
579
- "TTS Intelligibility",
580
- "TTS Speech Quality",
581
- "ASR WER",
582
- "Text Dialog Metrics",
583
- ],
584
- label="Choose Evaluation metrics:",
585
- )
586
- eval_radio_E2E = gr.Radio(
587
- choices=[
588
- "Latency",
589
- "TTS Intelligibility",
590
- "TTS Speech Quality",
591
- "Text Dialog Metrics",
592
- ],
593
- label="Choose Evaluation metrics:",
594
- visible=False,
595
  )
596
- output_eval_text = gr.Textbox(label="Evaluation Results")
597
- state = gr.State()
598
- gr.Markdown("### Example Prompts & Responses")
599
- gr.DataFrame(value=examples, headers=["Task", "LLM Prompt"], interactive=False)
600
- with gr.Row():
601
- privacy_text = gr.Textbox(
602
- label="Privacy Notice",
603
- interactive=False,
604
- value=(
605
- "By using this demo, you acknowledge that"
606
- "interactions with this dialog system are collected "
607
- "for research and improvement purposes. The data "
608
- "will only be used to enhance the performance and "
609
- "understanding of the system. If you have any "
610
- "concerns about data collection, please discontinue "
611
- "use."
612
- ),
613
- )
614
-
615
- btn_list = [
616
- natural_btn1,
617
- natural_btn2,
618
- natural_btn3,
619
- natural_btn4,
620
- relevant_btn1,
621
- relevant_btn2,
622
- relevant_btn3,
623
- relevant_btn4,
624
- ]
625
- natural_btn_list = [
626
- natural_btn1,
627
- natural_btn2,
628
- natural_btn3,
629
- natural_btn4,
630
- ]
631
- relevant_btn_list = [
632
- relevant_btn1,
633
- relevant_btn2,
634
- relevant_btn3,
635
- relevant_btn4,
636
- ]
637
- natural_response = gr.Textbox(
638
- label="natural_response", visible=False, interactive=False
639
- )
640
- diversity_response = gr.Textbox(
641
- label="diversity_response", visible=False, interactive=False
642
- )
643
- ip_address = gr.Textbox(label="ip_address", visible=False, interactive=False)
644
- callback.setup(
645
- [
646
- user_audio,
647
- output_asr_text,
648
- output_text,
649
- output_audio,
650
- output_audio1,
651
- type_radio,
652
- ASR_radio,
653
- LLM_radio,
654
- radio,
655
- E2Eradio,
656
- natural_response,
657
- diversity_response,
658
- ip_address,
659
- ],
660
- "flagged_data_points",
661
- )
662
- user_audio.stream(
663
- transcribe,
664
- inputs=[state, user_audio, radio, ASR_radio, LLM_radio, type_radio, input_text],
665
- outputs=[state, output_asr_text, output_text, output_audio, output_audio1],
666
- ).then(
667
- lambda *args: callback.flag(list(args)), [user_audio], None, preprocess=False
668
- )
669
- radio.change(
670
- fn=dialogue_model.handle_TTS_selection,
671
- inputs=[radio],
672
- outputs=[output_asr_text, output_text, output_audio],
673
- )
674
- LLM_radio.change(
675
- fn=dialogue_model.handle_LLM_selection,
676
- inputs=[LLM_radio],
677
- outputs=[output_asr_text, output_text, output_audio],
678
- )
679
- ASR_radio.change(
680
- fn=dialogue_model.handle_ASR_selection,
681
- inputs=[ASR_radio],
682
- outputs=[output_asr_text, output_text, output_audio],
683
- )
684
- eval_radio.change(
685
- fn=handle_eval_selection,
686
- inputs=[eval_radio, output_audio, output_text, output_audio1, output_asr_text],
687
- outputs=[eval_radio, output_eval_text],
688
- )
689
- eval_radio_E2E.change(
690
- fn=handle_eval_selection_E2E,
691
- inputs=[eval_radio_E2E, output_audio, output_text],
692
- outputs=[eval_radio_E2E, output_eval_text],
693
- )
694
- type_radio.change(
695
- fn=dialogue_model.handle_type_selection,
696
- inputs=[type_radio, radio, ASR_radio, LLM_radio],
697
- outputs=[
698
- radio,
699
- ASR_radio,
700
- LLM_radio,
701
- E2Eradio,
702
- output_asr_text,
703
- output_text,
704
- output_audio,
705
- eval_radio,
706
- eval_radio_E2E,
707
- ],
708
- )
709
- output_audio.play(
710
- flash_buttons, [], [natural_response, diversity_response] + btn_list
711
- ).then(
712
- lambda *args: callback.flag(list(args)),
713
- [
714
- user_audio,
715
- output_asr_text,
716
- output_text,
717
- output_audio,
718
- output_audio1,
719
- type_radio,
720
- ASR_radio,
721
- LLM_radio,
722
- radio,
723
- E2Eradio,
724
- ],
725
- None,
726
- preprocess=False,
727
- )
728
 
729
- demo.queue(max_size=10, default_concurrency_limit=1)
730
- demo.launch(share=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
  import os
3
  import shutil
4
  import time
 
446
  # ------------------------
447
  # Executable Script
448
  # ------------------------
449
+ @spaces.GPU
450
+ def start():
451
+ api = HfApi()
452
+ nltk.download("averaged_perceptron_tagger_eng")
453
+ start_warmup()
454
+ default_instruct=(
455
+ "You are a helpful and friendly AI "
456
+ "assistant. "
457
+ "You are polite, respectful, and aim to "
458
+ "provide concise and complete responses of "
459
+ "less than 15 words."
460
+ )
461
+ import pandas as pd
462
+ examples = pd.DataFrame([
463
+ ["General Purpose Conversation", default_instruct],
464
+ ["Translation", "You are a translator. Translate user text into English."],
465
+ ["General Purpose Conversation with Disfluencies", "Please reply to user with lot of filler words like ummm, so"],
466
+ ["Summarization", "You are summarizer. Summarize user's utterance."]
467
+ ], columns=["Task", "LLM Prompt"])
468
+ with gr.Blocks(
469
+ title="E2E Spoken Dialog System",
470
+ ) as demo:
471
+ with gr.Row():
472
+ gr.Markdown(
473
+ """
474
+ ## ESPnet-SDS
475
+ Welcome to our unified web interface for various cascaded and
476
+ E2E spoken dialogue systems built using ESPnet-SDS toolkit,
477
+ supporting real-time automated evaluation metrics, and
478
+ human-in-the-loop feedback collection.
479
+
480
+ For more details on how to use the app, refer to the [README]
481
+ (https://github.com/siddhu001/espnet/tree/sds_demo_recipe/egs2/TEMPLATE/sds1#how-to-use).
482
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
483
  )
484
+ with gr.Row():
485
+ with gr.Column(scale=1):
486
+ user_audio = gr.Audio(
487
+ sources=["microphone"],
488
+ streaming=True,
489
+ waveform_options=gr.WaveformOptions(sample_rate=16000),
490
  )
491
+ input_text=gr.Textbox(
492
+ label="LLM prompt",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
493
  visible=True,
494
+ interactive=True,
495
+ value=default_instruct
 
 
 
 
496
  )
497
+ with gr.Row():
498
+ type_radio = gr.Radio(
499
+ choices=["Cascaded", "E2E"],
500
+ label="Choose type of Spoken Dialog:",
501
+ value="Cascaded",
502
+ )
503
+ with gr.Row():
504
+ ASR_radio = gr.Radio(
505
+ choices=ASR_options,
506
+ label="Choose ASR:",
507
+ value=ASR_name,
508
+ )
509
+ with gr.Row():
510
+ LLM_radio = gr.Radio(
511
+ choices=LLM_options,
512
+ label="Choose LLM:",
513
+ value=LLM_name,
514
+ )
515
+ with gr.Row():
516
+ radio = gr.Radio(
517
+ choices=TTS_options,
518
+ label="Choose TTS:",
519
+ value=TTS_name,
520
+ )
521
+ with gr.Row():
522
+ E2Eradio = gr.Radio(
523
+ choices=["mini-omni"],
524
+ label="Choose E2E model:",
525
+ value="mini-omni",
526
+ visible=False,
527
+ )
528
+ with gr.Row():
529
+ feedback_btn = gr.Button(
530
+ value=(
531
+ "Please provide your feedback "
532
+ "after each system response below."
533
+ ),
534
+ visible=True,
535
+ interactive=False,
536
+ elem_id="button",
537
+ )
538
+ with gr.Row():
539
+ natural_btn1 = gr.Button(
540
+ value="Very Natural", visible=False, interactive=False, scale=1
541
+ )
542
+ natural_btn2 = gr.Button(
543
+ value="Somewhat Awkward", visible=False, interactive=False, scale=1
544
+ )
545
+ natural_btn3 = gr.Button(
546
+ value="Very Awkward", visible=False, interactive=False, scale=1
547
+ )
548
+ natural_btn4 = gr.Button(
549
+ value="Unnatural", visible=False, interactive=False, scale=1
550
+ )
551
+ with gr.Row():
552
+ relevant_btn1 = gr.Button(
553
+ value="Highly Relevant", visible=False, interactive=False, scale=1
554
+ )
555
+ relevant_btn2 = gr.Button(
556
+ value="Partially Relevant",
557
+ visible=False,
558
+ interactive=False,
559
+ scale=1,
560
+ )
561
+ relevant_btn3 = gr.Button(
562
+ value="Slightly Irrelevant",
563
+ visible=False,
564
+ interactive=False,
565
+ scale=1,
566
+ )
567
+ relevant_btn4 = gr.Button(
568
+ value="Completely Irrelevant",
569
+ visible=False,
570
+ interactive=False,
571
+ scale=1,
572
+ )
573
+ with gr.Column(scale=1):
574
+ output_audio = gr.Audio(label="Output", autoplay=True, visible=True, interactive=False)
575
+ output_audio1 = gr.Audio(label="Output1", autoplay=False, visible=False, interactive=False)
576
+ output_asr_text = gr.Textbox(label="ASR output", interactive=False)
577
+ output_text = gr.Textbox(label="LLM output", interactive=False)
578
+ eval_radio = gr.Radio(
579
+ choices=[
580
+ "Latency",
581
+ "TTS Intelligibility",
582
+ "TTS Speech Quality",
583
+ "ASR WER",
584
+ "Text Dialog Metrics",
585
+ ],
586
+ label="Choose Evaluation metrics:",
587
  )
588
+ eval_radio_E2E = gr.Radio(
589
+ choices=[
590
+ "Latency",
591
+ "TTS Intelligibility",
592
+ "TTS Speech Quality",
593
+ "Text Dialog Metrics",
594
+ ],
595
+ label="Choose Evaluation metrics:",
 
 
 
 
596
  visible=False,
 
 
597
  )
598
+ output_eval_text = gr.Textbox(label="Evaluation Results")
599
+ state = gr.State()
600
+ gr.Markdown("### Example Prompts & Responses")
601
+ gr.DataFrame(value=examples, headers=["Task", "LLM Prompt"], interactive=False)
602
+ with gr.Row():
603
+ privacy_text = gr.Textbox(
604
+ label="Privacy Notice",
605
+ interactive=False,
606
+ value=(
607
+ "By using this demo, you acknowledge that"
608
+ "interactions with this dialog system are collected "
609
+ "for research and improvement purposes. The data "
610
+ "will only be used to enhance the performance and "
611
+ "understanding of the system. If you have any "
612
+ "concerns about data collection, please discontinue "
613
+ "use."
614
+ ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
615
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
616
 
617
+ btn_list = [
618
+ natural_btn1,
619
+ natural_btn2,
620
+ natural_btn3,
621
+ natural_btn4,
622
+ relevant_btn1,
623
+ relevant_btn2,
624
+ relevant_btn3,
625
+ relevant_btn4,
626
+ ]
627
+ natural_btn_list = [
628
+ natural_btn1,
629
+ natural_btn2,
630
+ natural_btn3,
631
+ natural_btn4,
632
+ ]
633
+ relevant_btn_list = [
634
+ relevant_btn1,
635
+ relevant_btn2,
636
+ relevant_btn3,
637
+ relevant_btn4,
638
+ ]
639
+ natural_response = gr.Textbox(
640
+ label="natural_response", visible=False, interactive=False
641
+ )
642
+ diversity_response = gr.Textbox(
643
+ label="diversity_response", visible=False, interactive=False
644
+ )
645
+ ip_address = gr.Textbox(label="ip_address", visible=False, interactive=False)
646
+ callback.setup(
647
+ [
648
+ user_audio,
649
+ output_asr_text,
650
+ output_text,
651
+ output_audio,
652
+ output_audio1,
653
+ type_radio,
654
+ ASR_radio,
655
+ LLM_radio,
656
+ radio,
657
+ E2Eradio,
658
+ natural_response,
659
+ diversity_response,
660
+ ip_address,
661
+ ],
662
+ "flagged_data_points",
663
+ )
664
+ user_audio.stream(
665
+ transcribe,
666
+ inputs=[state, user_audio, radio, ASR_radio, LLM_radio, type_radio, input_text],
667
+ outputs=[state, output_asr_text, output_text, output_audio, output_audio1],
668
+ ).then(
669
+ lambda *args: callback.flag(list(args)), [user_audio], None, preprocess=False
670
+ )
671
+ radio.change(
672
+ fn=dialogue_model.handle_TTS_selection,
673
+ inputs=[radio],
674
+ outputs=[output_asr_text, output_text, output_audio],
675
+ )
676
+ LLM_radio.change(
677
+ fn=dialogue_model.handle_LLM_selection,
678
+ inputs=[LLM_radio],
679
+ outputs=[output_asr_text, output_text, output_audio],
680
+ )
681
+ ASR_radio.change(
682
+ fn=dialogue_model.handle_ASR_selection,
683
+ inputs=[ASR_radio],
684
+ outputs=[output_asr_text, output_text, output_audio],
685
+ )
686
+ eval_radio.change(
687
+ fn=handle_eval_selection,
688
+ inputs=[eval_radio, output_audio, output_text, output_audio1, output_asr_text],
689
+ outputs=[eval_radio, output_eval_text],
690
+ )
691
+ eval_radio_E2E.change(
692
+ fn=handle_eval_selection_E2E,
693
+ inputs=[eval_radio_E2E, output_audio, output_text],
694
+ outputs=[eval_radio_E2E, output_eval_text],
695
+ )
696
+ type_radio.change(
697
+ fn=dialogue_model.handle_type_selection,
698
+ inputs=[type_radio, radio, ASR_radio, LLM_radio],
699
+ outputs=[
700
+ radio,
701
+ ASR_radio,
702
+ LLM_radio,
703
+ E2Eradio,
704
+ output_asr_text,
705
+ output_text,
706
+ output_audio,
707
+ eval_radio,
708
+ eval_radio_E2E,
709
+ ],
710
+ )
711
+ output_audio.play(
712
+ flash_buttons, [], [natural_response, diversity_response] + btn_list
713
+ ).then(
714
+ lambda *args: callback.flag(list(args)),
715
+ [
716
+ user_audio,
717
+ output_asr_text,
718
+ output_text,
719
+ output_audio,
720
+ output_audio1,
721
+ type_radio,
722
+ ASR_radio,
723
+ LLM_radio,
724
+ radio,
725
+ E2Eradio,
726
+ ],
727
+ None,
728
+ preprocess=False,
729
+ )
730
+
731
+ demo.queue(max_size=10, default_concurrency_limit=1)
732
+ demo.launch(share=True)