Prompt_Edit_Demo

Sleeping

App Files Files Community

owaski committed on Feb 9

Commit

1acf95b

1 Parent(s): 8818ae1

remove streaming

Browse files

Files changed (1) hide show

app.py +70 -122

app.py CHANGED Viewed

@@ -37,14 +37,8 @@ latency_ASR = 0.0
 latency_LM = 0.0
 latency_TTS = 0.0
-text_str = ""
-asr_output_str = ""
-vad_output = None
-audio_output = None
-audio_output1 = None
 LLM_response_arr = []
 total_response_arr = []
-start_record_time = None
 enable_btn = gr.Button(interactive=True, visible=True)
 # ------------------------
@@ -289,9 +283,8 @@ def flash_buttons():
 @spaces.GPU
-def transcribe(
-    stream: np.ndarray,
-    new_chunk: Tuple[int, np.ndarray],
     TTS_option: str,
     ASR_option: str,
     LLM_option: str,
@@ -299,88 +292,62 @@ def transcribe(
     input_text: str,
 ):
     """
-    Processes and transcribes an audio stream in real-time.
-    This function handles the transcription of audio input
-    and its transformation through a cascaded
-    or E2E conversational AI system.
-    It dynamically updates the transcription, text generation,
-    and synthesized speech output, while managing global states and latencies.
     Args:
-        stream: The current audio stream buffer.
-            `None` if the stream is being reset (e.g., after user refresh).
-        new_chunk: A tuple containing:
-            - `sr`: Sample rate of the new audio chunk.
-            - `y`: New audio data chunk.
         TTS_option: Selected TTS model option.
         ASR_option: Selected ASR model option.
         LLM_option: Selected LLM model option.
         type_option: Type of system ("Cascaded" or "E2E").
-    Yields:
-        Tuple[Optional[np.ndarray], Optional[str], Optional[str],
-        Optional[Tuple[int, np.ndarray]], Optional[Tuple[int, np.ndarray]]]:
             A tuple containing:
-            - Updated stream buffer.
-            - ASR output text.
-            - Generated LLM output text.
-            - Audio output as a tuple of sample rate and audio waveform.
-            - User input audio as a tuple of sample rate and audio waveform.
     Notes:
-        - Resets the session if the transcription exceeds 5 minutes.
-        - Updates the Gradio interface elements dynamically.
-        - Manages latencies.
     """
-    sr, y = new_chunk
-    global text_str
-    global chat
-    global user_role
-    global audio_output
-    global audio_output1
-    global vad_output
-    global asr_output_str
-    global start_record_time
-    global sids
-    global spembs
     global latency_ASR
     global latency_LM
     global latency_TTS
     global LLM_response_arr
     global total_response_arr
-    if stream is None:
-        # Handle user refresh
-        for (
-            _,
-            _,
-            _,
-            _,
-            asr_output_box,
-            text_box,
-            audio_box,
-            _,
-            _,
-        ) in dialogue_model.handle_type_selection(
-            type_option, TTS_option, ASR_option, LLM_option
-        ):
-            gr.Info("The models are being reloaded due to a browser refresh.")
-            yield (stream, asr_output_box, text_box, audio_box, gr.Audio(visible=False))
-        stream = y
-        text_str = ""
-        audio_output = None
-        audio_output1 = None
-    else:
-        stream = np.concatenate((stream, y))
-    # import pdb;pdb.set_trace()
     dialogue_model.chat.init_chat(
         {
             "role": "system",
-            "content": (
-                input_text
-            ),
         }
     )
     (
         asr_output_str,
         text_str,
@@ -403,44 +370,16 @@ def transcribe(
         latency_LM,
         latency_TTS,
     )
-    text_str1 = text_str
     if change:
-        print("Output changed")
         if asr_output_str != "":
             total_response_arr.append(asr_output_str.replace("\n", " "))
         LLM_response_arr.append(text_str.replace("\n", " "))
         total_response_arr.append(text_str.replace("\n", " "))
-    if (text_str != "") and (start_record_time is None):
-        start_record_time = time.time()
-    elif start_record_time is not None:
-        current_record_time = time.time()
-        if current_record_time - start_record_time > 300:
-            gr.Info(
-                "Conversations are limited to 5 minutes. "
-                "The session will restart in approximately 60 seconds. "
-                "Please wait for the demo to reset. "
-                "Close this message once you have read it.",
-                duration=None,
-            )
-            yield stream, gr.Textbox(visible=False), gr.Textbox(
-                visible=False
-            ), gr.Audio(visible=False), gr.Audio(visible=False)
-            dialogue_model.chat.buffer = []
-            text_str = ""
-            audio_output = None
-            audio_output1 = None
-            asr_output_str = ""
-            start_record_time = None
-            LLM_response_arr = []
-            total_response_arr = []
-            shutil.rmtree("flagged_data_points")
-            os.mkdir("flagged_data_points")
-            yield (stream, asr_output_str, text_str1, audio_output, audio_output1)
-            yield stream, gr.Textbox(visible=True), gr.Textbox(visible=True), gr.Audio(
-                visible=True
-            ), gr.Audio(visible=False)
-    yield (stream, asr_output_str, text_str1, audio_output, audio_output1)
 # ------------------------
@@ -464,28 +403,37 @@ examples = pd.DataFrame([
     ["Summarization", "You are summarizer. Summarize user's utterance."]
 ], columns=["Task", "LLM Prompt"])
 with gr.Blocks(
-    title="E2E Spoken Dialog System",
 ) as demo:
     with gr.Row():
         gr.Markdown(
             """
-            ## ESPnet-SDS
-            Welcome to our unified web interface for various cascaded and
-            E2E spoken dialogue systems built using ESPnet-SDS  toolkit,
-            supporting real-time automated evaluation metrics, and
-            human-in-the-loop feedback collection.
-            For more details on how to use the app, refer to the [README]
             (https://github.com/siddhu001/espnet/tree/sds_demo_recipe/egs2/TEMPLATE/sds1#how-to-use).
         """
         )
     with gr.Row():
         with gr.Column(scale=1):
             user_audio = gr.Audio(
-                sources=["microphone"],
-                streaming=True,
-                waveform_options=gr.WaveformOptions(sample_rate=16000),
             )
             input_text=gr.Textbox(
                 label="LLM prompt",
                 visible=True,
@@ -524,10 +472,9 @@ with gr.Blocks(
                     visible=False,
                 )
         with gr.Column(scale=1):
-            output_audio = gr.Audio(label="Output", autoplay=True, visible=True, interactive=False)
-            output_audio1 = gr.Audio(label="Output1", autoplay=False, visible=False, interactive=False)
-            output_asr_text = gr.Textbox(label="ASR output", interactive=False)
-            output_text = gr.Textbox(label="LLM output", interactive=False)
             eval_radio = gr.Radio(
                 choices=[
                     "Latency",
@@ -550,7 +497,6 @@ with gr.Blocks(
                 visible=False,
             )
             output_eval_text = gr.Textbox(label="Evaluation Results", visible=False)
-            state = gr.State(value=None)
     natural_response = gr.Textbox(
@@ -560,10 +506,12 @@ with gr.Blocks(
         label="diversity_response", visible=False, interactive=False
     )
     ip_address = gr.Textbox(label="ip_address", visible=False, interactive=False)
-    user_audio.stream(
-        transcribe,
-        inputs=[state, user_audio, radio, ASR_radio, LLM_radio, type_radio, input_text],
-        outputs=[state, output_asr_text, output_text, output_audio, output_audio1],
     )
     radio.change(
         fn=dialogue_model.handle_TTS_selection,

 latency_LM = 0.0
 latency_TTS = 0.0
 LLM_response_arr = []
 total_response_arr = []
 enable_btn = gr.Button(interactive=True, visible=True)
 # ------------------------
 @spaces.GPU
+def process_audio_file(
+    audio_file: Optional[Tuple[int, np.ndarray]],
     TTS_option: str,
     ASR_option: str,
     LLM_option: str,
     input_text: str,
 ):
     """
+    Processes a recorded audio file through the dialogue system.
+    This function handles the transcription of an uploaded audio file
+    and its transformation through a cascaded conversational AI system.
+    It processes the entire audio file at once (offline mode).
     Args:
+        audio_file: A tuple containing:
+            - `sr`: Sample rate of the audio file.
+            - `y`: Audio data array.
         TTS_option: Selected TTS model option.
         ASR_option: Selected ASR model option.
         LLM_option: Selected LLM model option.
         type_option: Type of system ("Cascaded" or "E2E").
+        input_text: System prompt for the LLM.
+    Returns:
+        Tuple[str, str, Optional[Tuple[int, np.ndarray]]]:
             A tuple containing:
+            - ASR output text (transcription).
+            - Generated LLM output text (response).
+            - Audio output as a tuple of sample rate and audio waveform (TTS).
     Notes:
+        - Processes the complete audio file in one go.
+        - Updates latency metrics.
     """
     global latency_ASR
     global latency_LM
     global latency_TTS
     global LLM_response_arr
     global total_response_arr
+    if audio_file is None:
+        gr.Info("Please upload an audio file.")
+        return "", "", None
+    # Extract sample rate and audio data
+    sr, y = audio_file
+    # Initialize chat with system prompt
     dialogue_model.chat.init_chat(
         {
             "role": "system",
+            "content": input_text,
         }
     )
+    # Initialize variables
+    asr_output_str = ""
+    text_str = ""
+    audio_output = None
+    audio_output1 = None
+    stream = y  # Use entire audio file as stream
+    # Process the audio file
     (
         asr_output_str,
         text_str,
         latency_LM,
         latency_TTS,
     )
+    # Store results
     if change:
+        print("Processing completed")
         if asr_output_str != "":
             total_response_arr.append(asr_output_str.replace("\n", " "))
         LLM_response_arr.append(text_str.replace("\n", " "))
         total_response_arr.append(text_str.replace("\n", " "))
+    return asr_output_str, text_str, audio_output
 # ------------------------
     ["Summarization", "You are summarizer. Summarize user's utterance."]
 ], columns=["Task", "LLM Prompt"])
 with gr.Blocks(
+    title="ESPnet-SDS Offline Audio Processing",
 ) as demo:
     with gr.Row():
         gr.Markdown(
             """
+            ## ESPnet-SDS (Offline Mode)
+            Welcome to our offline audio processing interface for various cascaded and
+            E2E spoken dialogue systems built using ESPnet-SDS toolkit.
+            **How to use:**
+            1. Upload or record an audio file
+            2. Configure the LLM prompt and select models
+            3. Click "Process Audio" to transcribe and generate a response
+            The system will:
+            - **Transcribe** your audio using ASR (Automatic Speech Recognition)
+            - **Generate** a response using the selected LLM
+            - **Synthesize** speech output using TTS (Text-to-Speech)
+            For more details, refer to the [README]
             (https://github.com/siddhu001/espnet/tree/sds_demo_recipe/egs2/TEMPLATE/sds1#how-to-use).
         """
         )
     with gr.Row():
         with gr.Column(scale=1):
             user_audio = gr.Audio(
+                sources=["upload", "microphone"],
+                type="numpy",
+                label="Upload or Record Audio File",
             )
+            process_btn = gr.Button("Process Audio", variant="primary")
             input_text=gr.Textbox(
                 label="LLM prompt",
                 visible=True,
                     visible=False,
                 )
         with gr.Column(scale=1):
+            output_asr_text = gr.Textbox(label="ASR Transcription", interactive=False)
+            output_text = gr.Textbox(label="LLM Response", interactive=False)
+            output_audio = gr.Audio(label="TTS Output", autoplay=True, visible=True, interactive=False)
             eval_radio = gr.Radio(
                 choices=[
                     "Latency",
                 visible=False,
             )
             output_eval_text = gr.Textbox(label="Evaluation Results", visible=False)
     natural_response = gr.Textbox(
         label="diversity_response", visible=False, interactive=False
     )
     ip_address = gr.Textbox(label="ip_address", visible=False, interactive=False)
+    # Process button click event
+    process_btn.click(
+        process_audio_file,
+        inputs=[user_audio, radio, ASR_radio, LLM_radio, type_radio, input_text],
+        outputs=[output_asr_text, output_text, output_audio],
     )
     radio.change(
         fn=dialogue_model.handle_TTS_selection,