Spaces:

Rcarvalo
/

speech-to-speech

Runtime error

App Files Files Community

Rcarvalo committed 8 days ago

Commit

5880918

verified ·

1 Parent(s): 6df4eaa

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +129 -141

app.py CHANGED Viewed

@@ -1,15 +1,12 @@
 """
-Real-time WebRTC speech-to-speech demo with fastrtc
-Based on the original liquid-audio demo
 """
-from queue import Queue
-from threading import Thread
 import gradio as gr
 import numpy as np
 import torch
-from fastrtc import AdditionalOutputs, ReplyOnPause, WebRTC
 from liquid_audio import ChatState, LFM2AudioModel, LFM2AudioProcessor, LFMModality
@@ -31,138 +28,132 @@ mimi = mimi.to(device)
 print(f"Models loaded on {device}")
-def chat_producer(
-    q: Queue[torch.Tensor | None],
-    chat: ChatState,
-    temp: float | None,
-    topk: int | None,
-):
-    """Producer thread that generates tokens"""
-    print(f"Starting generation with state {chat}.")
-    with torch.no_grad(), mimi.streaming(1):
         for t in model.generate_interleaved(
-            **chat,
             max_new_tokens=1024,
             audio_temperature=temp,
             audio_top_k=topk,
         ):
-            q.put(t)
-            if t.numel() > 1:
-                if (t == 2048).any():
-                    continue
-                wav_chunk = mimi.decode(t[None, :, None])[0]
-                q.put(wav_chunk)
-    q.put(None)
-def chat_response(audio: tuple[int, np.ndarray], _id: str, chat: ChatState, temp: float | None = 1.0, topk: int | None = 4):
-    """Handle incoming audio and generate streaming response"""
-    if temp == 0:
-        temp = None
-    if topk == 0:
-        topk = None
-    if temp is not None:
-        temp = float(temp)
-    if topk is not None:
-        topk = int(topk)
-    if len(chat.text) == 1:
-        chat.new_turn("system")
-        chat.add_text("Respond with interleaved text and audio.")
-        chat.end_turn()
-        chat.new_turn("user")
-    rate, wav = audio
-    # Convert to tensor with proper shape (channels, samples)
-    wav_tensor = torch.tensor(wav / 32_768, dtype=torch.float)
-    # Ensure correct shape
-    if len(wav_tensor.shape) == 1:
-        wav_tensor = wav_tensor.unsqueeze(0)
-    elif len(wav_tensor.shape) > 1:
-        # If stereo, convert to mono
-        wav_tensor = wav_tensor.mean(dim=-1, keepdim=True).T
-    chat.add_audio(wav_tensor, rate)
-    chat.end_turn()
-    chat.new_turn("assistant")
-    q: Queue[torch.Tensor | None] = Queue()
-    chat_thread = Thread(target=chat_producer, args=(q, chat, temp, topk))
-    chat_thread.start()
-    out_text: list[torch.Tensor] = []
-    out_audio: list[torch.Tensor] = []
-    out_modality: list[LFMModality] = []
-    while True:
-        t = q.get()
-        if t is None:
-            break
-        elif t.numel() == 1:  # text
-            out_text.append(t)
-            out_modality.append(LFMModality.TEXT)
-            print(processor.text.decode(t), end="")
-            cur_string = processor.text.decode(torch.cat(out_text)).removesuffix("<|text_end|>")
-            yield AdditionalOutputs(cur_string)
-        elif t.numel() == 8:
-            out_audio.append(t)
-            out_modality.append(LFMModality.AUDIO_OUT)
-        elif t.numel() == 1920:
-            np_chunk = (t.cpu().numpy() * 32_767).astype(np.int16)
-            yield (24_000, np_chunk)
-        else:
-            raise RuntimeError(f"unexpected shape: {t.shape}")
-    chat.append(
-        text=torch.stack(out_text, 1),
-        audio_out=torch.stack(out_audio, 1),
-        modality_flag=torch.tensor(out_modality, device=device),
-    )
-    chat.end_turn()
-    chat.new_turn("user")
-def clear():
-    """Clear chat history"""
-    gr.Info("Cleared chat history", duration=3)
-    return ChatState(processor), None
 # Create Gradio interface
-with gr.Blocks(title="LFM2-Audio Real-time Speech-to-Speech") as demo:
     gr.Markdown("""
-    # LFM2-Audio Real-time Speech-to-Speech Chat
-    **Real-time WebRTC streaming** powered by fastrtc - Talk naturally and get instant responses!
     **How to use:**
-    1. Click "Allow" when prompted for microphone access
-    2. Start speaking - the model listens and responds in real-time
-    3. The conversation flows naturally with minimal latency
-    **Features:**
-    - 🎙️ Real-time WebRTC streaming
-    - ⚡ Low latency response
-    - 💬 Interleaved text and audio output
-    - 🔄 Multi-turn conversations
     """)
     chat_state = gr.State(ChatState(processor))
     with gr.Row():
         with gr.Column():
-            webrtc = WebRTC(
-                modality="audio",
-                mode="send-receive",
-                full_screen=False,
             )
             with gr.Row():
@@ -172,7 +163,7 @@ with gr.Blocks(title="LFM2-Audio Real-time Speech-to-Speech") as demo:
                     value=1.0,
                     step=0.1,
                     label="Temperature (0 for greedy)",
-                    info="Higher = more creative"
                 )
                 top_k = gr.Slider(
                     minimum=0,
@@ -180,50 +171,47 @@ with gr.Blocks(title="LFM2-Audio Real-time Speech-to-Speech") as demo:
                     value=4,
                     step=1,
                     label="Top-k (0 for no filtering)",
-                    info="Sampling diversity"
                 )
-            clear_btn = gr.Button("Reset Chat")
         with gr.Column():
-            text_out = gr.Textbox(
-                lines=10,
-                label="Conversation Text",
                 interactive=False
             )
     gr.Markdown("""
-    ### About this demo
-    This demo uses **fastrtc** for WebRTC streaming, enabling real-time speech-to-speech interaction with minimal latency.
-    The model processes your speech and generates both text and audio responses simultaneously.
-    **Model**: LFM2-Audio-1.5B by Liquid AI
-    **Mode**: Interleaved generation (optimized for real-time)
-    **Audio Codec**: Mimi (24kHz)
-    [Liquid AI](https://www.liquid.ai/) | [GitHub](https://github.com/Liquid4All/liquid-audio/) | [Model Card](https://huggingface.co/LiquidAI/LFM2-Audio-1.5B)
     """)
-    # Setup WebRTC streaming
-    webrtc.stream(
-        ReplyOnPause(
-            chat_response,  # type: ignore[arg-type]
-            input_sample_rate=24_000,
-            output_sample_rate=24_000,
-            can_interrupt=False,
-        ),
-        inputs=[webrtc, chat_state, temperature, top_k],
-        outputs=[webrtc],
     )
-    webrtc.on_additional_outputs(
-        lambda s: s,
-        outputs=[text_out],
     )
-    clear_btn.click(clear, outputs=[chat_state, text_out])
 if __name__ == "__main__":
-    demo.launch()

 """
+Gradio app for LFM2-Audio speech-to-speech demo
+Compatible with Hugging Face Spaces
 """
 import gradio as gr
 import numpy as np
 import torch
+import torchaudio
 from liquid_audio import ChatState, LFM2AudioModel, LFM2AudioProcessor, LFMModality
 print(f"Models loaded on {device}")
+def generate_response(audio_input, temperature, top_k, chat_state):
+    """Generate one assistant reply (text and audio) for a recorded utterance.
+
+    Args:
+        audio_input: ``(sample_rate, samples)`` tuple as delivered by
+            ``gr.Audio(type="numpy")``, or ``None`` when nothing was recorded.
+        temperature: Audio sampling temperature; ``0`` means greedy decoding.
+        top_k: Audio top-k cutoff; ``0`` disables top-k filtering.
+        chat_state: Conversation ``ChatState``; mutated in place and returned.
+
+    Returns:
+        ``(audio_output, text, chat_state)``. ``audio_output`` is
+        ``(24000, samples)`` or ``None`` when no decodable audio frame was
+        generated.
+    """
+    if audio_input is None:
+        return None, "Please record audio first", chat_state
+    # Code 2048 is treated as the end-of-audio marker; frames containing it
+    # carry no audio and are filtered out before decoding.
+    end_of_audio_code = 2048
+    # Parse audio input
+    rate, wav = audio_input
+    # Scale signed-integer PCM of any width into [-1, 1); any other dtype is
+    # assumed to be floating-point PCM that is already normalised.
+    if np.issubdtype(wav.dtype, np.signedinteger):
+        full_scale = -float(np.iinfo(wav.dtype).min)  # 32768.0 for int16
+        wav_tensor = torch.tensor(wav / full_scale, dtype=torch.float32)
+    else:
+        wav_tensor = torch.tensor(wav, dtype=torch.float32)
+    # Multi-channel input arrives as (samples, channels): average to mono
+    if wav_tensor.dim() > 1:
+        wav_tensor = wav_tensor.mean(dim=-1)
+    # add_audio expects shape (channels, samples), so add channel dimension
+    if wav_tensor.dim() == 1:
+        wav_tensor = wav_tensor.unsqueeze(0)
+    # Initialize chat state if empty
+    if len(chat_state.text) == 1:
+        chat_state.new_turn("system")
+        chat_state.add_text("Respond with interleaved text and audio.")
+        chat_state.end_turn()
+    # Add user audio. The user turn is opened here and only here, so every
+    # request contributes exactly one user turn to the conversation.
+    chat_state.new_turn("user")
+    chat_state.add_audio(wav_tensor, rate)
+    chat_state.end_turn()
+    # Start assistant turn
+    chat_state.new_turn("assistant")
+    # Set generation parameters (None switches the corresponding filter off)
+    temp = None if temperature == 0 else float(temperature)
+    topk = None if top_k == 0 else int(top_k)
+    # Generate response
+    text_out = []
+    audio_out = []
+    modality_out = []
+    print("Generating response...")
+    with torch.no_grad():
         for t in model.generate_interleaved(
+            **chat_state,
             max_new_tokens=1024,
             audio_temperature=temp,
             audio_top_k=topk,
         ):
+            if t.numel() == 1:  # Text token
+                text_out.append(t)
+                modality_out.append(LFMModality.TEXT)
+                # Per-token decode is only for live console logging
+                print(processor.text.decode(t), end="", flush=True)
+            elif t.numel() == 8:  # Audio token (one frame of 8 codes)
+                audio_out.append(t)
+                modality_out.append(LFMModality.AUDIO_OUT)
+    print("\nGeneration complete")
+    # Decode the whole token sequence in one call so characters that span
+    # several tokens are reassembled correctly, then strip the end marker.
+    full_text = processor.text.decode(torch.cat(text_out)) if text_out else ""
+    full_text = full_text.replace("<|text_end|>", "").strip()
+    # Decode audio, skipping end-of-audio frames wherever they occur instead
+    # of blindly dropping the last frame (which loses real audio when
+    # generation stops at max_new_tokens).
+    audio_frames = [f for f in audio_out if not (f == end_of_audio_code).any()]
+    if audio_frames:
+        mimi_codes = torch.stack(audio_frames, 1).unsqueeze(0).to(device)
+        with torch.no_grad():
+            waveform = mimi.decode(mimi_codes)[0]
+        # Gradio expects (rate, data) with data laid out as (samples, channels)
+        audio_output = (24000, waveform.cpu().numpy().T)
+    else:
+        audio_output = None
+    # Record the assistant output in the chat state
+    if text_out and audio_out:
+        chat_state.append(
+            text=torch.stack(text_out, 1),
+            audio_out=torch.stack(audio_out, 1),
+            modality_flag=torch.tensor(modality_out, device=device),
+        )
+    # Close the assistant turn. The next user turn is opened by the next
+    # request, not here, so it is never opened twice.
+    chat_state.end_turn()
+    return audio_output, full_text, chat_state
+def reset_chat():
+    """Start a new conversation.
+
+    Returns a fresh ``ChatState``, an empty string and ``None``; the reset
+    button routes them to the chat state, the text box and the audio player.
+    """
+    fresh_state = ChatState(processor)
+    cleared_text = ""
+    cleared_audio = None
+    return fresh_state, cleared_text, cleared_audio
 # Create Gradio interface
+with gr.Blocks(title="LFM2-Audio Speech-to-Speech") as demo:
     gr.Markdown("""
+    # LFM2-Audio Speech-to-Speech Chat
+    Talk to LFM2-Audio! Record your voice and get a response with both text and audio.
     **How to use:**
+    1. Click the microphone button to record your voice
+    2. Adjust temperature and top-k parameters if needed (or leave defaults)
+    3. Click "Generate Response"
+    4. Listen to the audio response and read the text transcription
+    **Note:** This model runs on GPU. If you experience long wait times, the Space might be on CPU or heavily loaded.
     """)
     chat_state = gr.State(ChatState(processor))
     with gr.Row():
         with gr.Column():
+            audio_input = gr.Audio(
+                sources=["microphone"],
+                type="numpy",
+                label="Record your voice"
             )
             with gr.Row():
                     value=1.0,
                     step=0.1,
                     label="Temperature (0 for greedy)",
+                    info="Higher = more creative, lower = more deterministic"
                 )
                 top_k = gr.Slider(
                     minimum=0,
                     value=4,
                     step=1,
                     label="Top-k (0 for no filtering)",
+                    info="Number of top tokens to sample from"
                 )
+            generate_btn = gr.Button("Generate Response", variant="primary")
+            reset_btn = gr.Button("Reset Chat")
         with gr.Column():
+            text_output = gr.Textbox(
+                label="Assistant Response (Text)",
+                lines=4,
+                interactive=False
+            )
+            audio_output = gr.Audio(
+                label="Assistant Response (Audio)",
+                type="numpy",
                 interactive=False
             )
     gr.Markdown("""
+    ### About LFM2-Audio
+    LFM2-Audio-1.5B is Liquid AI's first end-to-end audio foundation model. It supports:
+    - Real-time speech-to-speech conversations
+    - Low-latency interleaved text and audio generation
+    - Natural flowing conversations
+    [Learn more](https://www.liquid.ai/) | [GitHub](https://github.com/Liquid4All/liquid-audio/)
     """)
+    # Event handlers
+    generate_btn.click(
+        fn=generate_response,
+        inputs=[audio_input, temperature, top_k, chat_state],
+        outputs=[audio_output, text_output, chat_state]
     )
+    reset_btn.click(
+        fn=reset_chat,
+        outputs=[chat_state, text_output, audio_output]
     )
 if __name__ == "__main__":
+    demo.launch()