Spaces:

Rcarvalo
/

speech-to-speech

Runtime error

App Files Files Community

Rcarvalo committed 8 days ago

Commit

71c51fd

verified ·

1 Parent(s): 099ceaf

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +213 -0

app.py ADDED Viewed

	@@ -0,0 +1,213 @@

+"""
+Gradio app for LFM2-Audio speech-to-speech demo
+Compatible with Hugging Face Spaces
+"""
+import gradio as gr
+import numpy as np
+import torch
+import torchaudio
+from liquid_audio import ChatState, LFM2AudioModel, LFM2AudioProcessor, LFMModality
+# Load models
+HF_REPO = "LiquidAI/LFM2-Audio-1.5B"
+print("Loading processor...")
+processor = LFM2AudioProcessor.from_pretrained(HF_REPO).eval()
+print("Loading model...")
+model = LFM2AudioModel.from_pretrained(HF_REPO).eval()
+print("Loading audio codec...")
+mimi = processor.mimi.eval()
+# Move to CUDA if available
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model = model.to(device)
+mimi = mimi.to(device)
+print(f"Models loaded on {device}")
+def generate_response(audio_input, temperature, top_k, chat_state):
+    """Generate speech-to-speech response"""
+    if audio_input is None:
+        return None, "Please record audio first", chat_state
+    # Parse audio input
+    rate, wav = audio_input
+    # Convert to torch tensor
+    if wav.dtype == np.int16:
+        wav_tensor = torch.tensor(wav / 32768.0, dtype=torch.float32)
+    else:
+        wav_tensor = torch.tensor(wav, dtype=torch.float32)
+    # Ensure mono
+    if len(wav_tensor.shape) > 1:
+        wav_tensor = wav_tensor.mean(dim=-1)
+    # Initialize chat state if empty
+    if len(chat_state.text) == 1:
+        chat_state.new_turn("system")
+        chat_state.add_text("Respond with interleaved text and audio.")
+        chat_state.end_turn()
+    # Add user audio
+    chat_state.new_turn("user")
+    chat_state.add_audio(wav_tensor, rate)
+    chat_state.end_turn()
+    # Start assistant turn
+    chat_state.new_turn("assistant")
+    # Set generation parameters
+    temp = None if temperature == 0 else float(temperature)
+    topk = None if top_k == 0 else int(top_k)
+    # Generate response
+    text_out = []
+    audio_out = []
+    modality_out = []
+    full_text = ""
+    print("Generating response...")
+    with torch.no_grad():
+        for t in model.generate_interleaved(
+            **chat_state,
+            max_new_tokens=1024,
+            audio_temperature=temp,
+            audio_top_k=topk,
+        ):
+            if t.numel() == 1:  # Text token
+                text_out.append(t)
+                modality_out.append(LFMModality.TEXT)
+                decoded = processor.text.decode(t)
+                full_text += decoded
+                print(decoded, end="", flush=True)
+            elif t.numel() == 8:  # Audio token
+                audio_out.append(t)
+                modality_out.append(LFMModality.AUDIO_OUT)
+    print("\nGeneration complete")
+    # Clean up text
+    full_text = full_text.replace("<|text_end|>", "").strip()
+    # Decode audio (remove last end-of-audio token)
+    if len(audio_out) > 1:
+        mimi_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0).to(device)
+        with torch.no_grad():
+            waveform = mimi.decode(mimi_codes)[0]
+        # Convert to numpy for Gradio
+        audio_np = waveform.cpu().numpy()
+        audio_output = (24000, audio_np.T)  # Gradio expects (rate, data)
+    else:
+        audio_output = None
+    # Update chat state
+    if text_out and audio_out:
+        chat_state.append(
+            text=torch.stack(text_out, 1),
+            audio_out=torch.stack(audio_out, 1),
+            modality_flag=torch.tensor(modality_out, device=device),
+        )
+    chat_state.end_turn()
+    chat_state.new_turn("user")
+    return audio_output, full_text, chat_state
+def reset_chat():
+    """Reset chat state"""
+    return ChatState(processor), "", None
+# Create Gradio interface
+with gr.Blocks(title="LFM2-Audio Speech-to-Speech") as demo:
+    gr.Markdown("""
+    # LFM2-Audio Speech-to-Speech Chat
+    Talk to LFM2-Audio! Record your voice and get a response with both text and audio.
+    **How to use:**
+    1. Click the microphone button to record your voice
+    2. Adjust temperature and top-k parameters if needed (or leave defaults)
+    3. Click "Generate Response"
+    4. Listen to the audio response and read the text transcription
+    **Note:** This model runs on GPU. If you experience long wait times, the Space might be on CPU or heavily loaded.
+    """)
+    chat_state = gr.State(ChatState(processor))
+    with gr.Row():
+        with gr.Column():
+            audio_input = gr.Audio(
+                sources=["microphone"],
+                type="numpy",
+                label="Record your voice"
+            )
+            with gr.Row():
+                temperature = gr.Slider(
+                    minimum=0,
+                    maximum=2.0,
+                    value=1.0,
+                    step=0.1,
+                    label="Temperature (0 for greedy)",
+                    info="Higher = more creative, lower = more deterministic"
+                )
+                top_k = gr.Slider(
+                    minimum=0,
+                    maximum=100,
+                    value=4,
+                    step=1,
+                    label="Top-k (0 for no filtering)",
+                    info="Number of top tokens to sample from"
+                )
+            generate_btn = gr.Button("Generate Response", variant="primary")
+            reset_btn = gr.Button("Reset Chat")
+        with gr.Column():
+            text_output = gr.Textbox(
+                label="Assistant Response (Text)",
+                lines=4,
+                interactive=False
+            )
+            audio_output = gr.Audio(
+                label="Assistant Response (Audio)",
+                type="numpy",
+                interactive=False
+            )
+    gr.Markdown("""
+    ### About LFM2-Audio
+    LFM2-Audio-1.5B is Liquid AI's first end-to-end audio foundation model. It supports:
+    - Real-time speech-to-speech conversations
+    - Low-latency interleaved text and audio generation
+    - Natural flowing conversations
+    [Learn more](https://www.liquid.ai/) | [GitHub](https://github.com/Liquid4All/liquid-audio/)
+    """)
+    # Event handlers
+    generate_btn.click(
+        fn=generate_response,
+        inputs=[audio_input, temperature, top_k, chat_state],
+        outputs=[audio_output, text_output, chat_state]
+    )
+    reset_btn.click(
+        fn=reset_chat,
+        outputs=[chat_state, text_output, audio_output]
+    )
+if __name__ == "__main__":
+    demo.launch()