Spaces:
Runtime error
Runtime error
| """ | |
| Gradio app for LFM2-Audio speech-to-speech demo | |
| Compatible with Hugging Face Spaces | |
| """ | |
| import gradio as gr | |
| import numpy as np | |
| import torch | |
| import torchaudio | |
| from liquid_audio import ChatState, LFM2AudioModel, LFM2AudioProcessor, LFMModality | |
# Hugging Face Hub repository that provides the processor, model, and codec.
HF_REPO = "LiquidAI/LFM2-Audio-1.5B"

# NOTE(review): the hosting page reports a runtime error. If
# `from_pretrained` defaults to a CUDA device, loading would fail on
# CPU-only hardware before the device selection below runs — confirm
# against the liquid_audio API.
print("Loading processor...")
processor = LFM2AudioProcessor.from_pretrained(HF_REPO)
processor = processor.eval()

print("Loading model...")
model = LFM2AudioModel.from_pretrained(HF_REPO)
model = model.eval()

print("Loading audio codec...")
# The audio codec is taken from the processor rather than loaded separately.
mimi = processor.mimi
mimi = mimi.eval()

# Use the GPU when one is visible to torch, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model, mimi = model.to(device), mimi.to(device)
print(f"Models loaded on {device}")
# Sample rate handed to Gradio together with the decoded waveform.
_OUTPUT_SAMPLE_RATE = 24000
# Status text shown when there is nothing to process.
_NO_AUDIO_MESSAGE = "Please record audio first"


def _prepare_waveform(wav):
    """Convert a Gradio numpy recording into a float32 (channels, samples) tensor."""
    if np.issubdtype(wav.dtype, np.signedinteger):
        # Scale signed PCM by its full-scale value; for int16 this is the
        # usual 32768 divisor, and wider integer recordings are now
        # normalized as well instead of being passed through unscaled.
        full_scale = float(np.iinfo(wav.dtype).max) + 1.0
        wav_tensor = torch.tensor(wav / full_scale, dtype=torch.float32)
    else:
        wav_tensor = torch.tensor(wav, dtype=torch.float32)

    # Down-mix multi-channel input by averaging the last axis
    # (assumes Gradio delivers (samples, channels) — TODO confirm).
    if wav_tensor.ndim > 1:
        wav_tensor = wav_tensor.mean(dim=-1)
    # add_audio is given a leading channel dimension.
    if wav_tensor.ndim == 1:
        wav_tensor = wav_tensor.unsqueeze(0)
    return wav_tensor


def generate_response(audio_input, temperature, top_k, chat_state: "ChatState"):
    """Generate a spoken and written assistant reply to one recorded user turn.

    Args:
        audio_input: ``(sample_rate, samples)`` pair from the Gradio audio
            widget, or ``None`` when nothing was recorded.
        temperature: Audio sampling temperature; ``0`` is passed to the model
            as ``None``.
        top_k: Audio top-k cutoff; ``0`` is passed to the model as ``None``.
        chat_state: Conversation state for this session; it is mutated in
            place and also returned.

    Returns:
        A tuple ``(audio, text, chat_state)``. ``audio`` is
        ``(24000, waveform)`` for Gradio, or ``None`` when there was no usable
        input or fewer than two audio frames were generated. ``text`` is the
        decoded reply, or a prompt asking the user to record audio.
    """
    if audio_input is None:
        return None, _NO_AUDIO_MESSAGE, chat_state

    rate, wav = audio_input
    # A zero-length recording carries nothing to respond to; bail out before
    # touching the conversation history.
    if wav.size == 0:
        return None, _NO_AUDIO_MESSAGE, chat_state

    wav_tensor = _prepare_waveform(wav)

    # Seed the conversation with the system prompt.
    # NOTE(review): generated text is stacked along dim 1 further down, so
    # chat_state.text may be 2-D and len() may not track the token count —
    # confirm this check really detects a fresh state.
    if len(chat_state.text) == 1:
        chat_state.new_turn("system")
        chat_state.add_text("Respond with interleaved text and audio.")
        chat_state.end_turn()

    # Every call opens and closes its own user turn.
    chat_state.new_turn("user")
    chat_state.add_audio(wav_tensor, rate)
    chat_state.end_turn()

    chat_state.new_turn("assistant")

    # A slider value of 0 means "unset" for the corresponding parameter.
    temp = None if temperature == 0 else float(temperature)
    topk = None if top_k == 0 else int(top_k)

    text_out = []
    audio_out = []
    modality_out = []
    text_pieces = []

    print("Generating response...")
    with torch.no_grad():
        for t in model.generate_interleaved(
            **chat_state,
            max_new_tokens=1024,
            audio_temperature=temp,
            audio_top_k=topk,
        ):
            if t.numel() == 1:  # Text token
                text_out.append(t)
                modality_out.append(LFMModality.TEXT)
                decoded = processor.text.decode(t)
                text_pieces.append(decoded)
                print(decoded, end="", flush=True)
            elif t.numel() == 8:  # Audio frame (8 codes)
                audio_out.append(t)
                modality_out.append(LFMModality.AUDIO_OUT)
    print("\nGeneration complete")

    full_text = "".join(text_pieces).replace("<|text_end|>", "").strip()

    # Decode audio, leaving out the final frame (the end-of-audio marker).
    if len(audio_out) > 1:
        mimi_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0).to(device)
        with torch.no_grad():
            waveform = mimi.decode(mimi_codes)[0]
        # Cast to float32 first so the numpy conversion cannot fail on a
        # reduced-precision dtype; this is a no-op for float32 output.
        audio_np = waveform.cpu().float().numpy()
        audio_output = (_OUTPUT_SAMPLE_RATE, audio_np.T)  # Gradio expects (rate, data)
    else:
        audio_output = None

    # Record the reply in the history. Stacking needs at least one token of
    # each modality, hence the guard.
    if text_out and audio_out:
        chat_state.append(
            text=torch.stack(text_out, 1),
            audio_out=torch.stack(audio_out, 1),
            modality_flag=torch.tensor(modality_out, device=device),
        )
    # Always close the assistant turn opened above. No user turn is opened
    # here: the next call opens its own, and opening one in both places would
    # put two consecutive user headers into the history.
    chat_state.end_turn()

    return audio_output, full_text, chat_state
def reset_chat():
    """Start the conversation over.

    Returns:
        A tuple of a newly constructed ``ChatState`` bound to the module-level
        processor, an empty string for the text box, and ``None`` for the
        audio player.
    """
    fresh_state = ChatState(processor)
    cleared_text = ""
    cleared_audio = None
    return fresh_state, cleared_text, cleared_audio
# Build the Gradio UI: inputs on the left, assistant outputs on the right.
with gr.Blocks(title="LFM2-Audio Speech-to-Speech") as demo:
    gr.Markdown("""
    # LFM2-Audio Speech-to-Speech Chat
    Talk to LFM2-Audio! Record your voice and get a response with both text and audio.
    **How to use:**
    1. Click the microphone button to record your voice
    2. Adjust temperature and top-k parameters if needed (or leave defaults)
    3. Click "Generate Response"
    4. Listen to the audio response and read the text transcription
    **Note:** This model runs on GPU. If you experience long wait times, the Space might be on CPU or heavily loaded.
    """)

    # Conversation history carried between button clicks.
    # NOTE(review): Gradio's State docs say the initial value is deep-copied
    # per session — confirm ChatState copies cleanly.
    chat_state = gr.State(ChatState(processor))

    with gr.Row():
        # Input column: recorder, sampling controls, action buttons.
        with gr.Column():
            audio_input = gr.Audio(
                label="Record your voice",
                type="numpy",
                sources=["microphone"],
            )
            with gr.Row():
                temperature = gr.Slider(
                    label="Temperature (0 for greedy)",
                    info="Higher = more creative, lower = more deterministic",
                    minimum=0,
                    maximum=2.0,
                    step=0.1,
                    value=1.0,
                )
                top_k = gr.Slider(
                    label="Top-k (0 for no filtering)",
                    info="Number of top tokens to sample from",
                    minimum=0,
                    maximum=100,
                    step=1,
                    value=4,
                )
            generate_btn = gr.Button("Generate Response", variant="primary")
            reset_btn = gr.Button("Reset Chat")

        # Output column: read-only transcript and playback.
        with gr.Column():
            text_output = gr.Textbox(
                label="Assistant Response (Text)",
                interactive=False,
                lines=4,
            )
            audio_output = gr.Audio(
                label="Assistant Response (Audio)",
                interactive=False,
                type="numpy",
            )

    gr.Markdown("""
    ### About LFM2-Audio
    LFM2-Audio-1.5B is Liquid AI's first end-to-end audio foundation model. It supports:
    - Real-time speech-to-speech conversations
    - Low-latency interleaved text and audio generation
    - Natural flowing conversations
    [Learn more](https://www.liquid.ai/) | [GitHub](https://github.com/Liquid4All/liquid-audio/)
    """)

    # Wire the buttons to their handlers.
    generation_inputs = [audio_input, temperature, top_k, chat_state]
    generation_outputs = [audio_output, text_output, chat_state]
    generate_btn.click(
        fn=generate_response,
        inputs=generation_inputs,
        outputs=generation_outputs,
    )

    reset_outputs = [chat_state, text_output, audio_output]
    reset_btn.click(fn=reset_chat, outputs=reset_outputs)

# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()