""" Gradio app for LFM2-Audio speech-to-speech demo Compatible with Hugging Face Spaces """ import gradio as gr import numpy as np import torch import torchaudio from liquid_audio import ChatState, LFM2AudioModel, LFM2AudioProcessor, LFMModality # Load models HF_REPO = "LiquidAI/LFM2-Audio-1.5B" print("Loading processor...") processor = LFM2AudioProcessor.from_pretrained(HF_REPO).eval() print("Loading model...") model = LFM2AudioModel.from_pretrained(HF_REPO).eval() print("Loading audio codec...") mimi = processor.mimi.eval() # Move to CUDA if available device = "cuda" if torch.cuda.is_available() else "cpu" model = model.to(device) mimi = mimi.to(device) print(f"Models loaded on {device}") def generate_response(audio_input, temperature, top_k, chat_state): """Generate speech-to-speech response""" if audio_input is None: return None, "Please record audio first", chat_state # Parse audio input rate, wav = audio_input # Convert to torch tensor if wav.dtype == np.int16: wav_tensor = torch.tensor(wav / 32768.0, dtype=torch.float32) else: wav_tensor = torch.tensor(wav, dtype=torch.float32) # Ensure mono and correct shape (channels, samples) if len(wav_tensor.shape) > 1: wav_tensor = wav_tensor.mean(dim=-1) # add_audio expects shape (channels, samples), so add channel dimension if len(wav_tensor.shape) == 1: wav_tensor = wav_tensor.unsqueeze(0) # Initialize chat state if empty if len(chat_state.text) == 1: chat_state.new_turn("system") chat_state.add_text("Respond with interleaved text and audio.") chat_state.end_turn() # Add user audio chat_state.new_turn("user") chat_state.add_audio(wav_tensor, rate) chat_state.end_turn() # Start assistant turn chat_state.new_turn("assistant") # Set generation parameters temp = None if temperature == 0 else float(temperature) topk = None if top_k == 0 else int(top_k) # Generate response text_out = [] audio_out = [] modality_out = [] full_text = "" print("Generating response...") with torch.no_grad(): for t in model.generate_interleaved( **chat_state, max_new_tokens=1024, audio_temperature=temp, audio_top_k=topk, ): if t.numel() == 1: # Text token text_out.append(t) modality_out.append(LFMModality.TEXT) decoded = processor.text.decode(t) full_text += decoded print(decoded, end="", flush=True) elif t.numel() == 8: # Audio token audio_out.append(t) modality_out.append(LFMModality.AUDIO_OUT) print("\nGeneration complete") # Clean up text full_text = full_text.replace("<|text_end|>", "").strip() # Decode audio (remove last end-of-audio token) if len(audio_out) > 1: mimi_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0).to(device) with torch.no_grad(): waveform = mimi.decode(mimi_codes)[0] # Convert to numpy for Gradio audio_np = waveform.cpu().numpy() audio_output = (24000, audio_np.T) # Gradio expects (rate, data) else: audio_output = None # Update chat state if text_out and audio_out: chat_state.append( text=torch.stack(text_out, 1), audio_out=torch.stack(audio_out, 1), modality_flag=torch.tensor(modality_out, device=device), ) chat_state.end_turn() chat_state.new_turn("user") return audio_output, full_text, chat_state def reset_chat(): """Reset chat state""" return ChatState(processor), "", None # Create Gradio interface with gr.Blocks(title="LFM2-Audio Speech-to-Speech") as demo: gr.Markdown(""" # LFM2-Audio Speech-to-Speech Chat Talk to LFM2-Audio! Record your voice and get a response with both text and audio. **How to use:** 1. Click the microphone button to record your voice 2. Adjust temperature and top-k parameters if needed (or leave defaults) 3. Click "Generate Response" 4. Listen to the audio response and read the text transcription **Note:** This model runs on GPU. If you experience long wait times, the Space might be on CPU or heavily loaded. """) chat_state = gr.State(ChatState(processor)) with gr.Row(): with gr.Column(): audio_input = gr.Audio( sources=["microphone"], type="numpy", label="Record your voice" ) with gr.Row(): temperature = gr.Slider( minimum=0, maximum=2.0, value=1.0, step=0.1, label="Temperature (0 for greedy)", info="Higher = more creative, lower = more deterministic" ) top_k = gr.Slider( minimum=0, maximum=100, value=4, step=1, label="Top-k (0 for no filtering)", info="Number of top tokens to sample from" ) generate_btn = gr.Button("Generate Response", variant="primary") reset_btn = gr.Button("Reset Chat") with gr.Column(): text_output = gr.Textbox( label="Assistant Response (Text)", lines=4, interactive=False ) audio_output = gr.Audio( label="Assistant Response (Audio)", type="numpy", interactive=False ) gr.Markdown(""" ### About LFM2-Audio LFM2-Audio-1.5B is Liquid AI's first end-to-end audio foundation model. It supports: - Real-time speech-to-speech conversations - Low-latency interleaved text and audio generation - Natural flowing conversations [Learn more](https://www.liquid.ai/) | [GitHub](https://github.com/Liquid4All/liquid-audio/) """) # Event handlers generate_btn.click( fn=generate_response, inputs=[audio_input, temperature, top_k, chat_state], outputs=[audio_output, text_output, chat_state] ) reset_btn.click( fn=reset_chat, outputs=[chat_state, text_output, audio_output] ) if __name__ == "__main__": demo.launch()