# app.py — LFM2-Audio speech-to-speech Gradio demo (uploaded via huggingface_hub)
"""
Gradio app for LFM2-Audio speech-to-speech demo
Compatible with Hugging Face Spaces
"""
import gradio as gr
import numpy as np
import torch
import torchaudio
from liquid_audio import ChatState, LFM2AudioModel, LFM2AudioProcessor, LFMModality
# Load models
# Hugging Face repo that hosts the processor, the LM, and the Mimi codec weights.
HF_REPO = "LiquidAI/LFM2-Audio-1.5B"
print("Loading processor...")
# Text tokenizer + audio front-end; .eval() switches off training-mode layers.
processor = LFM2AudioProcessor.from_pretrained(HF_REPO).eval()
print("Loading model...")
# The end-to-end audio language model used by generate_interleaved().
model = LFM2AudioModel.from_pretrained(HF_REPO).eval()
print("Loading audio codec...")
# Mimi neural codec, reused from the processor; decodes audio tokens to waveform.
mimi = processor.mimi.eval()
# Move to CUDA if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
mimi = mimi.to(device)
print(f"Models loaded on {device}")
def generate_response(audio_input, temperature, top_k, chat_state):
    """Run one speech-to-speech turn against the loaded LFM2-Audio model.

    Args:
        audio_input: Gradio ``type="numpy"`` audio — a ``(sample_rate, samples)``
            tuple, or ``None`` if nothing was recorded.
        temperature: audio sampling temperature; ``0`` selects greedy decoding.
        top_k: audio top-k cutoff; ``0`` disables top-k filtering.
        chat_state: the running ``ChatState`` carried in ``gr.State``.

    Returns:
        ``(audio_output, full_text, chat_state)`` where ``audio_output`` is a
        ``(24000, ndarray)`` tuple for ``gr.Audio`` (or ``None`` when the model
        produced no audio) and ``full_text`` is the cleaned text transcript.
    """
    if audio_input is None:
        return None, "Please record audio first", chat_state

    # Parse audio input: Gradio numpy audio is (rate, samples).
    rate, wav = audio_input

    # Convert to torch tensor; normalize 16-bit PCM into [-1, 1).
    if wav.dtype == np.int16:
        wav_tensor = torch.tensor(wav / 32768.0, dtype=torch.float32)
    else:
        wav_tensor = torch.tensor(wav, dtype=torch.float32)

    # Ensure mono: downmix (samples, channels) recordings by averaging channels.
    if len(wav_tensor.shape) > 1:
        wav_tensor = wav_tensor.mean(dim=-1)
    # add_audio expects shape (channels, samples), so add channel dimension.
    if len(wav_tensor.shape) == 1:
        wav_tensor = wav_tensor.unsqueeze(0)

    # Initialize chat state if empty: seed the system prompt exactly once.
    # (A fresh ChatState appears to hold a single text entry — TODO confirm
    # against the liquid_audio ChatState implementation.)
    if len(chat_state.text) == 1:
        chat_state.new_turn("system")
        chat_state.add_text("Respond with interleaved text and audio.")
        chat_state.end_turn()

    # Add user audio as a complete turn.
    chat_state.new_turn("user")
    chat_state.add_audio(wav_tensor, rate)
    chat_state.end_turn()

    # Start assistant turn; the model's tokens are appended below.
    chat_state.new_turn("assistant")

    # Slider value 0 means "feature disabled" for both parameters.
    temp = None if temperature == 0 else float(temperature)
    topk = None if top_k == 0 else int(top_k)

    # Generate response, collecting text tokens, audio frames, and a parallel
    # modality flag list for chat_state.append().
    text_out = []
    audio_out = []
    modality_out = []
    full_text = ""
    print("Generating response...")
    with torch.no_grad():
        for t in model.generate_interleaved(
            **chat_state,
            max_new_tokens=1024,
            audio_temperature=temp,
            audio_top_k=topk,
        ):
            if t.numel() == 1:  # Text token: single element
                text_out.append(t)
                modality_out.append(LFMModality.TEXT)
                decoded = processor.text.decode(t)
                full_text += decoded
                print(decoded, end="", flush=True)
            elif t.numel() == 8:  # Audio token: one frame of 8 codebook ids
                audio_out.append(t)
                modality_out.append(LFMModality.AUDIO_OUT)
    print("\nGeneration complete")

    # Clean up text: strip the end-of-text marker and surrounding whitespace.
    full_text = full_text.replace("<|text_end|>", "").strip()

    # Decode audio (remove last end-of-audio token) through the Mimi codec.
    if len(audio_out) > 1:
        mimi_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0).to(device)
        with torch.no_grad():
            waveform = mimi.decode(mimi_codes)[0]
        # Convert to numpy for Gradio.
        audio_np = waveform.cpu().numpy()
        audio_output = (24000, audio_np.T)  # Gradio expects (rate, data)
    else:
        audio_output = None

    # Update chat state with the assistant's generated tokens so the next
    # turn has full conversational context.
    if text_out and audio_out:
        chat_state.append(
            text=torch.stack(text_out, 1),
            audio_out=torch.stack(audio_out, 1),
            modality_flag=torch.tensor(modality_out, device=device),
        )
    # NOTE(review): new_turn("user") here plus the new_turn("user") at the top
    # of the next call may open two user turns — verify against ChatState's
    # turn semantics.
    chat_state.end_turn()
    chat_state.new_turn("user")

    return audio_output, full_text, chat_state
def reset_chat():
    """Start the conversation over.

    Returns a fresh ``ChatState`` plus empty text and audio values so the
    bound Gradio outputs (state, textbox, audio player) are all cleared.
    """
    fresh_state = ChatState(processor)
    return fresh_state, "", None
# Create Gradio interface.
# Layout: left column = recorder + sampling controls + buttons,
# right column = text transcript + synthesized audio reply.
with gr.Blocks(title="LFM2-Audio Speech-to-Speech") as demo:
    gr.Markdown("""
# LFM2-Audio Speech-to-Speech Chat
Talk to LFM2-Audio! Record your voice and get a response with both text and audio.
**How to use:**
1. Click the microphone button to record your voice
2. Adjust temperature and top-k parameters if needed (or leave defaults)
3. Click "Generate Response"
4. Listen to the audio response and read the text transcription
**Note:** This model runs on GPU. If you experience long wait times, the Space might be on CPU or heavily loaded.
""")

    # Per-session conversation history, threaded through generate_response.
    chat_state = gr.State(ChatState(processor))

    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(
                sources=["microphone"],
                type="numpy",
                label="Record your voice"
            )
            with gr.Row():
                # 0 is a sentinel meaning "greedy" (see generate_response).
                temperature = gr.Slider(
                    minimum=0,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Temperature (0 for greedy)",
                    info="Higher = more creative, lower = more deterministic"
                )
                # 0 is a sentinel meaning "no top-k filtering".
                top_k = gr.Slider(
                    minimum=0,
                    maximum=100,
                    value=4,
                    step=1,
                    label="Top-k (0 for no filtering)",
                    info="Number of top tokens to sample from"
                )
            generate_btn = gr.Button("Generate Response", variant="primary")
            reset_btn = gr.Button("Reset Chat")
        with gr.Column():
            text_output = gr.Textbox(
                label="Assistant Response (Text)",
                lines=4,
                interactive=False
            )
            audio_output = gr.Audio(
                label="Assistant Response (Audio)",
                type="numpy",
                interactive=False
            )

    gr.Markdown("""
### About LFM2-Audio
LFM2-Audio-1.5B is Liquid AI's first end-to-end audio foundation model. It supports:
- Real-time speech-to-speech conversations
- Low-latency interleaved text and audio generation
- Natural flowing conversations
[Learn more](https://www.liquid.ai/) | [GitHub](https://github.com/Liquid4All/liquid-audio/)
""")

    # Event handlers
    generate_btn.click(
        fn=generate_response,
        inputs=[audio_input, temperature, top_k, chat_state],
        outputs=[audio_output, text_output, chat_state]
    )
    reset_btn.click(
        fn=reset_chat,
        outputs=[chat_state, text_output, audio_output]
    )

if __name__ == "__main__":
    demo.launch()