"""
Real-time WebRTC speech-to-speech demo with fastrtc
Based on the original liquid-audio demo
"""
from queue import Queue
from threading import Thread
import gradio as gr
import numpy as np
import torch
from fastrtc import AdditionalOutputs, ReplyOnPause, WebRTC
from liquid_audio import ChatState, LFM2AudioModel, LFM2AudioProcessor, LFMModality
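
# liquid_audio supplies the LFM2-Audio model, its processor (text tokenizer plus
# the Mimi audio codec), the multi-turn ChatState, and the LFMModality flags;
# fastrtc supplies the WebRTC Gradio component and pause-based turn taking.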
# Load models
HF_REPO = "LiquidAI/LFM2-Audio-1.5B"
print("Loading processor...")
processor = LFM2AudioProcessor.from_pretrained(HF_REPO).eval()
print("Loading model...")
model = LFM2AudioModel.from_pretrained(HF_REPO).eval()
print("Loading audio codec...")
mimi = processor.mimi.eval()
# Move to CUDA if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
mimi = mimi.to(device)
print(f"Models loaded on {device}")
def chat_producer(
    q: Queue[torch.Tensor | None],
    chat: ChatState,
    temp: float | None,
    topk: int | None,
):
    """Producer thread that generates tokens and decoded audio chunks."""
    print(f"Starting generation with state {chat}.")
    with torch.no_grad(), mimi.streaming(1):
        for t in model.generate_interleaved(
            **chat,
            max_new_tokens=1024,
            audio_temperature=temp,
            audio_top_k=topk,
        ):
            q.put(t)
            if t.numel() > 1:
                # Frames containing the special code 2048 are skipped rather
                # than decoded; everything else becomes a waveform chunk.
                if (t == 2048).any():
                    continue
                wav_chunk = mimi.decode(t[None, :, None])[0]
                q.put(wav_chunk)
    q.put(None)  # sentinel: generation is finished
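
# Queue protocol, distinguished by tensor size in the consumer below:
#   numel == 1    -> a text token id
#   numel == 8    -> one audio-codec frame (8 codes per generation step)
#   numel == 1920 -> a decoded PCM chunk (1920 samples = 80 ms at 24 kHz)
#   None          -> generation finished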
def chat_response(
    audio: tuple[int, np.ndarray],
    _id: str,
    chat: ChatState,
    temp: float | None = 1.0,
    topk: int | None = 4,
):
    """Handle incoming audio and generate a streaming response."""
    # A slider value of 0 means "disabled" for both sampling parameters.
    if temp == 0:
        temp = None
    if topk == 0:
        topk = None
    if temp is not None:
        temp = float(temp)
    if topk is not None:
        topk = int(topk)
    # On the first turn, prepend the system prompt.
    if len(chat.text) == 1:
        chat.new_turn("system")
        chat.add_text("Respond with interleaved text and audio.")
        chat.end_turn()
    chat.new_turn("user")
    rate, wav = audio
    # Convert int16 PCM to a float tensor in [-1, 1] with shape (channels, samples).
    wav_tensor = torch.tensor(wav / 32_768, dtype=torch.float)
    if len(wav_tensor.shape) == 1:
        wav_tensor = wav_tensor.unsqueeze(0)
    elif len(wav_tensor.shape) > 1:
        # Downmix stereo (samples, channels) to mono (1, samples).
        wav_tensor = wav_tensor.mean(dim=-1, keepdim=True).T
    chat.add_audio(wav_tensor, rate)
    chat.end_turn()
    chat.new_turn("assistant")
    # Generate on a separate thread so this handler can stream results as they arrive.
    q: Queue[torch.Tensor | None] = Queue()
    chat_thread = Thread(target=chat_producer, args=(q, chat, temp, topk))
    chat_thread.start()
    out_text: list[torch.Tensor] = []
    out_audio: list[torch.Tensor] = []
    out_modality: list[LFMModality] = []
    while True:
        t = q.get()
        if t is None:  # end-of-generation sentinel
            break
        elif t.numel() == 1:  # text token
            out_text.append(t)
            out_modality.append(LFMModality.TEXT)
            print(processor.text.decode(t), end="")
            cur_string = processor.text.decode(torch.cat(out_text)).removesuffix("<|text_end|>")
            yield AdditionalOutputs(cur_string)
        elif t.numel() == 8:  # audio-codec frame
            out_audio.append(t)
            out_modality.append(LFMModality.AUDIO_OUT)
        elif t.numel() == 1920:  # decoded PCM chunk
            np_chunk = (t.cpu().numpy() * 32_767).astype(np.int16)
            yield (24_000, np_chunk)
        else:
            raise RuntimeError(f"unexpected shape: {t.shape}")
    # Record the assistant turn in the chat state and open the next user turn.
    chat.append(
        text=torch.stack(out_text, 1),
        audio_out=torch.stack(out_audio, 1),
        modality_flag=torch.tensor(out_modality, device=device),
    )
    chat.end_turn()
    chat.new_turn("user")
def clear():
    """Clear chat history"""
    gr.Info("Cleared chat history", duration=3)
    return ChatState(processor), None
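
# The ChatState lives in gr.State, so each browser session gets its own
# conversation history while sharing the single loaded model.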
# Create Gradio interface
with gr.Blocks(title="LFM2-Audio Real-time Speech-to-Speech") as demo:
    gr.Markdown("""
# LFM2-Audio Real-time Speech-to-Speech Chat

**Real-time WebRTC streaming** powered by fastrtc. Talk naturally and get instant responses!

**How to use:**

1. Click "Allow" when prompted for microphone access
2. Start speaking; the model listens and responds in real time
3. The conversation flows naturally with minimal latency

**Features:**

- 🎙️ Real-time WebRTC streaming
- ⚡ Low-latency responses
- 💬 Interleaved text and audio output
- 🔄 Multi-turn conversations
""")
    chat_state = gr.State(ChatState(processor))
    with gr.Row():
        with gr.Column():
            webrtc = WebRTC(
                modality="audio",
                mode="send-receive",
                full_screen=False,
            )
            with gr.Row():
                temperature = gr.Slider(
                    minimum=0,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Temperature (0 for greedy)",
                    info="Higher = more creative",
                )
                top_k = gr.Slider(
                    minimum=0,
                    maximum=100,
                    value=4,
                    step=1,
                    label="Top-k (0 for no filtering)",
                    info="Sampling diversity",
                )
            clear_btn = gr.Button("Reset Chat")
        with gr.Column():
            text_out = gr.Textbox(
                lines=10,
                label="Conversation Text",
                interactive=False,
            )
    gr.Markdown("""
### About this demo

This demo uses **fastrtc** for WebRTC streaming, enabling real-time speech-to-speech interaction with minimal latency.
The model processes your speech and generates both text and audio responses simultaneously.

**Model**: LFM2-Audio-1.5B by Liquid AI
**Mode**: Interleaved generation (optimized for real-time use)
**Audio Codec**: Mimi (24 kHz)

[Liquid AI](https://www.liquid.ai/) | [GitHub](https://github.com/Liquid4All/liquid-audio/) | [Model Card](https://huggingface.co/LiquidAI/LFM2-Audio-1.5B)
""")
    # Set up WebRTC streaming
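    # ReplyOnPause buffers incoming microphone audio and calls chat_response
    # once its voice-activity detection decides the speaker has paused, so each
    # utterance becomes one generation turn; can_interrupt=False keeps new
    # speech from cutting off a reply that is still being streamed.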
    webrtc.stream(
        ReplyOnPause(
            chat_response,  # type: ignore[arg-type]
            input_sample_rate=24_000,
            output_sample_rate=24_000,
            can_interrupt=False,
        ),
        inputs=[webrtc, chat_state, temperature, top_k],
        outputs=[webrtc],
    )
    webrtc.on_additional_outputs(
        lambda s: s,
        outputs=[text_out],
    )
    clear_btn.click(clear, outputs=[chat_state, text_out])

if __name__ == "__main__":
    demo.launch()
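
# To run locally (assuming the fastrtc and liquid-audio dependencies are
# installed), launch with `python app.py`; demo.launch() serves the app on
# http://localhost:7860 by default. To expose a temporary public link instead,
# one could call, e.g.:
#   demo.launch(share=True)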