Spaces:
Runtime error
Runtime error
File size: 6,297 Bytes
71c51fd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 |
"""
Gradio app for LFM2-Audio speech-to-speech demo
Compatible with Hugging Face Spaces
"""
import gradio as gr
import numpy as np
import torch
import torchaudio
from liquid_audio import ChatState, LFM2AudioModel, LFM2AudioProcessor, LFMModality
# Model setup: executed once when the module is imported.
HF_REPO = "LiquidAI/LFM2-Audio-1.5B"

print("Loading processor...")
processor = LFM2AudioProcessor.from_pretrained(HF_REPO)
processor = processor.eval()

print("Loading model...")
model = LFM2AudioModel.from_pretrained(HF_REPO)
model = model.eval()

print("Loading audio codec...")
# The codec used to decode generated audio is taken from the processor.
mimi = processor.mimi
mimi = mimi.eval()

# Prefer the GPU when one is visible; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model = model.to(device)
mimi = mimi.to(device)
print(f"Models loaded on {device}")
def generate_response(audio_input, temperature, top_k, chat_state):
    """Generate a speech-to-speech reply for one user recording.

    Args:
        audio_input: ``(sample_rate, samples)`` tuple as produced by the
            ``gr.Audio(type="numpy")`` input, or ``None`` if nothing was
            recorded.
        temperature: Audio sampling temperature; ``0`` is forwarded to the
            model as ``None`` (labelled "greedy" in the UI).
        top_k: Audio top-k cutoff; ``0`` is forwarded to the model as
            ``None`` (labelled "no filtering" in the UI).
        chat_state: Conversation ``ChatState``. It is mutated in place and
            also returned so Gradio stores the updated state.

    Returns:
        A ``(audio, text, chat_state)`` tuple. ``audio`` is
        ``(24000, samples)`` for the output audio component, or ``None``
        when fewer than two audio frames were generated. ``text`` is the
        decoded assistant text, or a prompt to record audio when the input
        is missing or empty.
    """
    if audio_input is None:
        return None, "Please record audio first", chat_state

    # Parse audio input
    rate, wav = audio_input
    wav = np.asarray(wav)

    # A zero-length recording gives the model nothing to respond to, so
    # handle it like a missing recording instead of adding an empty turn.
    if wav.size == 0:
        return None, "Please record audio first", chat_state

    # Convert to a float32 tensor. Signed integer PCM of any width is scaled
    # by its full range (32768 for int16) so it ends up in [-1, 1); other
    # dtypes are passed through unchanged.
    if np.issubdtype(wav.dtype, np.signedinteger):
        full_scale = float(-int(np.iinfo(wav.dtype).min))
        wav_tensor = torch.tensor(wav / full_scale, dtype=torch.float32)
    else:
        wav_tensor = torch.tensor(wav, dtype=torch.float32)

    # Ensure mono by averaging over the last (channel) axis
    if wav_tensor.dim() > 1:
        wav_tensor = wav_tensor.mean(dim=-1)

    # Initialize chat state if empty.
    # NOTE(review): this relies on len(chat_state.text) being 1 only for a
    # conversation with no turns yet - confirm against liquid_audio.ChatState.
    if len(chat_state.text) == 1:
        chat_state.new_turn("system")
        chat_state.add_text("Respond with interleaved text and audio.")
        chat_state.end_turn()

    # Add user audio
    chat_state.new_turn("user")
    chat_state.add_audio(wav_tensor, rate)
    chat_state.end_turn()

    # Start assistant turn
    chat_state.new_turn("assistant")

    # A slider value of 0 means "disabled" and is passed to the model as None
    temp = None if temperature == 0 else float(temperature)
    topk = None if top_k == 0 else int(top_k)

    # Generate response
    text_out = []
    audio_out = []
    modality_out = []
    text_pieces = []

    print("Generating response...")
    with torch.no_grad():
        for t in model.generate_interleaved(
            **chat_state,
            max_new_tokens=1024,
            audio_temperature=temp,
            audio_top_k=topk,
        ):
            if t.numel() == 1:  # Text token
                text_out.append(t)
                modality_out.append(LFMModality.TEXT)
                decoded = processor.text.decode(t)
                text_pieces.append(decoded)
                print(decoded, end="", flush=True)
            elif t.numel() == 8:  # Audio token (8 values per generated frame)
                audio_out.append(t)
                modality_out.append(LFMModality.AUDIO_OUT)
    print("\nGeneration complete")

    # Clean up text
    full_text = "".join(text_pieces).replace("<|text_end|>", "").strip()

    # Decode audio (remove last end-of-audio token)
    if len(audio_out) > 1:
        mimi_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0).to(device)
        with torch.no_grad():
            # unsqueeze(0) above added a leading dimension; [0] removes it again
            waveform = mimi.decode(mimi_codes)[0]
        # Convert to numpy for Gradio
        audio_np = waveform.cpu().numpy()
        audio_output = (24000, audio_np.T)  # Gradio expects (rate, data)
    else:
        audio_output = None

    # Record the assistant's output in the conversation history
    if text_out and audio_out:
        chat_state.append(
            text=torch.stack(text_out, 1),
            audio_out=torch.stack(audio_out, 1),
            modality_flag=torch.tensor(modality_out, device=device),
        )
    chat_state.end_turn()
    # The next user turn is deliberately NOT opened here: every call opens it
    # itself before adding the recording, so opening it here as well would
    # start the user turn twice on each follow-up request.

    return audio_output, full_text, chat_state
def reset_chat():
    """Start a new conversation and clear both output widgets.

    Returns:
        A ``(chat_state, text, audio)`` tuple: a newly constructed
        ``ChatState`` bound to the global processor, an empty string for the
        text box, and ``None`` for the audio player.
    """
    fresh_state = ChatState(processor)
    cleared_text = ""
    cleared_audio = None
    return fresh_state, cleared_text, cleared_audio
# Static page copy, kept apart from the layout code below.
INTRO_MARKDOWN = """
# LFM2-Audio Speech-to-Speech Chat
Talk to LFM2-Audio! Record your voice and get a response with both text and audio.
**How to use:**
1. Click the microphone button to record your voice
2. Adjust temperature and top-k parameters if needed (or leave defaults)
3. Click "Generate Response"
4. Listen to the audio response and read the text transcription
**Note:** This model runs on GPU. If you experience long wait times, the Space might be on CPU or heavily loaded.
"""

ABOUT_MARKDOWN = """
### About LFM2-Audio
LFM2-Audio-1.5B is Liquid AI's first end-to-end audio foundation model. It supports:
- Real-time speech-to-speech conversations
- Low-latency interleaved text and audio generation
- Natural flowing conversations
[Learn more](https://www.liquid.ai/) | [GitHub](https://github.com/Liquid4All/liquid-audio/)
"""

# Assemble the Gradio page: inputs on the left, assistant outputs on the right.
with gr.Blocks(title="LFM2-Audio Speech-to-Speech") as demo:
    gr.Markdown(INTRO_MARKDOWN)

    # Conversation history handed to and returned from the handlers
    chat_state = gr.State(ChatState(processor))

    with gr.Row():
        # Left column: recording, sampling controls, action buttons
        with gr.Column():
            audio_input = gr.Audio(
                label="Record your voice",
                sources=["microphone"],
                type="numpy",
            )
            with gr.Row():
                temperature = gr.Slider(
                    label="Temperature (0 for greedy)",
                    info="Higher = more creative, lower = more deterministic",
                    minimum=0,
                    maximum=2.0,
                    step=0.1,
                    value=1.0,
                )
                top_k = gr.Slider(
                    label="Top-k (0 for no filtering)",
                    info="Number of top tokens to sample from",
                    minimum=0,
                    maximum=100,
                    step=1,
                    value=4,
                )
            generate_btn = gr.Button("Generate Response", variant="primary")
            reset_btn = gr.Button("Reset Chat")

        # Right column: the assistant's text and audio
        with gr.Column():
            text_output = gr.Textbox(
                label="Assistant Response (Text)",
                interactive=False,
                lines=4,
            )
            audio_output = gr.Audio(
                label="Assistant Response (Audio)",
                interactive=False,
                type="numpy",
            )

    gr.Markdown(ABOUT_MARKDOWN)

    # Wire the buttons to their handlers
    generate_btn.click(
        generate_response,
        [audio_input, temperature, top_k, chat_state],
        [audio_output, text_output, chat_state],
    )
    reset_btn.click(
        reset_chat,
        None,
        [chat_state, text_output, audio_output],
    )

# Start the web server only when executed as a script
if __name__ == "__main__":
    demo.launch()
|