Rcarvalo committed on
Commit
5880918
·
verified ·
1 Parent(s): 6df4eaa

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +129 -141
app.py CHANGED
@@ -1,15 +1,12 @@
1
  """
2
- Real-time WebRTC speech-to-speech demo with fastrtc
3
- Based on the original liquid-audio demo
4
  """
5
 
6
- from queue import Queue
7
- from threading import Thread
8
-
9
  import gradio as gr
10
  import numpy as np
11
  import torch
12
- from fastrtc import AdditionalOutputs, ReplyOnPause, WebRTC
13
 
14
  from liquid_audio import ChatState, LFM2AudioModel, LFM2AudioProcessor, LFMModality
15
 
@@ -31,138 +28,132 @@ mimi = mimi.to(device)
31
  print(f"Models loaded on {device}")
32
 
33
 
34
- def chat_producer(
35
- q: Queue[torch.Tensor | None],
36
- chat: ChatState,
37
- temp: float | None,
38
- topk: int | None,
39
- ):
40
- """Producer thread that generates tokens"""
41
- print(f"Starting generation with state {chat}.")
42
- with torch.no_grad(), mimi.streaming(1):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  for t in model.generate_interleaved(
44
- **chat,
45
  max_new_tokens=1024,
46
  audio_temperature=temp,
47
  audio_top_k=topk,
48
  ):
49
- q.put(t)
50
-
51
- if t.numel() > 1:
52
- if (t == 2048).any():
53
- continue
54
-
55
- wav_chunk = mimi.decode(t[None, :, None])[0]
56
- q.put(wav_chunk)
57
-
58
- q.put(None)
59
 
 
60
 
61
- def chat_response(audio: tuple[int, np.ndarray], _id: str, chat: ChatState, temp: float | None = 1.0, topk: int | None = 4):
62
- """Handle incoming audio and generate streaming response"""
63
- if temp == 0:
64
- temp = None
65
- if topk == 0:
66
- topk = None
67
 
68
- if temp is not None:
69
- temp = float(temp)
70
- if topk is not None:
71
- topk = int(topk)
 
72
 
73
- if len(chat.text) == 1:
74
- chat.new_turn("system")
75
- chat.add_text("Respond with interleaved text and audio.")
76
- chat.end_turn()
 
77
 
78
- chat.new_turn("user")
 
 
 
 
 
 
79
 
80
- rate, wav = audio
81
- # Convert to tensor with proper shape (channels, samples)
82
- wav_tensor = torch.tensor(wav / 32_768, dtype=torch.float)
83
 
84
- # Ensure correct shape
85
- if len(wav_tensor.shape) == 1:
86
- wav_tensor = wav_tensor.unsqueeze(0)
87
- elif len(wav_tensor.shape) > 1:
88
- # If stereo, convert to mono
89
- wav_tensor = wav_tensor.mean(dim=-1, keepdim=True).T
90
-
91
- chat.add_audio(wav_tensor, rate)
92
- chat.end_turn()
93
-
94
- chat.new_turn("assistant")
95
-
96
- q: Queue[torch.Tensor | None] = Queue()
97
- chat_thread = Thread(target=chat_producer, args=(q, chat, temp, topk))
98
- chat_thread.start()
99
-
100
- out_text: list[torch.Tensor] = []
101
- out_audio: list[torch.Tensor] = []
102
- out_modality: list[LFMModality] = []
103
-
104
- while True:
105
- t = q.get()
106
- if t is None:
107
- break
108
- elif t.numel() == 1: # text
109
- out_text.append(t)
110
- out_modality.append(LFMModality.TEXT)
111
- print(processor.text.decode(t), end="")
112
- cur_string = processor.text.decode(torch.cat(out_text)).removesuffix("<|text_end|>")
113
- yield AdditionalOutputs(cur_string)
114
- elif t.numel() == 8:
115
- out_audio.append(t)
116
- out_modality.append(LFMModality.AUDIO_OUT)
117
- elif t.numel() == 1920:
118
- np_chunk = (t.cpu().numpy() * 32_767).astype(np.int16)
119
- yield (24_000, np_chunk)
120
- else:
121
- raise RuntimeError(f"unexpected shape: {t.shape}")
122
-
123
- chat.append(
124
- text=torch.stack(out_text, 1),
125
- audio_out=torch.stack(out_audio, 1),
126
- modality_flag=torch.tensor(out_modality, device=device),
127
- )
128
 
129
- chat.end_turn()
130
- chat.new_turn("user")
131
 
132
-
133
- def clear():
134
- """Clear chat history"""
135
- gr.Info("Cleared chat history", duration=3)
136
- return ChatState(processor), None
137
 
138
 
139
  # Create Gradio interface
140
- with gr.Blocks(title="LFM2-Audio Real-time Speech-to-Speech") as demo:
141
  gr.Markdown("""
142
- # LFM2-Audio Real-time Speech-to-Speech Chat
143
 
144
- **Real-time WebRTC streaming** powered by fastrtc - Talk naturally and get instant responses!
145
 
146
  **How to use:**
147
- 1. Click "Allow" when prompted for microphone access
148
- 2. Start speaking - the model listens and responds in real-time
149
- 3. The conversation flows naturally with minimal latency
150
-
151
- **Features:**
152
- 🎙️ Real-time WebRTC streaming
153
- ⚡ Low latency response
154
- 💬 Interleaved text and audio output
155
- 🔄 Multi-turn conversations
156
  """)
157
 
158
  chat_state = gr.State(ChatState(processor))
159
 
160
  with gr.Row():
161
  with gr.Column():
162
- webrtc = WebRTC(
163
- modality="audio",
164
- mode="send-receive",
165
- full_screen=False,
166
  )
167
 
168
  with gr.Row():
@@ -172,7 +163,7 @@ with gr.Blocks(title="LFM2-Audio Real-time Speech-to-Speech") as demo:
172
  value=1.0,
173
  step=0.1,
174
  label="Temperature (0 for greedy)",
175
- info="Higher = more creative"
176
  )
177
  top_k = gr.Slider(
178
  minimum=0,
@@ -180,50 +171,47 @@ with gr.Blocks(title="LFM2-Audio Real-time Speech-to-Speech") as demo:
180
  value=4,
181
  step=1,
182
  label="Top-k (0 for no filtering)",
183
- info="Sampling diversity"
184
  )
185
 
186
- clear_btn = gr.Button("Reset Chat")
 
187
 
188
  with gr.Column():
189
- text_out = gr.Textbox(
190
- lines=10,
191
- label="Conversation Text",
 
 
 
 
 
192
  interactive=False
193
  )
194
 
195
  gr.Markdown("""
196
- ### About this demo
197
-
198
- This demo uses **fastrtc** for WebRTC streaming, enabling real-time speech-to-speech interaction with minimal latency.
199
- The model processes your speech and generates both text and audio responses simultaneously.
200
 
201
- **Model**: LFM2-Audio-1.5B by Liquid AI
202
- **Mode**: Interleaved generation (optimized for real-time)
203
- **Audio Codec**: Mimi (24kHz)
 
204
 
205
- [Liquid AI](https://www.liquid.ai/) | [GitHub](https://github.com/Liquid4All/liquid-audio/) | [Model Card](https://huggingface.co/LiquidAI/LFM2-Audio-1.5B)
206
  """)
207
 
208
- # Setup WebRTC streaming
209
- webrtc.stream(
210
- ReplyOnPause(
211
- chat_response, # type: ignore[arg-type]
212
- input_sample_rate=24_000,
213
- output_sample_rate=24_000,
214
- can_interrupt=False,
215
- ),
216
- inputs=[webrtc, chat_state, temperature, top_k],
217
- outputs=[webrtc],
218
  )
219
 
220
- webrtc.on_additional_outputs(
221
- lambda s: s,
222
- outputs=[text_out],
223
  )
224
 
225
- clear_btn.click(clear, outputs=[chat_state, text_out])
226
-
227
 
228
  if __name__ == "__main__":
229
- demo.launch()
 
1
  """
2
+ Gradio app for LFM2-Audio speech-to-speech demo
3
+ Compatible with Hugging Face Spaces
4
  """
5
 
 
 
 
6
  import gradio as gr
7
  import numpy as np
8
  import torch
9
+ import torchaudio
10
 
11
  from liquid_audio import ChatState, LFM2AudioModel, LFM2AudioProcessor, LFMModality
12
 
 
28
  print(f"Models loaded on {device}")
29
 
30
 
31
+ def generate_response(audio_input, temperature, top_k, chat_state):
32
+ """Generate speech-to-speech response"""
33
+
34
+ if audio_input is None:
35
+ return None, "Please record audio first", chat_state
36
+
37
+ # Parse audio input
38
+ rate, wav = audio_input
39
+
40
+ # Convert to torch tensor
41
+ if wav.dtype == np.int16:
42
+ wav_tensor = torch.tensor(wav / 32768.0, dtype=torch.float32)
43
+ else:
44
+ wav_tensor = torch.tensor(wav, dtype=torch.float32)
45
+
46
+ # Ensure mono and correct shape (channels, samples)
47
+ if len(wav_tensor.shape) > 1:
48
+ wav_tensor = wav_tensor.mean(dim=-1)
49
+
50
+ # add_audio expects shape (channels, samples), so add channel dimension
51
+ if len(wav_tensor.shape) == 1:
52
+ wav_tensor = wav_tensor.unsqueeze(0)
53
+
54
+ # Initialize chat state if empty
55
+ if len(chat_state.text) == 1:
56
+ chat_state.new_turn("system")
57
+ chat_state.add_text("Respond with interleaved text and audio.")
58
+ chat_state.end_turn()
59
+
60
+ # Add user audio
61
+ chat_state.new_turn("user")
62
+ chat_state.add_audio(wav_tensor, rate)
63
+ chat_state.end_turn()
64
+
65
+ # Start assistant turn
66
+ chat_state.new_turn("assistant")
67
+
68
+ # Set generation parameters
69
+ temp = None if temperature == 0 else float(temperature)
70
+ topk = None if top_k == 0 else int(top_k)
71
+
72
+ # Generate response
73
+ text_out = []
74
+ audio_out = []
75
+ modality_out = []
76
+
77
+ full_text = ""
78
+
79
+ print("Generating response...")
80
+ with torch.no_grad():
81
  for t in model.generate_interleaved(
82
+ **chat_state,
83
  max_new_tokens=1024,
84
  audio_temperature=temp,
85
  audio_top_k=topk,
86
  ):
87
+ if t.numel() == 1: # Text token
88
+ text_out.append(t)
89
+ modality_out.append(LFMModality.TEXT)
90
+ decoded = processor.text.decode(t)
91
+ full_text += decoded
92
+ print(decoded, end="", flush=True)
93
+ elif t.numel() == 8: # Audio token
94
+ audio_out.append(t)
95
+ modality_out.append(LFMModality.AUDIO_OUT)
 
96
 
97
+ print("\nGeneration complete")
98
 
99
+ # Clean up text
100
+ full_text = full_text.replace("<|text_end|>", "").strip()
 
 
 
 
101
 
102
+ # Decode audio (remove last end-of-audio token)
103
+ if len(audio_out) > 1:
104
+ mimi_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0).to(device)
105
+ with torch.no_grad():
106
+ waveform = mimi.decode(mimi_codes)[0]
107
 
108
+ # Convert to numpy for Gradio
109
+ audio_np = waveform.cpu().numpy()
110
+ audio_output = (24000, audio_np.T) # Gradio expects (rate, data)
111
+ else:
112
+ audio_output = None
113
 
114
+ # Update chat state
115
+ if text_out and audio_out:
116
+ chat_state.append(
117
+ text=torch.stack(text_out, 1),
118
+ audio_out=torch.stack(audio_out, 1),
119
+ modality_flag=torch.tensor(modality_out, device=device),
120
+ )
121
 
122
+ chat_state.end_turn()
123
+ chat_state.new_turn("user")
 
124
 
125
+ return audio_output, full_text, chat_state
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
 
 
127
 
128
+ def reset_chat():
129
+ """Reset chat state"""
130
+ return ChatState(processor), "", None
 
 
131
 
132
 
133
  # Create Gradio interface
134
+ with gr.Blocks(title="LFM2-Audio Speech-to-Speech") as demo:
135
  gr.Markdown("""
136
+ # LFM2-Audio Speech-to-Speech Chat
137
 
138
+ Talk to LFM2-Audio! Record your voice and get a response with both text and audio.
139
 
140
  **How to use:**
141
+ 1. Click the microphone button to record your voice
142
+ 2. Adjust temperature and top-k parameters if needed (or leave defaults)
143
+ 3. Click "Generate Response"
144
+ 4. Listen to the audio response and read the text transcription
145
+
146
+ **Note:** This model runs on GPU. If you experience long wait times, the Space might be on CPU or heavily loaded.
 
 
 
147
  """)
148
 
149
  chat_state = gr.State(ChatState(processor))
150
 
151
  with gr.Row():
152
  with gr.Column():
153
+ audio_input = gr.Audio(
154
+ sources=["microphone"],
155
+ type="numpy",
156
+ label="Record your voice"
157
  )
158
 
159
  with gr.Row():
 
163
  value=1.0,
164
  step=0.1,
165
  label="Temperature (0 for greedy)",
166
+ info="Higher = more creative, lower = more deterministic"
167
  )
168
  top_k = gr.Slider(
169
  minimum=0,
 
171
  value=4,
172
  step=1,
173
  label="Top-k (0 for no filtering)",
174
+ info="Number of top tokens to sample from"
175
  )
176
 
177
+ generate_btn = gr.Button("Generate Response", variant="primary")
178
+ reset_btn = gr.Button("Reset Chat")
179
 
180
  with gr.Column():
181
+ text_output = gr.Textbox(
182
+ label="Assistant Response (Text)",
183
+ lines=4,
184
+ interactive=False
185
+ )
186
+ audio_output = gr.Audio(
187
+ label="Assistant Response (Audio)",
188
+ type="numpy",
189
  interactive=False
190
  )
191
 
192
  gr.Markdown("""
193
+ ### About LFM2-Audio
 
 
 
194
 
195
+ LFM2-Audio-1.5B is Liquid AI's first end-to-end audio foundation model. It supports:
196
+ - Real-time speech-to-speech conversations
197
+ - Low-latency interleaved text and audio generation
198
+ - Natural flowing conversations
199
 
200
+ [Learn more](https://www.liquid.ai/) | [GitHub](https://github.com/Liquid4All/liquid-audio/)
201
  """)
202
 
203
+ # Event handlers
204
+ generate_btn.click(
205
+ fn=generate_response,
206
+ inputs=[audio_input, temperature, top_k, chat_state],
207
+ outputs=[audio_output, text_output, chat_state]
 
 
 
 
 
208
  )
209
 
210
+ reset_btn.click(
211
+ fn=reset_chat,
212
+ outputs=[chat_state, text_output, audio_output]
213
  )
214
 
 
 
215
 
216
  if __name__ == "__main__":
217
+ demo.launch()