Rcarvalo commited on
Commit
71c51fd
Β·
verified Β·
1 Parent(s): 099ceaf

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +213 -0
app.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""
Gradio app for LFM2-Audio speech-to-speech demo
Compatible with Hugging Face Spaces
"""

import gradio as gr
import numpy as np
import torch
import torchaudio

from liquid_audio import ChatState, LFM2AudioModel, LFM2AudioProcessor, LFMModality

# Hugging Face repo that provides both the model weights and the processor.
HF_REPO = "LiquidAI/LFM2-Audio-1.5B"

# Select the compute device up front; every module below is placed on it.
device = "cuda" if torch.cuda.is_available() else "cpu"

print("Loading processor...")
processor = LFM2AudioProcessor.from_pretrained(HF_REPO).eval()
print("Loading model...")
model = LFM2AudioModel.from_pretrained(HF_REPO).eval().to(device)
print("Loading audio codec...")
# The Mimi audio codec ships inside the processor; used later to decode
# generated audio tokens back into a waveform.
mimi = processor.mimi.eval().to(device)

print(f"Models loaded on {device}")

+ def generate_response(audio_input, temperature, top_k, chat_state):
32
+ """Generate speech-to-speech response"""
33
+
34
+ if audio_input is None:
35
+ return None, "Please record audio first", chat_state
36
+
37
+ # Parse audio input
38
+ rate, wav = audio_input
39
+
40
+ # Convert to torch tensor
41
+ if wav.dtype == np.int16:
42
+ wav_tensor = torch.tensor(wav / 32768.0, dtype=torch.float32)
43
+ else:
44
+ wav_tensor = torch.tensor(wav, dtype=torch.float32)
45
+
46
+ # Ensure mono
47
+ if len(wav_tensor.shape) > 1:
48
+ wav_tensor = wav_tensor.mean(dim=-1)
49
+
50
+ # Initialize chat state if empty
51
+ if len(chat_state.text) == 1:
52
+ chat_state.new_turn("system")
53
+ chat_state.add_text("Respond with interleaved text and audio.")
54
+ chat_state.end_turn()
55
+
56
+ # Add user audio
57
+ chat_state.new_turn("user")
58
+ chat_state.add_audio(wav_tensor, rate)
59
+ chat_state.end_turn()
60
+
61
+ # Start assistant turn
62
+ chat_state.new_turn("assistant")
63
+
64
+ # Set generation parameters
65
+ temp = None if temperature == 0 else float(temperature)
66
+ topk = None if top_k == 0 else int(top_k)
67
+
68
+ # Generate response
69
+ text_out = []
70
+ audio_out = []
71
+ modality_out = []
72
+
73
+ full_text = ""
74
+
75
+ print("Generating response...")
76
+ with torch.no_grad():
77
+ for t in model.generate_interleaved(
78
+ **chat_state,
79
+ max_new_tokens=1024,
80
+ audio_temperature=temp,
81
+ audio_top_k=topk,
82
+ ):
83
+ if t.numel() == 1: # Text token
84
+ text_out.append(t)
85
+ modality_out.append(LFMModality.TEXT)
86
+ decoded = processor.text.decode(t)
87
+ full_text += decoded
88
+ print(decoded, end="", flush=True)
89
+ elif t.numel() == 8: # Audio token
90
+ audio_out.append(t)
91
+ modality_out.append(LFMModality.AUDIO_OUT)
92
+
93
+ print("\nGeneration complete")
94
+
95
+ # Clean up text
96
+ full_text = full_text.replace("<|text_end|>", "").strip()
97
+
98
+ # Decode audio (remove last end-of-audio token)
99
+ if len(audio_out) > 1:
100
+ mimi_codes = torch.stack(audio_out[:-1], 1).unsqueeze(0).to(device)
101
+ with torch.no_grad():
102
+ waveform = mimi.decode(mimi_codes)[0]
103
+
104
+ # Convert to numpy for Gradio
105
+ audio_np = waveform.cpu().numpy()
106
+ audio_output = (24000, audio_np.T) # Gradio expects (rate, data)
107
+ else:
108
+ audio_output = None
109
+
110
+ # Update chat state
111
+ if text_out and audio_out:
112
+ chat_state.append(
113
+ text=torch.stack(text_out, 1),
114
+ audio_out=torch.stack(audio_out, 1),
115
+ modality_flag=torch.tensor(modality_out, device=device),
116
+ )
117
+
118
+ chat_state.end_turn()
119
+ chat_state.new_turn("user")
120
+
121
+ return audio_output, full_text, chat_state
122
+
123
+
124
+ def reset_chat():
125
+ """Reset chat state"""
126
+ return ChatState(processor), "", None
127
+
128
+
129
# Create Gradio interface. Component declaration order defines the layout,
# so statements here must stay in sequence.
with gr.Blocks(title="LFM2-Audio Speech-to-Speech") as demo:
    gr.Markdown("""
    # LFM2-Audio Speech-to-Speech Chat

    Talk to LFM2-Audio! Record your voice and get a response with both text and audio.

    **How to use:**
    1. Click the microphone button to record your voice
    2. Adjust temperature and top-k parameters if needed (or leave defaults)
    3. Click "Generate Response"
    4. Listen to the audio response and read the text transcription

    **Note:** This model runs on GPU. If you experience long wait times, the Space might be on CPU or heavily loaded.
    """)

    # Per-session conversation state, threaded through generate_response.
    chat_state = gr.State(ChatState(processor))

    with gr.Row():
        # Left column: input controls.
        with gr.Column():
            audio_input = gr.Audio(
                sources=["microphone"],
                type="numpy",
                label="Record your voice"
            )

            with gr.Row():
                # 0 is a sentinel meaning "greedy" (see generate_response).
                temperature = gr.Slider(
                    minimum=0,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Temperature (0 for greedy)",
                    info="Higher = more creative, lower = more deterministic"
                )
                # 0 is a sentinel meaning "no top-k filtering".
                top_k = gr.Slider(
                    minimum=0,
                    maximum=100,
                    value=4,
                    step=1,
                    label="Top-k (0 for no filtering)",
                    info="Number of top tokens to sample from"
                )

            generate_btn = gr.Button("Generate Response", variant="primary")
            reset_btn = gr.Button("Reset Chat")

        # Right column: model outputs.
        with gr.Column():
            text_output = gr.Textbox(
                label="Assistant Response (Text)",
                lines=4,
                interactive=False
            )
            audio_output = gr.Audio(
                label="Assistant Response (Audio)",
                type="numpy",
                interactive=False
            )

    gr.Markdown("""
    ### About LFM2-Audio

    LFM2-Audio-1.5B is Liquid AI's first end-to-end audio foundation model. It supports:
    - Real-time speech-to-speech conversations
    - Low-latency interleaved text and audio generation
    - Natural flowing conversations

    [Learn more](https://www.liquid.ai/) | [GitHub](https://github.com/Liquid4All/liquid-audio/)
    """)

    # Event handlers
    generate_btn.click(
        fn=generate_response,
        inputs=[audio_input, temperature, top_k, chat_state],
        outputs=[audio_output, text_output, chat_state]
    )

    # reset_chat takes no inputs; it just repopulates the three outputs.
    reset_btn.click(
        fn=reset_chat,
        outputs=[chat_state, text_output, audio_output]
    )


if __name__ == "__main__":
    demo.launch()