bharathkumarK committed
Commit cc87d33 · verified · 1 Parent(s): fbd30e2

Update README.md

Files changed (1):
  1. README.md +212 -50

README.md CHANGED
@@ -138,61 +138,223 @@ Real-time voice synthesis with SNAC neural codec (~0.98 kbps). Perfect for:
 ### Quick Start: Generate Voice with Emotions
 
 ```python
+#!/usr/bin/env python3
+
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 from snac import SNAC
 import soundfile as sf
-
-# Load the best open source voice AI model
-model = AutoModelForCausalLM.from_pretrained(
-    "maya-research/maya1",
-    torch_dtype=torch.bfloat16,
-    device_map="auto"
-)
-tokenizer = AutoTokenizer.from_pretrained("maya-research/maya1")
-
-# Load SNAC audio decoder (24kHz)
-snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval().to("cuda")
-
-# Design your voice with natural language
-description = "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing."
-text = "Hello! This is Maya1 <laugh> the best open source voice AI model with emotions."
-
-# Create prompt with voice design
-prompt = f'<description="{description}"> {text}'
-
-# Generate emotional speech
-inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
-with torch.inference_mode():
-    outputs = model.generate(
-        **inputs,
-        max_new_tokens=500,
-        temperature=0.4,
-        top_p=0.9,
-        do_sample=True
+import numpy as np
+
+CODE_START_TOKEN_ID = 128257
+CODE_END_TOKEN_ID = 128258
+CODE_TOKEN_OFFSET = 128266
+SNAC_MIN_ID = 128266
+SNAC_MAX_ID = 156937
+SNAC_TOKENS_PER_FRAME = 7
+
+SOH_ID = 128259
+EOH_ID = 128260
+SOA_ID = 128261
+BOS_ID = 128000
+TEXT_EOT_ID = 128009
+
+
+def build_prompt(tokenizer, description: str, text: str) -> str:
+    """Build formatted prompt for Maya1."""
+    soh_token = tokenizer.decode([SOH_ID])
+    eoh_token = tokenizer.decode([EOH_ID])
+    soa_token = tokenizer.decode([SOA_ID])
+    sos_token = tokenizer.decode([CODE_START_TOKEN_ID])
+    eot_token = tokenizer.decode([TEXT_EOT_ID])
+    bos_token = tokenizer.bos_token
+
+    formatted_text = f'<description="{description}"> {text}'
+
+    prompt = (
+        soh_token + bos_token + formatted_text + eot_token +
+        eoh_token + soa_token + sos_token
     )
-
-# Extract SNAC audio tokens
-generated_ids = outputs[0, inputs['input_ids'].shape[1]:]
-snac_tokens = [t.item() for t in generated_ids if 128266 <= t <= 156937]
-
-# Decode SNAC tokens to audio frames
-frames = len(snac_tokens) // 7
-codes = [[], [], []]
-for i in range(frames):
-    s = snac_tokens[i*7:(i+1)*7]
-    codes[0].append((s[0]-128266) % 4096)
-    codes[1].extend([(s[1]-128266) % 4096, (s[4]-128266) % 4096])
-    codes[2].extend([(s[2]-128266) % 4096, (s[3]-128266) % 4096, (s[5]-128266) % 4096, (s[6]-128266) % 4096])
-
-# Generate final audio with SNAC decoder
-codes_tensor = [torch.tensor(c, dtype=torch.long, device="cuda").unsqueeze(0) for c in codes]
-with torch.inference_mode():
-    audio = snac_model.decoder(snac_model.quantizer.from_codes(codes_tensor))[0, 0].cpu().numpy()
-
-# Save your emotional voice output
-sf.write("output.wav", audio, 24000)
-print("Voice generated successfully! Play output.wav")
+
+    return prompt
+
+
+def extract_snac_codes(token_ids: list) -> list:
+    """Extract SNAC codes from generated tokens."""
+    try:
+        eos_idx = token_ids.index(CODE_END_TOKEN_ID)
+    except ValueError:
+        eos_idx = len(token_ids)
+
+    snac_codes = [
+        token_id for token_id in token_ids[:eos_idx]
+        if SNAC_MIN_ID <= token_id <= SNAC_MAX_ID
+    ]
+
+    return snac_codes
+
+
+def unpack_snac_from_7(snac_tokens: list) -> list:
+    """Unpack 7-token SNAC frames to 3 hierarchical levels."""
+    if snac_tokens and snac_tokens[-1] == CODE_END_TOKEN_ID:
+        snac_tokens = snac_tokens[:-1]
+
+    frames = len(snac_tokens) // SNAC_TOKENS_PER_FRAME
+    snac_tokens = snac_tokens[:frames * SNAC_TOKENS_PER_FRAME]
+
+    if frames == 0:
+        return [[], [], []]
+
+    l1, l2, l3 = [], [], []
+
+    for i in range(frames):
+        slots = snac_tokens[i*7:(i+1)*7]
+        l1.append((slots[0] - CODE_TOKEN_OFFSET) % 4096)
+        l2.extend([
+            (slots[1] - CODE_TOKEN_OFFSET) % 4096,
+            (slots[4] - CODE_TOKEN_OFFSET) % 4096,
+        ])
+        l3.extend([
+            (slots[2] - CODE_TOKEN_OFFSET) % 4096,
+            (slots[3] - CODE_TOKEN_OFFSET) % 4096,
+            (slots[5] - CODE_TOKEN_OFFSET) % 4096,
+            (slots[6] - CODE_TOKEN_OFFSET) % 4096,
+        ])
+
+    return [l1, l2, l3]
+
+
+def main():
+
+    # Load the best open source voice AI model
+    print("\n[1/4] Loading Maya1 model...")
+    model = AutoModelForCausalLM.from_pretrained(
+        "maya-research/maya1",
+        torch_dtype=torch.bfloat16,
+        device_map="auto",
+        trust_remote_code=True
+    )
+    tokenizer = AutoTokenizer.from_pretrained(
+        "maya-research/maya1",
+        trust_remote_code=True
+    )
+    print(f"Model loaded: {len(tokenizer)} tokens in vocabulary")
+
+    # Load SNAC audio decoder (24kHz)
+    print("\n[2/4] Loading SNAC audio decoder...")
+    snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval()
+    if torch.cuda.is_available():
+        snac_model = snac_model.to("cuda")
+    print("SNAC decoder loaded")
+
+    # Design your voice with natural language
+    description = "Realistic male voice in the 30s age with american accent. Normal pitch, warm timbre, conversational pacing."
+    text = "Hello! This is Maya1 <laugh_harder> the best open source voice AI model with emotions."
+
+    print("\n[3/4] Generating speech...")
+    print(f"Description: {description}")
+    print(f"Text: {text}")
+
+    # Create prompt with proper formatting
+    prompt = build_prompt(tokenizer, description, text)
+
+    # Debug: Show prompt details
+    print("\nPrompt preview (first 200 chars):")
+    print(f"  {repr(prompt[:200])}")
+    print(f"  Prompt length: {len(prompt)} chars")
+
+    # Generate emotional speech
+    inputs = tokenizer(prompt, return_tensors="pt")
+    print(f"  Input token count: {inputs['input_ids'].shape[1]} tokens")
+    if torch.cuda.is_available():
+        inputs = {k: v.to("cuda") for k, v in inputs.items()}
+
+    with torch.inference_mode():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=2048,  # Increase to let model finish naturally
+            min_new_tokens=28,  # At least 4 SNAC frames
+            temperature=0.4,
+            top_p=0.9,
+            repetition_penalty=1.1,  # Prevent loops
+            do_sample=True,
+            eos_token_id=CODE_END_TOKEN_ID,  # Stop at end of speech token
+            pad_token_id=tokenizer.pad_token_id,
+        )
+
+    # Extract generated tokens (everything after the input prompt)
+    generated_ids = outputs[0, inputs['input_ids'].shape[1]:].tolist()
+
+    print(f"Generated {len(generated_ids)} tokens")
+
+    # Debug: Check what tokens we got
+    print(f"  First 20 tokens: {generated_ids[:20]}")
+    print(f"  Last 20 tokens: {generated_ids[-20:]}")
+
+    # Check if EOS was generated
+    if CODE_END_TOKEN_ID in generated_ids:
+        eos_position = generated_ids.index(CODE_END_TOKEN_ID)
+        print(f"  EOS token found at position {eos_position}/{len(generated_ids)}")
+
+    # Extract SNAC audio tokens
+    snac_tokens = extract_snac_codes(generated_ids)
+
+    print(f"Extracted {len(snac_tokens)} SNAC tokens")
+
+    # Debug: Analyze token types
+    snac_count = sum(1 for t in generated_ids if SNAC_MIN_ID <= t <= SNAC_MAX_ID)
+    other_count = sum(1 for t in generated_ids if t < SNAC_MIN_ID or t > SNAC_MAX_ID)
+    print(f"  SNAC tokens in output: {snac_count}")
+    print(f"  Other tokens in output: {other_count}")
+
+    # Check for SOS token
+    if CODE_START_TOKEN_ID in generated_ids:
+        sos_pos = generated_ids.index(CODE_START_TOKEN_ID)
+        print(f"  SOS token at position: {sos_pos}")
+    else:
+        print("  No SOS token found in generated output!")
+
+    if len(snac_tokens) < 7:
+        print("Error: Not enough SNAC tokens generated")
+        return
+
+    # Unpack SNAC tokens to 3 hierarchical levels
+    levels = unpack_snac_from_7(snac_tokens)
+    frames = len(levels[0])
+
+    print(f"Unpacked to {frames} frames")
+    print(f"  L1: {len(levels[0])} codes")
+    print(f"  L2: {len(levels[1])} codes")
+    print(f"  L3: {len(levels[2])} codes")
+
+    # Convert to tensors
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    codes_tensor = [
+        torch.tensor(level, dtype=torch.long, device=device).unsqueeze(0)
+        for level in levels
+    ]
+
+    # Generate final audio with SNAC decoder
+    print("\n[4/4] Decoding to audio...")
+    with torch.inference_mode():
+        z_q = snac_model.quantizer.from_codes(codes_tensor)
+        audio = snac_model.decoder(z_q)[0, 0].cpu().numpy()
+
+    # Trim warmup samples (first 2048 samples)
+    if len(audio) > 2048:
+        audio = audio[2048:]
+
+    duration_sec = len(audio) / 24000
+    print(f"Audio generated: {len(audio)} samples ({duration_sec:.2f}s)")
+
+    # Save your emotional voice output
+    output_file = "output.wav"
+    sf.write(output_file, audio, 24000)
+    print("\nVoice generated successfully!")
+
+
+if __name__ == "__main__":
+    main()
 ```
 
 ### Advanced: Production Streaming with vLLM