abidlabs HF Staff committed on
Commit
68ae9a5
·
verified ·
1 Parent(s): 8175488

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +123 -126
app.py CHANGED
@@ -55,15 +55,27 @@ tokenizer = None
55
  snac_model = None
56
  models_loaded = False
57
 
 
58
  def build_prompt(tokenizer, description: str, text: str) -> str:
59
- """Build formatted prompt for Maya1."""
 
 
 
 
 
 
 
 
 
 
 
60
  soh_token = tokenizer.decode([SOH_ID])
61
  eoh_token = tokenizer.decode([EOH_ID])
62
  soa_token = tokenizer.decode([SOA_ID])
63
  sos_token = tokenizer.decode([CODE_START_TOKEN_ID])
64
  eot_token = tokenizer.decode([TEXT_EOT_ID])
65
  bos_token = tokenizer.bos_token
66
-
67
  formatted_text = f'<description="{description}"> {text}'
68
  prompt = (
69
  soh_token + bos_token + formatted_text + eot_token +
@@ -71,21 +83,33 @@ def build_prompt(tokenizer, description: str, text: str) -> str:
71
  )
72
  return prompt
73
 
 
74
  def unpack_snac_from_7(snac_tokens: list) -> list:
75
- """Unpack 7-token SNAC frames to 3 hierarchical levels."""
 
 
 
 
 
 
 
 
 
 
 
76
  if snac_tokens and snac_tokens[-1] == CODE_END_TOKEN_ID:
77
  snac_tokens = snac_tokens[:-1]
78
-
79
  frames = len(snac_tokens) // 7
80
  snac_tokens = snac_tokens[:frames * 7]
81
-
82
  if frames == 0:
83
  return [[], [], []]
84
-
85
  l1, l2, l3 = [], [], []
86
-
87
  for i in range(frames):
88
- slots = snac_tokens[i*7:(i+1)*7]
89
  l1.append((slots[0] - CODE_TOKEN_OFFSET) % 4096)
90
  l2.extend([
91
  (slots[1] - CODE_TOKEN_OFFSET) % 4096,
@@ -97,220 +121,193 @@ def unpack_snac_from_7(snac_tokens: list) -> list:
97
  (slots[5] - CODE_TOKEN_OFFSET) % 4096,
98
  (slots[6] - CODE_TOKEN_OFFSET) % 4096,
99
  ])
100
-
101
  return [l1, l2, l3]
102
 
 
103
  def load_models():
104
- """Load Maya1 Transformers model (runs once)."""
 
 
 
 
105
  global model, tokenizer, snac_model, models_loaded
106
-
107
  if models_loaded:
108
  return
109
-
110
  print("Loading Maya1 model with Transformers...")
111
  model = AutoModelForCausalLM.from_pretrained(
112
- "maya-research/maya1",
113
- torch_dtype=torch.bfloat16,
114
  device_map="auto",
115
  trust_remote_code=True
116
  )
117
- tokenizer = AutoTokenizer.from_pretrained("maya-research/maya1", trust_remote_code=True)
118
-
 
 
 
119
  print("Loading SNAC decoder...")
120
  snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval()
121
  if torch.cuda.is_available():
122
  snac_model = snac_model.to("cuda")
123
-
124
  models_loaded = True
125
  print("Models loaded successfully!")
126
 
 
127
  def preset_selected(preset_name):
128
- """Update description and text when preset is selected."""
 
 
 
 
 
 
 
 
 
 
129
  if preset_name in PRESET_CHARACTERS:
130
  char = PRESET_CHARACTERS[preset_name]
131
  return char["description"], char["example_text"]
132
  return "", ""
133
 
 
134
  @spaces.GPU
135
  def generate_speech(preset_name, description, text, temperature, max_tokens):
136
- """Generate emotional speech from description and text using Transformers."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  try:
138
- # Load models if not already loaded
139
  load_models()
140
-
141
- # Validate inputs
142
  if not description or not text:
143
  return None, "Error: Please provide both description and text!"
144
-
145
- print(f"Generating with temperature={temperature}, max_tokens={max_tokens}...")
146
-
147
- # Build prompt
148
  prompt = build_prompt(tokenizer, description, text)
149
  inputs = tokenizer(prompt, return_tensors="pt")
150
-
151
  if torch.cuda.is_available():
152
  inputs = {k: v.to("cuda") for k, v in inputs.items()}
153
-
154
- # Generate tokens
155
  with torch.inference_mode():
156
  outputs = model.generate(
157
- **inputs,
158
  max_new_tokens=max_tokens,
159
  min_new_tokens=28,
160
- temperature=temperature,
161
- top_p=0.9,
162
  repetition_penalty=1.1,
163
  do_sample=True,
164
  eos_token_id=CODE_END_TOKEN_ID,
165
  pad_token_id=tokenizer.pad_token_id,
166
  )
167
-
168
- # Extract SNAC tokens
169
- generated_ids = outputs[0, inputs['input_ids'].shape[1]:].tolist()
170
-
171
- # Find EOS and extract SNAC codes
172
  eos_idx = generated_ids.index(CODE_END_TOKEN_ID) if CODE_END_TOKEN_ID in generated_ids else len(generated_ids)
173
  snac_tokens = [t for t in generated_ids[:eos_idx] if SNAC_MIN_ID <= t <= SNAC_MAX_ID]
174
-
175
  if len(snac_tokens) < 7:
176
  return None, "Error: Not enough tokens generated. Try different text or increase max_tokens."
177
-
178
- # Unpack and decode
179
  levels = unpack_snac_from_7(snac_tokens)
180
- frames = len(levels[0])
181
-
182
  device = "cuda" if torch.cuda.is_available() else "cpu"
183
- codes_tensor = [torch.tensor(level, dtype=torch.long, device=device).unsqueeze(0) for level in levels]
184
-
 
 
 
185
  with torch.inference_mode():
186
  z_q = snac_model.quantizer.from_codes(codes_tensor)
187
  audio = snac_model.decoder(z_q)[0, 0].cpu().numpy()
188
-
189
- # Trim warmup
190
  if len(audio) > 2048:
191
  audio = audio[2048:]
192
-
193
- # Convert to WAV and save to temporary file
194
  import tempfile
195
  import soundfile as sf
196
-
197
  audio_int16 = (audio * 32767).astype(np.int16)
198
-
199
- # Create temporary file
200
- with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
201
  tmp_path = tmp_file.name
202
-
203
- # Save audio
204
  sf.write(tmp_path, audio_int16, AUDIO_SAMPLE_RATE)
205
-
206
  duration = len(audio) / AUDIO_SAMPLE_RATE
207
- status_msg = f"Generated {duration:.2f}s of emotional speech!"
208
-
209
- return tmp_path, status_msg
210
-
211
  except Exception as e:
212
  import traceback
213
  error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
214
  print(error_msg)
215
  return None, error_msg
216
 
217
- # Create Gradio interface
 
 
218
  with gr.Blocks(title="Maya1 - Open Source Emotional TTS", theme=gr.themes.Soft()) as demo:
219
  gr.Markdown("""
220
  # Maya1 - Open Source Emotional Text-to-Speech
221
-
222
  **The best open source voice AI model with emotions!**
223
-
224
- Generate realistic and expressive speech with natural language voice design.
225
- Choose a preset character or create your own custom voice.
226
-
227
- [Model](https://huggingface.co/maya-research/maya1) | [GitHub](https://github.com/MayaResearch/maya1-fastapi)
228
  """)
229
-
230
  with gr.Row():
231
  with gr.Column(scale=1):
232
- gr.Markdown("### Character Selection")
233
-
234
  preset_dropdown = gr.Dropdown(
235
  choices=list(PRESET_CHARACTERS.keys()),
236
- label="Preset Characters",
237
  value=list(PRESET_CHARACTERS.keys())[0],
238
- info="Quick pick from 4 preset characters"
239
  )
240
-
241
- gr.Markdown("### Voice Design")
242
-
243
  description_input = gr.Textbox(
244
  label="Voice Description",
245
- placeholder="E.g., Male voice in their 30s with american accent. Normal pitch, warm timbre...",
246
  lines=3,
247
  value=PRESET_CHARACTERS[list(PRESET_CHARACTERS.keys())[0]]["description"]
248
  )
249
-
250
  text_input = gr.Textbox(
251
  label="Text to Speak",
252
- placeholder="Enter text with <emotion> tags like <laugh>, <sigh>, <excited>...",
253
  lines=4,
254
  value=PRESET_CHARACTERS[list(PRESET_CHARACTERS.keys())[0]]["example_text"]
255
  )
256
-
257
- with gr.Accordion("Advanced Settings", open=False):
258
- temperature_slider = gr.Slider(
259
- minimum=0.1,
260
- maximum=1.0,
261
- value=0.4,
262
- step=0.1,
263
- label="Temperature",
264
- info="Lower = more stable, Higher = more creative"
265
- )
266
-
267
- max_tokens_slider = gr.Slider(
268
- minimum=100,
269
- maximum=2048,
270
- value=1500,
271
- step=50,
272
- label="Max Tokens",
273
- info="More tokens = longer audio"
274
- )
275
-
276
- generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
277
-
278
  with gr.Column(scale=1):
279
- gr.Markdown("### Generated Audio")
280
-
281
- audio_output = gr.Audio(
282
- label="Generated Speech",
283
- type="filepath",
284
- interactive=False
285
- )
286
-
287
- status_output = gr.Textbox(
288
- label="Status",
289
- lines=3,
290
- interactive=False
291
- )
292
-
293
- gr.Markdown("""
294
- ### Supported Emotions
295
-
296
- `<angry>` `<chuckle>` `<cry>` `<disappointed>` `<excited>` `<gasp>`
297
- `<giggle>` `<laugh>` `<laugh_harder>` `<sarcastic>` `<sigh>`
298
- `<sing>` `<whisper>`
299
- """)
300
-
301
- # Event handlers
302
  preset_dropdown.change(
303
  fn=preset_selected,
304
- inputs=[preset_dropdown],
305
  outputs=[description_input, text_input]
306
  )
307
-
308
  generate_btn.click(
309
  fn=generate_speech,
310
  inputs=[preset_dropdown, description_input, text_input, temperature_slider, max_tokens_slider],
311
  outputs=[audio_output, status_output]
312
  )
313
 
314
- if __name__ == "__main__":
315
- demo.launch()
316
 
 
 
 
55
  snac_model = None
56
  models_loaded = False
57
 
58
+
59
  def build_prompt(tokenizer, description: str, text: str) -> str:
60
+ """
61
+ Build a formatted prompt for the Maya1 text-to-speech model.
62
+ This function constructs the full input prompt expected by Maya1, including
63
+ special control tokens and a structured description tag that defines voice
64
+ characteristics and emotional delivery.
65
+ Args:
66
+ tokenizer: The tokenizer associated with the Maya1 model.
67
+ description (str): A structured natural-language description of the voice.
68
+ text (str): The text content to be synthesized into speech.
69
+ Returns:
70
+ str: A fully formatted prompt string ready for tokenization and generation.
71
+ """
72
  soh_token = tokenizer.decode([SOH_ID])
73
  eoh_token = tokenizer.decode([EOH_ID])
74
  soa_token = tokenizer.decode([SOA_ID])
75
  sos_token = tokenizer.decode([CODE_START_TOKEN_ID])
76
  eot_token = tokenizer.decode([TEXT_EOT_ID])
77
  bos_token = tokenizer.bos_token
78
+
79
  formatted_text = f'<description="{description}"> {text}'
80
  prompt = (
81
  soh_token + bos_token + formatted_text + eot_token +
 
83
  )
84
  return prompt
85
 
86
+
87
  def unpack_snac_from_7(snac_tokens: list) -> list:
88
+ """
89
+ Unpack SNAC tokens from 7-token frames into hierarchical code levels.
90
+ This function converts a flat list of SNAC token IDs produced by the model
91
+ into three hierarchical code streams required by the SNAC decoder.
92
+ Args:
93
+ snac_tokens (list): A list of integer SNAC token IDs generated by the model.
94
+ Returns:
95
+ list:
96
+ - level_1 (list[int]): Coarse acoustic codes.
97
+ - level_2 (list[int]): Mid-level acoustic codes.
98
+ - level_3 (list[int]): Fine-grained acoustic codes.
99
+ """
100
  if snac_tokens and snac_tokens[-1] == CODE_END_TOKEN_ID:
101
  snac_tokens = snac_tokens[:-1]
102
+
103
  frames = len(snac_tokens) // 7
104
  snac_tokens = snac_tokens[:frames * 7]
105
+
106
  if frames == 0:
107
  return [[], [], []]
108
+
109
  l1, l2, l3 = [], [], []
110
+
111
  for i in range(frames):
112
+ slots = snac_tokens[i * 7:(i + 1) * 7]
113
  l1.append((slots[0] - CODE_TOKEN_OFFSET) % 4096)
114
  l2.extend([
115
  (slots[1] - CODE_TOKEN_OFFSET) % 4096,
 
121
  (slots[5] - CODE_TOKEN_OFFSET) % 4096,
122
  (slots[6] - CODE_TOKEN_OFFSET) % 4096,
123
  ])
124
+
125
  return [l1, l2, l3]
126
 
127
+
128
  def load_models():
129
+ """
130
+ Load the Maya1 language model, tokenizer, and SNAC audio decoder.
131
+ This function performs one-time initialization of all required models.
132
+ Subsequent calls are no-ops to avoid reloading large model weights.
133
+ """
134
  global model, tokenizer, snac_model, models_loaded
135
+
136
  if models_loaded:
137
  return
138
+
139
  print("Loading Maya1 model with Transformers...")
140
  model = AutoModelForCausalLM.from_pretrained(
141
+ "maya-research/maya1",
142
+ torch_dtype=torch.bfloat16,
143
  device_map="auto",
144
  trust_remote_code=True
145
  )
146
+ tokenizer = AutoTokenizer.from_pretrained(
147
+ "maya-research/maya1",
148
+ trust_remote_code=True
149
+ )
150
+
151
  print("Loading SNAC decoder...")
152
  snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").eval()
153
  if torch.cuda.is_available():
154
  snac_model = snac_model.to("cuda")
155
+
156
  models_loaded = True
157
  print("Models loaded successfully!")
158
 
159
+
160
  def preset_selected(preset_name):
161
+ """
162
+ Update the voice description and example text based on a preset selection.
163
+ This function is used as a Gradio event handler to populate UI fields when
164
+ a preset character is chosen.
165
+ Args:
166
+ preset_name (str): The name of the selected preset character.
167
+ Returns:
168
+ tuple:
169
+ - description (str): The preset voice description.
170
+ - example_text (str): The preset example dialogue.
171
+ """
172
  if preset_name in PRESET_CHARACTERS:
173
  char = PRESET_CHARACTERS[preset_name]
174
  return char["description"], char["example_text"]
175
  return "", ""
176
 
177
+
178
  @spaces.GPU
179
  def generate_speech(preset_name, description, text, temperature, max_tokens):
180
+ """
181
+ Generate emotional speech audio from text and voice description.
182
+ This function runs the full Maya1 inference pipeline: prompt construction,
183
+ token generation, SNAC code extraction, audio decoding, and WAV export.
184
+ It is designed to be called directly from a Gradio interface.
185
+ Args:
186
+ preset_name (str): Name of the selected preset character.
187
+ description (str): Natural-language voice design description.
188
+ text (str): Input text containing optional emotion tags.
189
+ temperature (float): Sampling temperature controlling creativity.
190
+ max_tokens (int): Maximum number of tokens to generate.
191
+ Returns:
192
+ tuple:
193
+ - audio_path (str or None): Path to the generated WAV file.
194
+ - status_message (str): Success or error message.
195
+ """
196
  try:
 
197
  load_models()
198
+
 
199
  if not description or not text:
200
  return None, "Error: Please provide both description and text!"
201
+
 
 
 
202
  prompt = build_prompt(tokenizer, description, text)
203
  inputs = tokenizer(prompt, return_tensors="pt")
204
+
205
  if torch.cuda.is_available():
206
  inputs = {k: v.to("cuda") for k, v in inputs.items()}
207
+
 
208
  with torch.inference_mode():
209
  outputs = model.generate(
210
+ **inputs,
211
  max_new_tokens=max_tokens,
212
  min_new_tokens=28,
213
+ temperature=temperature,
214
+ top_p=0.9,
215
  repetition_penalty=1.1,
216
  do_sample=True,
217
  eos_token_id=CODE_END_TOKEN_ID,
218
  pad_token_id=tokenizer.pad_token_id,
219
  )
220
+
221
+ generated_ids = outputs[0, inputs["input_ids"].shape[1]:].tolist()
 
 
 
222
  eos_idx = generated_ids.index(CODE_END_TOKEN_ID) if CODE_END_TOKEN_ID in generated_ids else len(generated_ids)
223
  snac_tokens = [t for t in generated_ids[:eos_idx] if SNAC_MIN_ID <= t <= SNAC_MAX_ID]
224
+
225
  if len(snac_tokens) < 7:
226
  return None, "Error: Not enough tokens generated. Try different text or increase max_tokens."
227
+
 
228
  levels = unpack_snac_from_7(snac_tokens)
 
 
229
  device = "cuda" if torch.cuda.is_available() else "cpu"
230
+ codes_tensor = [
231
+ torch.tensor(level, dtype=torch.long, device=device).unsqueeze(0)
232
+ for level in levels
233
+ ]
234
+
235
  with torch.inference_mode():
236
  z_q = snac_model.quantizer.from_codes(codes_tensor)
237
  audio = snac_model.decoder(z_q)[0, 0].cpu().numpy()
238
+
 
239
  if len(audio) > 2048:
240
  audio = audio[2048:]
241
+
 
242
  import tempfile
243
  import soundfile as sf
244
+
245
  audio_int16 = (audio * 32767).astype(np.int16)
246
+
247
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
 
248
  tmp_path = tmp_file.name
249
+
 
250
  sf.write(tmp_path, audio_int16, AUDIO_SAMPLE_RATE)
251
+
252
  duration = len(audio) / AUDIO_SAMPLE_RATE
253
+ return tmp_path, f"Generated {duration:.2f}s of emotional speech!"
254
+
 
 
255
  except Exception as e:
256
  import traceback
257
  error_msg = f"Error: {str(e)}\n{traceback.format_exc()}"
258
  print(error_msg)
259
  return None, error_msg
260
 
261
+
262
+ # -------------------- Gradio App --------------------
263
+
264
  with gr.Blocks(title="Maya1 - Open Source Emotional TTS", theme=gr.themes.Soft()) as demo:
265
  gr.Markdown("""
266
  # Maya1 - Open Source Emotional Text-to-Speech
 
267
  **The best open source voice AI model with emotions!**
 
 
 
 
 
268
  """)
269
+
270
  with gr.Row():
271
  with gr.Column(scale=1):
 
 
272
  preset_dropdown = gr.Dropdown(
273
  choices=list(PRESET_CHARACTERS.keys()),
 
274
  value=list(PRESET_CHARACTERS.keys())[0],
275
+ label="Preset Characters"
276
  )
277
+
 
 
278
  description_input = gr.Textbox(
279
  label="Voice Description",
 
280
  lines=3,
281
  value=PRESET_CHARACTERS[list(PRESET_CHARACTERS.keys())[0]]["description"]
282
  )
283
+
284
  text_input = gr.Textbox(
285
  label="Text to Speak",
 
286
  lines=4,
287
  value=PRESET_CHARACTERS[list(PRESET_CHARACTERS.keys())[0]]["example_text"]
288
  )
289
+
290
+ temperature_slider = gr.Slider(0.1, 1.0, 0.4, step=0.1, label="Temperature")
291
+ max_tokens_slider = gr.Slider(100, 2048, 1500, step=50, label="Max Tokens")
292
+
293
+ generate_btn = gr.Button("Generate Speech", variant="primary")
294
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
  with gr.Column(scale=1):
296
+ audio_output = gr.Audio(type="filepath", label="Generated Audio")
297
+ status_output = gr.Textbox(label="Status")
298
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
  preset_dropdown.change(
300
  fn=preset_selected,
301
+ inputs=preset_dropdown,
302
  outputs=[description_input, text_input]
303
  )
304
+
305
  generate_btn.click(
306
  fn=generate_speech,
307
  inputs=[preset_dropdown, description_input, text_input, temperature_slider, max_tokens_slider],
308
  outputs=[audio_output, status_output]
309
  )
310
 
 
 
311
 
312
+ if __name__ == "__main__":
313
+ demo.launch(mcp_server=True)