Kuangwei Chen committed
Commit 21ee9bf · 1 parent: 0b476f3

Fix frontend

Files changed (1):
  1. app.py +66 -117
app.py CHANGED
@@ -25,7 +25,7 @@ except ImportError:
 
     spaces = _SpacesFallback()
 
-from nano_tts_runtime import DEFAULT_VOICE, NanoTTSService, build_default_voice_presets
+from nano_tts_runtime import DEFAULT_VOICE, NanoTTSService
 from text_normalization_pipeline import WeTextProcessingManager, prepare_tts_request_texts
 
 APP_DIR = Path(__file__).resolve().parent
@@ -37,8 +37,6 @@ DEMO_METADATA_PATH = APP_DIR / "assets" / "demo.jsonl"
 
 MODE_VOICE_CLONE = "voice_clone"
 
-_VOICE_PRESETS = build_default_voice_presets()
-
 
 @dataclass(frozen=True)
 class DemoEntry:
@@ -88,38 +86,15 @@ def load_demo_entries() -> list[DemoEntry]:
     return demo_entries
 
 
-def build_voice_choices() -> list[tuple[str, str]]:
-    preferred: list[tuple[str, str]] = []
-    fallback: list[tuple[str, str]] = []
-
-    for preset in _VOICE_PRESETS.values():
-        if not preset.prompt_audio_path.is_file():
-            continue
-
-        item = (f"{preset.name} - {preset.description}", preset.name)
-        fallback.append(item)
-        if preset.prompt_audio_path.suffix.lower() == ".wav":
-            preferred.append(item)
-
-    return preferred or fallback
-
-
-VOICE_CHOICES = build_voice_choices()
-DEFAULT_VOICE_VALUE = (
-    DEFAULT_VOICE
-    if any(value == DEFAULT_VOICE for _, value in VOICE_CHOICES)
-    else (VOICE_CHOICES[0][1] if VOICE_CHOICES else "")
-)
 DEMO_ENTRIES = load_demo_entries()
 DEMO_ENTRY_MAP = {entry.demo_id: entry for entry in DEMO_ENTRIES}
 DEMO_AUDIO_PATH_MAP = {str(entry.prompt_audio_path): entry for entry in DEMO_ENTRIES}
 DEMO_ENTRY_NAME_MAP = {entry.name: entry for entry in DEMO_ENTRIES}
-VOICE_FILE_TO_NAME = {
-    preset.prompt_audio_path.name: preset.name
-    for preset in _VOICE_PRESETS.values()
-    if preset.prompt_audio_path.is_file()
-}
-DEMO_CASE_CHOICES = [("Custom Input", "")] + [(entry.name, entry.demo_id) for entry in DEMO_ENTRIES]
+DEFAULT_DEMO_ENTRY = DEMO_ENTRIES[0] if DEMO_ENTRIES else None
+DEFAULT_DEMO_CASE_ID = DEFAULT_DEMO_ENTRY.demo_id if DEFAULT_DEMO_ENTRY is not None else ""
+DEFAULT_DEMO_AUDIO_PATH = str(DEFAULT_DEMO_ENTRY.prompt_audio_path) if DEFAULT_DEMO_ENTRY is not None else ""
+DEFAULT_DEMO_TEXT = DEFAULT_DEMO_ENTRY.text if DEFAULT_DEMO_ENTRY is not None else ""
+DEMO_CASE_CHOICES = [(entry.name, entry.demo_id) for entry in DEMO_ENTRIES]
 
 
 def parse_bool_env(name: str, default: bool) -> bool:
@@ -196,61 +171,69 @@ def preload_service() -> None:
 def render_mode_hint() -> str:
     return (
         "Current mode: **Voice Clone** \n"
-        "Upload a reference audio file or use a built-in preset voice. Audio is returned only after full decoding."
+        "Select a Default Case or upload your own reference audio. Uploaded audio overrides the selected Default Case."
     )
 
 
-def resolve_default_prompt_audio_path(voice: str | None) -> str | None:
-    if voice and voice in _VOICE_PRESETS:
-        preset_path = _VOICE_PRESETS[voice].prompt_audio_path
-        if preset_path.is_file():
-            return str(preset_path)
-    return None
+def resolve_default_demo_entry() -> DemoEntry | None:
+    return DEFAULT_DEMO_ENTRY
+
+
+def resolve_selected_demo_entry(demo_case_id: str | None) -> DemoEntry | None:
+    normalized_demo_case_id = normalize_demo_case_id(demo_case_id)
+    if normalized_demo_case_id:
+        demo_entry = DEMO_ENTRY_MAP.get(normalized_demo_case_id)
+        if demo_entry is not None:
+            return demo_entry
+    return resolve_default_demo_entry()
 
 
 def resolve_effective_prompt_audio_path(
-    *,
-    voice: str,
     prompt_audio_path: str | None,
     selected_demo_audio_path: str | None,
 ) -> str | None:
     if prompt_audio_path:
-        return str(Path(prompt_audio_path).expanduser().resolve())
+        resolved_path = Path(prompt_audio_path).expanduser().resolve()
+        if resolved_path.is_file():
+            return str(resolved_path)
     if selected_demo_audio_path:
-        return str(Path(selected_demo_audio_path).expanduser().resolve())
-    return resolve_default_prompt_audio_path(voice)
+        resolved_path = Path(selected_demo_audio_path).expanduser().resolve()
+        if resolved_path.is_file():
+            return str(resolved_path)
+    demo_entry = resolve_default_demo_entry()
+    if demo_entry is not None:
+        return str(demo_entry.prompt_audio_path)
+    return None
 
 
 def build_prompt_source_text(
     *,
-    voice: str,
     prompt_audio_path: str | None,
     selected_demo_audio_path: str | None,
 ) -> str:
-    if prompt_audio_path:
-        return "Uploaded reference audio"
-    if selected_demo_audio_path:
-        demo_entry = DEMO_AUDIO_PATH_MAP.get(str(Path(selected_demo_audio_path).expanduser().resolve()))
+    effective_prompt_audio_path = resolve_effective_prompt_audio_path(
+        prompt_audio_path,
+        selected_demo_audio_path,
+    )
+    if effective_prompt_audio_path:
+        if prompt_audio_path:
+            return f"Uploaded reference audio: {Path(effective_prompt_audio_path).name}"
+        demo_entry = DEMO_AUDIO_PATH_MAP.get(effective_prompt_audio_path)
         if demo_entry is not None:
-            return f"Example case: {demo_entry.name}"
-        return f"Example case: {Path(selected_demo_audio_path).name}"
-    if voice:
-        return f"Preset voice: {voice}"
-    return "No prompt source selected"
+            return f"Default case: {demo_entry.name}"
+        return f"Default case: {Path(effective_prompt_audio_path).name}"
+    return "No default case available"
 
 
 def refresh_prompt_preview(
-    voice: str,
     prompt_audio_path: str | None,
     selected_demo_audio_path: str | None,
 ):
     preview_path = resolve_effective_prompt_audio_path(
-        voice=voice,
-        prompt_audio_path=prompt_audio_path,
-        selected_demo_audio_path=selected_demo_audio_path,
+        prompt_audio_path,
+        selected_demo_audio_path,
     )
     prompt_source = build_prompt_source_text(
-        voice=voice,
         prompt_audio_path=prompt_audio_path,
        selected_demo_audio_path=selected_demo_audio_path,
     )
@@ -259,46 +242,27 @@ def refresh_prompt_preview(
 
 def apply_demo_case_selection(
     demo_case_id: str,
-    voice: str,
     prompt_audio_path: str | None,
 ):
-    demo_case_id = normalize_demo_case_id(demo_case_id)
-    if not demo_case_id:
-        preview_path, prompt_source = refresh_prompt_preview(voice, prompt_audio_path, "")
-        return (
-            gr.update(),
-            gr.update(),
-            preview_path,
-            "",
-            gr.update(),
-            prompt_source,
-        )
-
-    demo_entry = DEMO_ENTRY_MAP.get(demo_case_id)
+    demo_entry = resolve_selected_demo_entry(demo_case_id)
     if demo_entry is None:
-        preview_path, prompt_source = refresh_prompt_preview(voice, prompt_audio_path, "")
+        preview_path, prompt_source = refresh_prompt_preview(prompt_audio_path, "")
         return (
-            gr.update(),
            gr.update(),
            preview_path,
            "",
-            gr.update(),
            prompt_source,
        )
 
-    matched_voice = VOICE_FILE_TO_NAME.get(demo_entry.prompt_audio_path.name)
-    preview_path = str(demo_entry.prompt_audio_path)
-    prompt_source = build_prompt_source_text(
-        voice=matched_voice or voice,
-        prompt_audio_path=None,
-        selected_demo_audio_path=preview_path,
+    selected_prompt_path = str(demo_entry.prompt_audio_path)
+    preview_path, prompt_source = refresh_prompt_preview(
+        prompt_audio_path,
+        selected_prompt_path,
     )
     return (
        demo_entry.text,
-        gr.update(value=None),
-        preview_path,
        preview_path,
+        selected_prompt_path,
        prompt_source,
    )
 
@@ -313,8 +277,8 @@ def validate_request(
     if not normalized_text:
         raise ValueError("Please enter text to synthesize.")
 
-    if not effective_prompt_audio_path and not DEFAULT_VOICE_VALUE:
-        raise ValueError("No preset voice is available. Please upload a reference audio file.")
+    if not effective_prompt_audio_path:
+        raise ValueError("No reference audio is available. Please select a Default Case or upload prompt audio.")
 
     return normalized_text
 
@@ -341,9 +305,9 @@ def estimate_gpu_duration(
     **kwargs,
 ) -> int:
     text = kwargs.get("text", args[0] if len(args) > 0 else "")
-    max_new_frames = kwargs.get("max_new_frames", args[6] if len(args) > 6 else 375)
+    max_new_frames = kwargs.get("max_new_frames", args[5] if len(args) > 5 else 375)
     voice_clone_max_text_tokens = (
-        kwargs.get("voice_clone_max_text_tokens", args[7] if len(args) > 7 else 75)
+        kwargs.get("voice_clone_max_text_tokens", args[6] if len(args) > 6 else 75)
     )
     text_len = len(str(text or "").strip())
     estimated = 75 + (text_len // 12) + int(max_new_frames) // 8 + int(voice_clone_max_text_tokens) // 10
@@ -353,7 +317,6 @@
 @spaces.GPU(size="large", duration=estimate_gpu_duration)
 def run_inference(
     text: str,
-    voice: str,
     prompt_audio_path: str | None,
     selected_demo_audio_path: str | None,
     enable_wetext_processing: bool,
@@ -375,9 +338,8 @@ def run_inference(
     service = get_runtime_tts_service()
     text_normalizer_manager = get_text_normalizer_manager() if enable_wetext_processing else None
     effective_prompt_audio_path = resolve_effective_prompt_audio_path(
-        voice=voice,
-        prompt_audio_path=prompt_audio_path,
-        selected_demo_audio_path=selected_demo_audio_path,
+        prompt_audio_path,
+        selected_demo_audio_path,
     )
     normalized_text = validate_request(
         text=text,
@@ -386,13 +348,12 @@
     prepared_texts = prepare_tts_request_texts(
         text=normalized_text,
         prompt_text="",
-        voice=voice,
+        voice=DEFAULT_VOICE,
         enable_wetext=bool(enable_wetext_processing),
         enable_normalize_tts_text=bool(enable_normalize_tts_text),
         text_normalizer_manager=text_normalizer_manager,
     )
     prompt_source = build_prompt_source_text(
-        voice=voice,
        prompt_audio_path=prompt_audio_path,
        selected_demo_audio_path=selected_demo_audio_path,
    )
@@ -405,7 +366,7 @@
     result = service.synthesize(
         text=str(prepared_texts["text"]),
         mode=MODE_VOICE_CLONE,
-        voice=voice,
+        voice=DEFAULT_VOICE,
        prompt_audio_path=effective_prompt_audio_path or None,
        max_new_frames=int(max_new_frames),
        voice_clone_max_text_tokens=int(voice_clone_max_text_tokens),
@@ -456,7 +417,7 @@ def build_demo():
         with gr.Column(scale=3):
             demo_case = gr.Dropdown(
                 choices=DEMO_CASE_CHOICES,
-                value="",
+                value=DEFAULT_DEMO_CASE_ID,
                label="Default Case",
                info="Select a built-in case to auto-fill the text and prompt preview.",
                allow_custom_value=True,
@@ -464,29 +425,24 @@
             text = gr.Textbox(
                 label="Target Text",
                 lines=10,
+                value=DEFAULT_DEMO_TEXT,
                 placeholder="Enter the text to synthesize.",
             )
             mode_hint = gr.Markdown(render_mode_hint())
-            voice = gr.Dropdown(
-                choices=VOICE_CHOICES,
-                value=DEFAULT_VOICE_VALUE,
-                label="Preset Voice",
-                info="Used by default when no reference audio is uploaded.",
-            )
             prompt_audio = gr.Audio(
-                label="Reference Audio Upload (optional; overrides preset voice)",
+                label="Reference Audio Upload (optional; overrides Default Case)",
                type="filepath",
                sources=["upload"],
            )
            prompt_preview = gr.Audio(
                label="Effective Prompt Preview",
-                value=resolve_default_prompt_audio_path(DEFAULT_VOICE_VALUE),
+                value=DEFAULT_DEMO_AUDIO_PATH or None,
                type="filepath",
                interactive=False,
            )
 
            gr.Markdown(
-                "Runtime device and backbone are fixed by the Space and are not user-configurable. Uploaded reference audio overrides the selected example case."
+                "Runtime device and backbone are fixed by the Space and are not user-configurable. Uploaded reference audio overrides the selected Default Case."
            )
 
            with gr.Accordion("Advanced Parameters", open=False):
@@ -580,28 +536,22 @@ def build_demo():
             prompt_source = gr.Textbox(
                 label="Prompt Source",
                 value=build_prompt_source_text(
-                    voice=DEFAULT_VOICE_VALUE,
                     prompt_audio_path=None,
-                    selected_demo_audio_path=None,
+                    selected_demo_audio_path=DEFAULT_DEMO_AUDIO_PATH or None,
                ),
                lines=4,
                interactive=False,
            )
-        selected_demo_audio_path = gr.State("")
+        selected_demo_audio_path = gr.State(DEFAULT_DEMO_AUDIO_PATH)
 
        demo_case.change(
            fn=apply_demo_case_selection,
-            inputs=[demo_case, voice, prompt_audio],
-            outputs=[text, prompt_audio, prompt_preview, selected_demo_audio_path, voice, prompt_source],
-        )
-        voice.change(
-            fn=refresh_prompt_preview,
-            inputs=[voice, prompt_audio, selected_demo_audio_path],
-            outputs=[prompt_preview, prompt_source],
+            inputs=[demo_case, prompt_audio],
+            outputs=[text, prompt_preview, selected_demo_audio_path, prompt_source],
        )
        prompt_audio.change(
            fn=refresh_prompt_preview,
-            inputs=[voice, prompt_audio, selected_demo_audio_path],
+            inputs=[prompt_audio, selected_demo_audio_path],
            outputs=[prompt_preview, prompt_source],
        )
 
@@ -609,7 +559,6 @@
             fn=run_inference,
             inputs=[
                 text,
-                voice,
                prompt_audio,
                selected_demo_audio_path,
                enable_wetext_processing,
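
For reference, the prompt-resolution order introduced by this commit is: uploaded reference audio, then the selected Default Case, then the first demo entry, then nothing. The sketch below restates that priority in isolation; `resolve_prompt` and the example path are placeholders for illustration, not the Space's actual helper or assets.

```python
# Minimal sketch of the new fallback order (hypothetical helper, not app.py's code):
# uploaded reference audio -> selected demo case -> first demo entry -> None.
from pathlib import Path


def resolve_prompt(prompt_audio_path, selected_demo_audio_path, default_demo_audio_path):
    # Uploaded audio and the selected demo case are only used if the file exists on disk,
    # mirroring the is_file() checks added to resolve_effective_prompt_audio_path.
    for candidate in (prompt_audio_path, selected_demo_audio_path):
        if candidate:
            resolved = Path(candidate).expanduser().resolve()
            if resolved.is_file():
                return str(resolved)
    # Otherwise fall back to the default demo entry's audio, if any.
    return default_demo_audio_path or None


# With no upload and no selection, the first demo entry's audio wins (placeholder path).
print(resolve_prompt(None, None, "assets/demo/first_case.wav"))
```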