Spaces:
Running
Running
Kuangwei Chen committed on
Commit ·
21ee9bf
1
Parent(s): 0b476f3
Fix frontend
Browse files
app.py
CHANGED
|
@@ -25,7 +25,7 @@ except ImportError:
|
|
| 25 |
|
| 26 |
spaces = _SpacesFallback()
|
| 27 |
|
| 28 |
-
from nano_tts_runtime import DEFAULT_VOICE, NanoTTSService
|
| 29 |
from text_normalization_pipeline import WeTextProcessingManager, prepare_tts_request_texts
|
| 30 |
|
| 31 |
APP_DIR = Path(__file__).resolve().parent
|
|
@@ -37,8 +37,6 @@ DEMO_METADATA_PATH = APP_DIR / "assets" / "demo.jsonl"
|
|
| 37 |
|
| 38 |
MODE_VOICE_CLONE = "voice_clone"
|
| 39 |
|
| 40 |
-
_VOICE_PRESETS = build_default_voice_presets()
|
| 41 |
-
|
| 42 |
|
| 43 |
@dataclass(frozen=True)
|
| 44 |
class DemoEntry:
|
|
@@ -88,38 +86,15 @@ def load_demo_entries() -> list[DemoEntry]:
|
|
| 88 |
return demo_entries
|
| 89 |
|
| 90 |
|
| 91 |
-
def build_voice_choices() -> list[tuple[str, str]]:
    """Build the (label, value) pairs offered by the preset-voice dropdown.

    Only presets whose prompt audio exists on disk are considered. Presets
    backed by a ``.wav`` prompt are preferred; when none of the usable presets
    is a ``.wav``, every usable preset is offered instead.

    Returns:
        A list of ``("<name> - <description>", "<name>")`` tuples.
    """
    usable_presets = [
        candidate
        for candidate in _VOICE_PRESETS.values()
        if candidate.prompt_audio_path.is_file()
    ]
    every_choice = [
        (f"{candidate.name} - {candidate.description}", candidate.name)
        for candidate in usable_presets
    ]
    wav_only = [
        choice
        for candidate, choice in zip(usable_presets, every_choice)
        if candidate.prompt_audio_path.suffix.lower() == ".wav"
    ]
    return wav_only if wav_only else every_choice
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
# Dropdown choices for preset voices, computed once at import time.
VOICE_CHOICES = build_voice_choices()

# Initial dropdown value: the runtime default voice when it is offered,
# otherwise the first available choice, otherwise an empty string.
if any(DEFAULT_VOICE == pair[1] for pair in VOICE_CHOICES):
    DEFAULT_VOICE_VALUE = DEFAULT_VOICE
elif VOICE_CHOICES:
    DEFAULT_VOICE_VALUE = VOICE_CHOICES[0][1]
else:
    DEFAULT_VOICE_VALUE = ""
|
| 113 |
DEMO_ENTRIES = load_demo_entries()
|
| 114 |
DEMO_ENTRY_MAP = {entry.demo_id: entry for entry in DEMO_ENTRIES}
|
| 115 |
DEMO_AUDIO_PATH_MAP = {str(entry.prompt_audio_path): entry for entry in DEMO_ENTRIES}
|
| 116 |
DEMO_ENTRY_NAME_MAP = {entry.name: entry for entry in DEMO_ENTRIES}
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
DEMO_CASE_CHOICES = [("Custom Input", "")] + [(entry.name, entry.demo_id) for entry in DEMO_ENTRIES]
|
| 123 |
|
| 124 |
|
| 125 |
def parse_bool_env(name: str, default: bool) -> bool:
|
|
@@ -196,61 +171,69 @@ def preload_service() -> None:
|
|
| 196 |
def render_mode_hint() -> str:
|
| 197 |
return (
|
| 198 |
"Current mode: **Voice Clone** \n"
|
| 199 |
-
"
|
| 200 |
)
|
| 201 |
|
| 202 |
|
| 203 |
-
def
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
|
| 210 |
|
| 211 |
def resolve_effective_prompt_audio_path(
|
| 212 |
-
*,
|
| 213 |
-
voice: str,
|
| 214 |
prompt_audio_path: str | None,
|
| 215 |
selected_demo_audio_path: str | None,
|
| 216 |
) -> str | None:
|
| 217 |
if prompt_audio_path:
|
| 218 |
-
|
|
|
|
|
|
|
| 219 |
if selected_demo_audio_path:
|
| 220 |
-
|
| 221 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
|
| 223 |
|
| 224 |
def build_prompt_source_text(
|
| 225 |
*,
|
| 226 |
-
voice: str,
|
| 227 |
prompt_audio_path: str | None,
|
| 228 |
selected_demo_audio_path: str | None,
|
| 229 |
) -> str:
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 234 |
if demo_entry is not None:
|
| 235 |
-
return f"
|
| 236 |
-
return f"
|
| 237 |
-
|
| 238 |
-
return f"Preset voice: {voice}"
|
| 239 |
-
return "No prompt source selected"
|
| 240 |
|
| 241 |
|
| 242 |
def refresh_prompt_preview(
|
| 243 |
-
voice: str,
|
| 244 |
prompt_audio_path: str | None,
|
| 245 |
selected_demo_audio_path: str | None,
|
| 246 |
):
|
| 247 |
preview_path = resolve_effective_prompt_audio_path(
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
selected_demo_audio_path=selected_demo_audio_path,
|
| 251 |
)
|
| 252 |
prompt_source = build_prompt_source_text(
|
| 253 |
-
voice=voice,
|
| 254 |
prompt_audio_path=prompt_audio_path,
|
| 255 |
selected_demo_audio_path=selected_demo_audio_path,
|
| 256 |
)
|
|
@@ -259,46 +242,27 @@ def refresh_prompt_preview(
|
|
| 259 |
|
| 260 |
def apply_demo_case_selection(
|
| 261 |
demo_case_id: str,
|
| 262 |
-
voice: str,
|
| 263 |
prompt_audio_path: str | None,
|
| 264 |
):
|
| 265 |
-
|
| 266 |
-
if not demo_case_id:
|
| 267 |
-
preview_path, prompt_source = refresh_prompt_preview(voice, prompt_audio_path, "")
|
| 268 |
-
return (
|
| 269 |
-
gr.update(),
|
| 270 |
-
gr.update(),
|
| 271 |
-
preview_path,
|
| 272 |
-
"",
|
| 273 |
-
gr.update(),
|
| 274 |
-
prompt_source,
|
| 275 |
-
)
|
| 276 |
-
|
| 277 |
-
demo_entry = DEMO_ENTRY_MAP.get(demo_case_id)
|
| 278 |
if demo_entry is None:
|
| 279 |
-
preview_path, prompt_source = refresh_prompt_preview(
|
| 280 |
return (
|
| 281 |
-
gr.update(),
|
| 282 |
gr.update(),
|
| 283 |
preview_path,
|
| 284 |
"",
|
| 285 |
-
gr.update(),
|
| 286 |
prompt_source,
|
| 287 |
)
|
| 288 |
|
| 289 |
-
|
| 290 |
-
preview_path =
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
prompt_audio_path=None,
|
| 294 |
-
selected_demo_audio_path=preview_path,
|
| 295 |
)
|
| 296 |
return (
|
| 297 |
demo_entry.text,
|
| 298 |
-
gr.update(value=None),
|
| 299 |
-
preview_path,
|
| 300 |
preview_path,
|
| 301 |
-
|
| 302 |
prompt_source,
|
| 303 |
)
|
| 304 |
|
|
@@ -313,8 +277,8 @@ def validate_request(
|
|
| 313 |
if not normalized_text:
|
| 314 |
raise ValueError("Please enter text to synthesize.")
|
| 315 |
|
| 316 |
-
if not effective_prompt_audio_path
|
| 317 |
-
raise ValueError("No
|
| 318 |
|
| 319 |
return normalized_text
|
| 320 |
|
|
@@ -341,9 +305,9 @@ def estimate_gpu_duration(
|
|
| 341 |
**kwargs,
|
| 342 |
) -> int:
|
| 343 |
text = kwargs.get("text", args[0] if len(args) > 0 else "")
|
| 344 |
-
max_new_frames = kwargs.get("max_new_frames", args[
|
| 345 |
voice_clone_max_text_tokens = (
|
| 346 |
-
kwargs.get("voice_clone_max_text_tokens", args[
|
| 347 |
)
|
| 348 |
text_len = len(str(text or "").strip())
|
| 349 |
estimated = 75 + (text_len // 12) + int(max_new_frames) // 8 + int(voice_clone_max_text_tokens) // 10
|
|
@@ -353,7 +317,6 @@ def estimate_gpu_duration(
|
|
| 353 |
@spaces.GPU(size="large", duration=estimate_gpu_duration)
|
| 354 |
def run_inference(
|
| 355 |
text: str,
|
| 356 |
-
voice: str,
|
| 357 |
prompt_audio_path: str | None,
|
| 358 |
selected_demo_audio_path: str | None,
|
| 359 |
enable_wetext_processing: bool,
|
|
@@ -375,9 +338,8 @@ def run_inference(
|
|
| 375 |
service = get_runtime_tts_service()
|
| 376 |
text_normalizer_manager = get_text_normalizer_manager() if enable_wetext_processing else None
|
| 377 |
effective_prompt_audio_path = resolve_effective_prompt_audio_path(
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
selected_demo_audio_path=selected_demo_audio_path,
|
| 381 |
)
|
| 382 |
normalized_text = validate_request(
|
| 383 |
text=text,
|
|
@@ -386,13 +348,12 @@ def run_inference(
|
|
| 386 |
prepared_texts = prepare_tts_request_texts(
|
| 387 |
text=normalized_text,
|
| 388 |
prompt_text="",
|
| 389 |
-
voice=
|
| 390 |
enable_wetext=bool(enable_wetext_processing),
|
| 391 |
enable_normalize_tts_text=bool(enable_normalize_tts_text),
|
| 392 |
text_normalizer_manager=text_normalizer_manager,
|
| 393 |
)
|
| 394 |
prompt_source = build_prompt_source_text(
|
| 395 |
-
voice=voice,
|
| 396 |
prompt_audio_path=prompt_audio_path,
|
| 397 |
selected_demo_audio_path=selected_demo_audio_path,
|
| 398 |
)
|
|
@@ -405,7 +366,7 @@ def run_inference(
|
|
| 405 |
result = service.synthesize(
|
| 406 |
text=str(prepared_texts["text"]),
|
| 407 |
mode=MODE_VOICE_CLONE,
|
| 408 |
-
voice=
|
| 409 |
prompt_audio_path=effective_prompt_audio_path or None,
|
| 410 |
max_new_frames=int(max_new_frames),
|
| 411 |
voice_clone_max_text_tokens=int(voice_clone_max_text_tokens),
|
|
@@ -456,7 +417,7 @@ def build_demo():
|
|
| 456 |
with gr.Column(scale=3):
|
| 457 |
demo_case = gr.Dropdown(
|
| 458 |
choices=DEMO_CASE_CHOICES,
|
| 459 |
-
value=
|
| 460 |
label="Default Case",
|
| 461 |
info="Select a built-in case to auto-fill the text and prompt preview.",
|
| 462 |
allow_custom_value=True,
|
|
@@ -464,29 +425,24 @@ def build_demo():
|
|
| 464 |
text = gr.Textbox(
|
| 465 |
label="Target Text",
|
| 466 |
lines=10,
|
|
|
|
| 467 |
placeholder="Enter the text to synthesize.",
|
| 468 |
)
|
| 469 |
mode_hint = gr.Markdown(render_mode_hint())
|
| 470 |
-
voice = gr.Dropdown(
|
| 471 |
-
choices=VOICE_CHOICES,
|
| 472 |
-
value=DEFAULT_VOICE_VALUE,
|
| 473 |
-
label="Preset Voice",
|
| 474 |
-
info="Used by default when no reference audio is uploaded.",
|
| 475 |
-
)
|
| 476 |
prompt_audio = gr.Audio(
|
| 477 |
-
label="Reference Audio Upload (optional; overrides
|
| 478 |
type="filepath",
|
| 479 |
sources=["upload"],
|
| 480 |
)
|
| 481 |
prompt_preview = gr.Audio(
|
| 482 |
label="Effective Prompt Preview",
|
| 483 |
-
value=
|
| 484 |
type="filepath",
|
| 485 |
interactive=False,
|
| 486 |
)
|
| 487 |
|
| 488 |
gr.Markdown(
|
| 489 |
-
"Runtime device and backbone are fixed by the Space and are not user-configurable. Uploaded reference audio overrides the selected
|
| 490 |
)
|
| 491 |
|
| 492 |
with gr.Accordion("Advanced Parameters", open=False):
|
|
@@ -580,28 +536,22 @@ def build_demo():
|
|
| 580 |
prompt_source = gr.Textbox(
|
| 581 |
label="Prompt Source",
|
| 582 |
value=build_prompt_source_text(
|
| 583 |
-
voice=DEFAULT_VOICE_VALUE,
|
| 584 |
prompt_audio_path=None,
|
| 585 |
-
selected_demo_audio_path=None,
|
| 586 |
),
|
| 587 |
lines=4,
|
| 588 |
interactive=False,
|
| 589 |
)
|
| 590 |
-
selected_demo_audio_path = gr.State(
|
| 591 |
|
| 592 |
demo_case.change(
|
| 593 |
fn=apply_demo_case_selection,
|
| 594 |
-
inputs=[demo_case,
|
| 595 |
-
outputs=[text,
|
| 596 |
-
)
|
| 597 |
-
voice.change(
|
| 598 |
-
fn=refresh_prompt_preview,
|
| 599 |
-
inputs=[voice, prompt_audio, selected_demo_audio_path],
|
| 600 |
-
outputs=[prompt_preview, prompt_source],
|
| 601 |
)
|
| 602 |
prompt_audio.change(
|
| 603 |
fn=refresh_prompt_preview,
|
| 604 |
-
inputs=[
|
| 605 |
outputs=[prompt_preview, prompt_source],
|
| 606 |
)
|
| 607 |
|
|
@@ -609,7 +559,6 @@ def build_demo():
|
|
| 609 |
fn=run_inference,
|
| 610 |
inputs=[
|
| 611 |
text,
|
| 612 |
-
voice,
|
| 613 |
prompt_audio,
|
| 614 |
selected_demo_audio_path,
|
| 615 |
enable_wetext_processing,
|
|
|
|
| 25 |
|
| 26 |
spaces = _SpacesFallback()
|
| 27 |
|
| 28 |
+
from nano_tts_runtime import DEFAULT_VOICE, NanoTTSService
|
| 29 |
from text_normalization_pipeline import WeTextProcessingManager, prepare_tts_request_texts
|
| 30 |
|
| 31 |
APP_DIR = Path(__file__).resolve().parent
|
|
|
|
| 37 |
|
| 38 |
MODE_VOICE_CLONE = "voice_clone"
|
| 39 |
|
|
|
|
|
|
|
| 40 |
|
| 41 |
@dataclass(frozen=True)
|
| 42 |
class DemoEntry:
|
|
|
|
| 86 |
return demo_entries
|
| 87 |
|
| 88 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
# Built-in demo cases, loaded once at import time, plus lookup tables keyed by
# demo id, by prompt-audio path (as a string) and by display name.
DEMO_ENTRIES = load_demo_entries()
DEMO_ENTRY_MAP = {item.demo_id: item for item in DEMO_ENTRIES}
DEMO_AUDIO_PATH_MAP = {str(item.prompt_audio_path): item for item in DEMO_ENTRIES}
DEMO_ENTRY_NAME_MAP = {item.name: item for item in DEMO_ENTRIES}

# The first demo case acts as the default; every derived default collapses to
# an empty string (or None for the entry itself) when no demo case is loaded.
if DEMO_ENTRIES:
    DEFAULT_DEMO_ENTRY = DEMO_ENTRIES[0]
    DEFAULT_DEMO_CASE_ID = DEFAULT_DEMO_ENTRY.demo_id
    DEFAULT_DEMO_AUDIO_PATH = str(DEFAULT_DEMO_ENTRY.prompt_audio_path)
    DEFAULT_DEMO_TEXT = DEFAULT_DEMO_ENTRY.text
else:
    DEFAULT_DEMO_ENTRY = None
    DEFAULT_DEMO_CASE_ID = ""
    DEFAULT_DEMO_AUDIO_PATH = ""
    DEFAULT_DEMO_TEXT = ""

# (label, value) pairs for the "Default Case" dropdown.
DEMO_CASE_CHOICES = [(item.name, item.demo_id) for item in DEMO_ENTRIES]
|
|
|
|
| 98 |
|
| 99 |
|
| 100 |
def parse_bool_env(name: str, default: bool) -> bool:
|
|
|
|
| 171 |
def render_mode_hint() -> str:
    """Return the Markdown hint describing the current (voice clone) mode."""
    headline = "Current mode: **Voice Clone** \n"
    guidance = (
        "Select a Default Case or upload your own reference audio. "
        "Uploaded audio overrides the selected Default Case."
    )
    return headline + guidance
|
| 176 |
|
| 177 |
|
| 178 |
+
def resolve_default_demo_entry() -> DemoEntry | None:
    """Return the module-level default demo entry, or None when there is none."""
    return DEFAULT_DEMO_ENTRY
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
def resolve_selected_demo_entry(demo_case_id: str | None) -> DemoEntry | None:
    """Look up the demo entry for a dropdown value, falling back to the default.

    Args:
        demo_case_id: Raw value from the demo-case dropdown; it is passed
            through ``normalize_demo_case_id`` before the lookup.

    Returns:
        The matching entry from ``DEMO_ENTRY_MAP``; otherwise whatever
        ``resolve_default_demo_entry()`` returns (which may be None).
    """
    case_key = normalize_demo_case_id(demo_case_id)
    matched_entry = DEMO_ENTRY_MAP.get(case_key) if case_key else None
    if matched_entry is None:
        # Empty or unknown id: use the default case instead.
        return resolve_default_demo_entry()
    return matched_entry
|
| 189 |
|
| 190 |
|
| 191 |
def resolve_effective_prompt_audio_path(
|
|
|
|
|
|
|
| 192 |
prompt_audio_path: str | None,
|
| 193 |
selected_demo_audio_path: str | None,
|
| 194 |
) -> str | None:
|
| 195 |
if prompt_audio_path:
|
| 196 |
+
resolved_path = Path(prompt_audio_path).expanduser().resolve()
|
| 197 |
+
if resolved_path.is_file():
|
| 198 |
+
return str(resolved_path)
|
| 199 |
if selected_demo_audio_path:
|
| 200 |
+
resolved_path = Path(selected_demo_audio_path).expanduser().resolve()
|
| 201 |
+
if resolved_path.is_file():
|
| 202 |
+
return str(resolved_path)
|
| 203 |
+
demo_entry = resolve_default_demo_entry()
|
| 204 |
+
if demo_entry is not None:
|
| 205 |
+
return str(demo_entry.prompt_audio_path)
|
| 206 |
+
return None
|
| 207 |
|
| 208 |
|
| 209 |
def build_prompt_source_text(
    *,
    prompt_audio_path: str | None,
    selected_demo_audio_path: str | None,
) -> str:
    """Describe, for display, which reference audio is in effect.

    Args:
        prompt_audio_path: Path of the user-uploaded reference audio, if any.
        selected_demo_audio_path: Prompt audio path of the selected demo case,
            if any.

    Returns:
        ``"Uploaded reference audio: <file>"`` when the upload is the audio
        actually used, ``"Default case: <name or file>"`` when a demo clip is
        used, or ``"No default case available"`` when nothing resolves.
    """
    effective_prompt_audio_path = resolve_effective_prompt_audio_path(
        prompt_audio_path,
        selected_demo_audio_path,
    )
    if not effective_prompt_audio_path:
        return "No default case available"

    # The resolver only honours an upload that points at an existing file;
    # otherwise it silently falls back to a demo clip. Apply the same test
    # here so a rejected upload is never reported as the prompt source.
    upload_in_use = bool(prompt_audio_path) and (
        Path(prompt_audio_path).expanduser().resolve().is_file()
    )
    if upload_in_use:
        return f"Uploaded reference audio: {Path(effective_prompt_audio_path).name}"

    demo_entry = DEMO_AUDIO_PATH_MAP.get(effective_prompt_audio_path)
    if demo_entry is not None:
        return f"Default case: {demo_entry.name}"
    # Path is not a known demo key: fall back to the bare file name.
    return f"Default case: {Path(effective_prompt_audio_path).name}"
|
|
|
|
|
|
|
| 226 |
|
| 227 |
|
| 228 |
def refresh_prompt_preview(
|
|
|
|
| 229 |
prompt_audio_path: str | None,
|
| 230 |
selected_demo_audio_path: str | None,
|
| 231 |
):
|
| 232 |
preview_path = resolve_effective_prompt_audio_path(
|
| 233 |
+
prompt_audio_path,
|
| 234 |
+
selected_demo_audio_path,
|
|
|
|
| 235 |
)
|
| 236 |
prompt_source = build_prompt_source_text(
|
|
|
|
| 237 |
prompt_audio_path=prompt_audio_path,
|
| 238 |
selected_demo_audio_path=selected_demo_audio_path,
|
| 239 |
)
|
|
|
|
| 242 |
|
| 243 |
def apply_demo_case_selection(
    demo_case_id: str,
    prompt_audio_path: str | None,
):
    """Handle a change of the demo-case dropdown.

    Args:
        demo_case_id: Value selected in the demo-case dropdown.
        prompt_audio_path: Path of the currently uploaded reference audio,
            if any.

    Returns:
        A 4-tuple ``(text, preview_path, selected_demo_audio_path,
        prompt_source)``. When no demo entry can be resolved, ``text`` is a
        no-op ``gr.update()`` and the selected demo audio path is ``""``.
    """
    entry = resolve_selected_demo_entry(demo_case_id)
    if entry is None:
        # Nothing to select: leave the text box alone and clear the demo path.
        text_value, demo_audio_path = gr.update(), ""
    else:
        text_value, demo_audio_path = entry.text, str(entry.prompt_audio_path)

    preview, source_label = refresh_prompt_preview(prompt_audio_path, demo_audio_path)
    return (
        text_value,
        preview,
        demo_audio_path,
        source_label,
    )
|
| 268 |
|
|
|
|
| 277 |
if not normalized_text:
|
| 278 |
raise ValueError("Please enter text to synthesize.")
|
| 279 |
|
| 280 |
+
if not effective_prompt_audio_path:
|
| 281 |
+
raise ValueError("No reference audio is available. Please select a Default Case or upload prompt audio.")
|
| 282 |
|
| 283 |
return normalized_text
|
| 284 |
|
|
|
|
| 305 |
**kwargs,
|
| 306 |
) -> int:
|
| 307 |
text = kwargs.get("text", args[0] if len(args) > 0 else "")
|
| 308 |
+
max_new_frames = kwargs.get("max_new_frames", args[5] if len(args) > 5 else 375)
|
| 309 |
voice_clone_max_text_tokens = (
|
| 310 |
+
kwargs.get("voice_clone_max_text_tokens", args[6] if len(args) > 6 else 75)
|
| 311 |
)
|
| 312 |
text_len = len(str(text or "").strip())
|
| 313 |
estimated = 75 + (text_len // 12) + int(max_new_frames) // 8 + int(voice_clone_max_text_tokens) // 10
|
|
|
|
| 317 |
@spaces.GPU(size="large", duration=estimate_gpu_duration)
|
| 318 |
def run_inference(
|
| 319 |
text: str,
|
|
|
|
| 320 |
prompt_audio_path: str | None,
|
| 321 |
selected_demo_audio_path: str | None,
|
| 322 |
enable_wetext_processing: bool,
|
|
|
|
| 338 |
service = get_runtime_tts_service()
|
| 339 |
text_normalizer_manager = get_text_normalizer_manager() if enable_wetext_processing else None
|
| 340 |
effective_prompt_audio_path = resolve_effective_prompt_audio_path(
|
| 341 |
+
prompt_audio_path,
|
| 342 |
+
selected_demo_audio_path,
|
|
|
|
| 343 |
)
|
| 344 |
normalized_text = validate_request(
|
| 345 |
text=text,
|
|
|
|
| 348 |
prepared_texts = prepare_tts_request_texts(
|
| 349 |
text=normalized_text,
|
| 350 |
prompt_text="",
|
| 351 |
+
voice=DEFAULT_VOICE,
|
| 352 |
enable_wetext=bool(enable_wetext_processing),
|
| 353 |
enable_normalize_tts_text=bool(enable_normalize_tts_text),
|
| 354 |
text_normalizer_manager=text_normalizer_manager,
|
| 355 |
)
|
| 356 |
prompt_source = build_prompt_source_text(
|
|
|
|
| 357 |
prompt_audio_path=prompt_audio_path,
|
| 358 |
selected_demo_audio_path=selected_demo_audio_path,
|
| 359 |
)
|
|
|
|
| 366 |
result = service.synthesize(
|
| 367 |
text=str(prepared_texts["text"]),
|
| 368 |
mode=MODE_VOICE_CLONE,
|
| 369 |
+
voice=DEFAULT_VOICE,
|
| 370 |
prompt_audio_path=effective_prompt_audio_path or None,
|
| 371 |
max_new_frames=int(max_new_frames),
|
| 372 |
voice_clone_max_text_tokens=int(voice_clone_max_text_tokens),
|
|
|
|
| 417 |
with gr.Column(scale=3):
|
| 418 |
demo_case = gr.Dropdown(
|
| 419 |
choices=DEMO_CASE_CHOICES,
|
| 420 |
+
value=DEFAULT_DEMO_CASE_ID,
|
| 421 |
label="Default Case",
|
| 422 |
info="Select a built-in case to auto-fill the text and prompt preview.",
|
| 423 |
allow_custom_value=True,
|
|
|
|
| 425 |
text = gr.Textbox(
|
| 426 |
label="Target Text",
|
| 427 |
lines=10,
|
| 428 |
+
value=DEFAULT_DEMO_TEXT,
|
| 429 |
placeholder="Enter the text to synthesize.",
|
| 430 |
)
|
| 431 |
mode_hint = gr.Markdown(render_mode_hint())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 432 |
prompt_audio = gr.Audio(
|
| 433 |
+
label="Reference Audio Upload (optional; overrides Default Case)",
|
| 434 |
type="filepath",
|
| 435 |
sources=["upload"],
|
| 436 |
)
|
| 437 |
prompt_preview = gr.Audio(
|
| 438 |
label="Effective Prompt Preview",
|
| 439 |
+
value=DEFAULT_DEMO_AUDIO_PATH or None,
|
| 440 |
type="filepath",
|
| 441 |
interactive=False,
|
| 442 |
)
|
| 443 |
|
| 444 |
gr.Markdown(
|
| 445 |
+
"Runtime device and backbone are fixed by the Space and are not user-configurable. Uploaded reference audio overrides the selected Default Case."
|
| 446 |
)
|
| 447 |
|
| 448 |
with gr.Accordion("Advanced Parameters", open=False):
|
|
|
|
| 536 |
prompt_source = gr.Textbox(
|
| 537 |
label="Prompt Source",
|
| 538 |
value=build_prompt_source_text(
|
|
|
|
| 539 |
prompt_audio_path=None,
|
| 540 |
+
selected_demo_audio_path=DEFAULT_DEMO_AUDIO_PATH or None,
|
| 541 |
),
|
| 542 |
lines=4,
|
| 543 |
interactive=False,
|
| 544 |
)
|
| 545 |
+
selected_demo_audio_path = gr.State(DEFAULT_DEMO_AUDIO_PATH)
|
| 546 |
|
| 547 |
demo_case.change(
|
| 548 |
fn=apply_demo_case_selection,
|
| 549 |
+
inputs=[demo_case, prompt_audio],
|
| 550 |
+
outputs=[text, prompt_preview, selected_demo_audio_path, prompt_source],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 551 |
)
|
| 552 |
prompt_audio.change(
|
| 553 |
fn=refresh_prompt_preview,
|
| 554 |
+
inputs=[prompt_audio, selected_demo_audio_path],
|
| 555 |
outputs=[prompt_preview, prompt_source],
|
| 556 |
)
|
| 557 |
|
|
|
|
| 559 |
fn=run_inference,
|
| 560 |
inputs=[
|
| 561 |
text,
|
|
|
|
| 562 |
prompt_audio,
|
| 563 |
selected_demo_audio_path,
|
| 564 |
enable_wetext_processing,
|