smartwang committed on
Commit
ddefb98
·
1 Parent(s): 48360e9
Files changed (1) hide show
  1. app.py +140 -14
app.py CHANGED
@@ -13,6 +13,7 @@ import torch
13
  from huggingface_hub import snapshot_download, login
14
  from qwen_tts import Qwen3TTSModel
15
  import functools
 
16
 
17
  # 配置日志
18
  logging.basicConfig(
@@ -221,9 +222,10 @@ def infer_voice_design(part, language, voice_description):
221
  return wavs[0], sr
222
 
223
 
 
224
  @spaces.GPU
225
  def infer_voice_clone( part, language,audio_tuple,ref_text,use_xvector_only):
226
- """Single segment inference for Voice Clone."""
227
  # tts = BASE_MODELS[model_size]
228
  tts = load_model("Base", "0.6B")
229
  voice_clone_prompt = tts.create_voice_clone_prompt(
@@ -239,6 +241,19 @@ def infer_voice_clone( part, language,audio_tuple,ref_text,use_xvector_only):
239
  )
240
  return wavs[0], sr
241
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
  def extract_voice_clone_prompt(audio_tuple,ref_text,use_xvector_only):
243
  logger.info("正在提取参考音频特征(仅执行一次)...")
244
  tts = load_model("Base", "0.6B")
@@ -248,7 +263,16 @@ def extract_voice_clone_prompt(audio_tuple,ref_text,use_xvector_only):
248
  x_vector_only_mode=use_xvector_only
249
  )
250
  logger.info("参考音频特征提取完成。")
251
- return voice_clone_prompt
 
 
 
 
 
 
 
 
 
252
  # @spaces.GPU(duration=60)
253
  # def infer_custom_voice(model_size, part, language, speaker, instruct):
254
  # """Single segment inference for Custom Voice."""
@@ -324,6 +348,39 @@ def generate_voice_clone(ref_audio, ref_text, target_text, language, use_xvector
324
  logger.error(f"Voice Clone 生成失败: {str(e)}", exc_info=True)
325
  return None, f"错误: {type(e).__name__}: {e}"
326
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
327
 
328
  # def generate_custom_voice(text, language, speaker, instruct, model_size, progress=gr.Progress(track_tqdm=True)):
329
  # """Generate speech using CustomVoice model with segment-based GPU allocation."""
@@ -415,47 +472,116 @@ Built with [Qwen3-TTS](https://github.com/QwenLM/Qwen3-TTS) by Alibaba Qwen Team
415
 
416
  # Tab 2: Voice Clone (Base)
417
  with gr.Tab("Voice Clone (Base)"):
418
- gr.Markdown("### Clone Voice from Reference Audio")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
419
  with gr.Row():
420
  with gr.Column(scale=2):
421
  clone_ref_audio = gr.Audio(
422
- label="Reference Audio (Upload a voice sample to clone)",
423
  type="numpy",
424
  )
425
  clone_ref_text = gr.Textbox(
426
- label="Reference Text (Transcript of the reference audio)",
427
  lines=2,
428
- placeholder="Enter the exact text spoken in the reference audio...",
429
  )
430
  clone_xvector = gr.Checkbox(
431
- label="Use x-vector only (No reference text needed, but lower quality)",
432
  value=False,
433
  )
434
 
435
  with gr.Column(scale=2):
436
  clone_target_text = gr.Textbox(
437
- label="Target Text (Text to synthesize with cloned voice)",
438
  lines=4,
439
- placeholder="Enter the text you want the cloned voice to speak...",
440
  )
441
  with gr.Row():
442
  clone_language = gr.Dropdown(
443
- label="Language",
444
  choices=LANGUAGES,
445
  value="Auto",
446
  interactive=True,
447
  )
448
  clone_model_size = gr.Dropdown(
449
- label="Model Size",
450
  choices=MODEL_SIZES,
451
  value="1.7B",
452
  interactive=True,
453
  )
454
- clone_btn = gr.Button("Clone & Generate", variant="primary")
455
 
456
  with gr.Row():
457
- clone_audio_out = gr.Audio(label="Generated Audio", type="numpy")
458
- clone_status = gr.Textbox(label="Status", lines=2, interactive=False)
459
 
460
  clone_btn.click(
461
  generate_voice_clone,
 
13
  from huggingface_hub import snapshot_download, login
14
  from qwen_tts import Qwen3TTSModel
15
  import functools
16
+ import uuid
17
 
18
  # 配置日志
19
  logging.basicConfig(
 
222
  return wavs[0], sr
223
 
224
 
225
+
226
  @spaces.GPU
227
  def infer_voice_clone( part, language,audio_tuple,ref_text,use_xvector_only):
228
+ """Single segment inference for Voice Clone using reference audio."""
229
  # tts = BASE_MODELS[model_size]
230
  tts = load_model("Base", "0.6B")
231
  voice_clone_prompt = tts.create_voice_clone_prompt(
 
241
  )
242
  return wavs[0], sr
243
 
244
@spaces.GPU
def infer_voice_clone_from_prompt(part, language, voice_clone_prompt):
    """Synthesize one text segment from an already-extracted voice clone prompt.

    Loads the "Base" / "0.6B" model through ``load_model`` and forwards the
    segment to ``generate_voice_clone`` with a 2048 new-token cap.

    Args:
        part: The text segment to synthesize.
        language: Language option passed straight through to the model.
        voice_clone_prompt: Pre-extracted prompt object, passed through
            unchanged (presumably the value produced by
            ``create_voice_clone_prompt`` -- confirm against the caller).

    Returns:
        A ``(waveform, sample_rate)`` pair, where ``waveform`` is the first
        entry of the waveforms returned by the model.
    """
    model = load_model("Base", "0.6B")
    generation_kwargs = {
        "text": part,
        "language": language,
        "voice_clone_prompt": voice_clone_prompt,
        "max_new_tokens": 2048,
    }
    waveforms, sample_rate = model.generate_voice_clone(**generation_kwargs)
    # Only a single segment is requested, so only the first waveform is used.
    first_waveform = waveforms[0]
    return first_waveform, sample_rate
255
+
256
+ @spaces.GPU
257
  def extract_voice_clone_prompt(audio_tuple,ref_text,use_xvector_only):
258
  logger.info("正在提取参考音频特征(仅执行一次)...")
259
  tts = load_model("Base", "0.6B")
 
263
  x_vector_only_mode=use_xvector_only
264
  )
265
  logger.info("参考音频特征提取完成。")
266
+
267
+ # 生成唯一的文件名
268
+ file_id = str(uuid.uuid4())[:8]
269
+ file_path = f"voice_clone_prompt_{file_id}.pt"
270
+
271
+ # 保存到文件
272
+ torch.save(voice_clone_prompt, file_path)
273
+ logger.info(f"voice_clone_prompt 已保存到: {file_path}")
274
+
275
+ return file_path
276
  # @spaces.GPU(duration=60)
277
  # def infer_custom_voice(model_size, part, language, speaker, instruct):
278
  # """Single segment inference for Custom Voice."""
 
348
  logger.error(f"Voice Clone 生成失败: {str(e)}", exc_info=True)
349
  return None, f"错误: {type(e).__name__}: {e}"
350
 
351
def generate_voice_clone_from_prompt_file(prompt_file_path, target_text, language, progress=gr.Progress(track_tqdm=True)):
    """Generate speech with the Base (Voice Clone) model from a saved prompt file.

    The target text is split with ``split_text``; every segment is synthesized
    by ``infer_voice_clone_from_prompt`` and the resulting waveforms are
    concatenated into a single clip.

    Args:
        prompt_file_path: Value delivered by the ``gr.File`` input. Either a
            filesystem path (``str`` / ``os.PathLike``) or an object exposing
            the path through a ``.name`` attribute.
        target_text: Text to synthesize. Must contain non-whitespace content.
        language: Language option forwarded to the inference function.
        progress: Gradio progress tracker used to wrap the segment loop.

    Returns:
        ``((sample_rate, waveform), status_message)`` on success, or
        ``(None, error_message)`` when validation fails or any exception is
        raised during loading or inference.
    """
    import os  # local import: keeps this block self-contained

    if not target_text or not target_text.strip():
        return None, "错误:目标文本不能为空。"

    if not prompt_file_path:
        return None, "错误:需要提供音频特征文件。"

    # gr.File may hand over a plain path or a tempfile-like wrapper whose
    # ``.name`` holds the path; normalise both to something torch.load accepts.
    # PathLike objects are kept as-is because ``Path.name`` is only the basename.
    if isinstance(prompt_file_path, (str, os.PathLike)):
        prompt_path = prompt_file_path
    else:
        prompt_path = getattr(prompt_file_path, "name", prompt_file_path)

    logger.info(f"开始 Voice Clone 生成任务(使用特征文件)。语言: {language}, 目标文本长度: {len(target_text)}, 特征文件: {prompt_path}")
    try:
        # Load the pre-extracted voice features.
        logger.info("正在加载音频特征文件...")
        # SECURITY NOTE(review): torch.load unpickles the file, and this file
        # is uploaded by the user. A crafted .pt file can execute arbitrary
        # code. Consider ``weights_only=True`` or a non-pickle format
        # (e.g. safetensors) once the prompt object's structure allows it.
        voice_clone_prompt = torch.load(prompt_path, map_location='cpu')
        logger.info("音频特征文件加载成功。")

        text_parts = split_text(target_text.strip())
        logger.info(f"目标文本已切分为 {len(text_parts)} 段。")
        if not text_parts:
            # Nothing to synthesize; avoid np.concatenate([]) raising an
            # opaque ValueError further down.
            return None, "错误:目标文本不能为空。"

        all_wavs = []
        sr = 24000  # fallback; overwritten by the rate reported by inference

        for i, part in enumerate(progress.tqdm(text_parts, desc="正在生成分段")):
            logger.info(f"正在处理第 {i+1}/{len(text_parts)} 段文本...")
            wav, current_sr = infer_voice_clone_from_prompt(part, language, voice_clone_prompt)
            all_wavs.append(wav)
            sr = current_sr

        # Log before merging so the message matches what is about to happen.
        logger.info("Voice Clone 生成任务完成,正在合并音频...")
        combined_wav = np.concatenate(all_wavs)
        return (sr, combined_wav), "语音克隆生成成功(使用特征文件)!"
    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing.
        logger.error(f"Voice Clone 生成失败: {str(e)}", exc_info=True)
        return None, f"错误: {type(e).__name__}: {e}"
383
+
384
 
385
  # def generate_custom_voice(text, language, speaker, instruct, model_size, progress=gr.Progress(track_tqdm=True)):
386
  # """Generate speech using CustomVoice model with segment-based GPU allocation."""
 
472
 
473
  # Tab 2: Voice Clone (Base)
474
  with gr.Tab("Voice Clone (Base)"):
475
+ # Section 1: Extract Voice Features
476
+ gr.Markdown("### 1. 提取音频特征")
477
+ gr.Markdown("上传参考音频并提取特征,保存为文件供后续使用。")
478
+ with gr.Row():
479
+ with gr.Column(scale=2):
480
+ extract_ref_audio = gr.Audio(
481
+ label="参考音频",
482
+ type="numpy",
483
+ )
484
+ extract_ref_text = gr.Textbox(
485
+ label="参考文本(参考音频的文字内容)",
486
+ lines=2,
487
+ placeholder="输入参考音频中的确切文字...",
488
+ )
489
+ extract_xvector = gr.Checkbox(
490
+ label="仅使用 x-vector(无需参考文本,但质量较低)",
491
+ value=False,
492
+ )
493
+ extract_btn = gr.Button("提取音频特征", variant="primary")
494
+
495
+ with gr.Column(scale=2):
496
+ extract_file_out = gr.File(label="下载特征文件 (.pt)")
497
+ extract_status = gr.Textbox(label="状态", lines=2, interactive=False)
498
+
499
+ extract_btn.click(
500
+ extract_voice_clone_prompt,
501
+ inputs=[extract_ref_audio, extract_ref_text, extract_xvector],
502
+ outputs=[extract_file_out],
503
+ api_name="extract_voice_clone_prompt"
504
+ )
505
+
506
+ gr.Markdown("---")
507
+
508
+ # Section 2: Generate Voice from Features
509
+ gr.Markdown("### 2. 使用特征文件生成语音")
510
+ gr.Markdown("上传之前提取的特征文件,快速生成语音(无需重复提取特征)。")
511
+ with gr.Row():
512
+ with gr.Column(scale=2):
513
+ prompt_file = gr.File(
514
+ label="音频特征文件 (.pt)",
515
+ )
516
+ prompt_target_text = gr.Textbox(
517
+ label="目标文本(要用克隆音色合成的文字)",
518
+ lines=4,
519
+ placeholder="输入要让克隆音色说话的文字...",
520
+ )
521
+ prompt_language = gr.Dropdown(
522
+ label="语言",
523
+ choices=LANGUAGES,
524
+ value="Auto",
525
+ interactive=True,
526
+ )
527
+ prompt_btn = gr.Button("使用特征文件生成", variant="primary")
528
+
529
+ with gr.Column(scale=2):
530
+ prompt_audio_out = gr.Audio(label="生成的音频", type="numpy")
531
+ prompt_status = gr.Textbox(label="状态", lines=2, interactive=False)
532
+
533
+ prompt_btn.click(
534
+ generate_voice_clone_from_prompt_file,
535
+ inputs=[prompt_file, prompt_target_text, prompt_language],
536
+ outputs=[prompt_audio_out, prompt_status],
537
+ api_name="generate_voice_clone_from_prompt"
538
+ )
539
+
540
+ gr.Markdown("---")
541
+
542
+ # Section 3: Traditional Voice Clone (Original)
543
+ gr.Markdown("### 3. 传统音色克隆(直接使用参考音频)")
544
+ gr.Markdown("直接上传参考音频生成语音(每次都需要提取特征)。")
545
  with gr.Row():
546
  with gr.Column(scale=2):
547
  clone_ref_audio = gr.Audio(
548
+ label="参考音频",
549
  type="numpy",
550
  )
551
  clone_ref_text = gr.Textbox(
552
+ label="参考文本",
553
  lines=2,
554
+ placeholder="输入参考音频中的确切文字...",
555
  )
556
  clone_xvector = gr.Checkbox(
557
+ label="仅使用 x-vector",
558
  value=False,
559
  )
560
 
561
  with gr.Column(scale=2):
562
  clone_target_text = gr.Textbox(
563
+ label="目标文本",
564
  lines=4,
565
+ placeholder="输入要让克隆音色说话的文字...",
566
  )
567
  with gr.Row():
568
  clone_language = gr.Dropdown(
569
+ label="语言",
570
  choices=LANGUAGES,
571
  value="Auto",
572
  interactive=True,
573
  )
574
  clone_model_size = gr.Dropdown(
575
+ label="模型大小",
576
  choices=MODEL_SIZES,
577
  value="1.7B",
578
  interactive=True,
579
  )
580
+ clone_btn = gr.Button("克隆并生成", variant="primary")
581
 
582
  with gr.Row():
583
+ clone_audio_out = gr.Audio(label="生成的音频", type="numpy")
584
+ clone_status = gr.Textbox(label="状态", lines=2, interactive=False)
585
 
586
  clone_btn.click(
587
  generate_voice_clone,