Spaces:
Running on Zero
Running on Zero
app.py
CHANGED
|
@@ -13,6 +13,7 @@ import torch
|
|
| 13 |
from huggingface_hub import snapshot_download, login
|
| 14 |
from qwen_tts import Qwen3TTSModel
|
| 15 |
import functools
|
|
|
|
| 16 |
|
| 17 |
# 配置日志
|
| 18 |
logging.basicConfig(
|
|
@@ -221,9 +222,10 @@ def infer_voice_design(part, language, voice_description):
|
|
| 221 |
return wavs[0], sr
|
| 222 |
|
| 223 |
|
|
|
|
| 224 |
@spaces.GPU
|
| 225 |
def infer_voice_clone( part, language,audio_tuple,ref_text,use_xvector_only):
|
| 226 |
-
"""Single segment inference for Voice Clone."""
|
| 227 |
# tts = BASE_MODELS[model_size]
|
| 228 |
tts = load_model("Base", "0.6B")
|
| 229 |
voice_clone_prompt = tts.create_voice_clone_prompt(
|
|
@@ -239,6 +241,19 @@ def infer_voice_clone( part, language,audio_tuple,ref_text,use_xvector_only):
|
|
| 239 |
)
|
| 240 |
return wavs[0], sr
|
| 241 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
def extract_voice_clone_prompt(audio_tuple,ref_text,use_xvector_only):
|
| 243 |
logger.info("正在提取参考音频特征(仅执行一次)...")
|
| 244 |
tts = load_model("Base", "0.6B")
|
|
@@ -248,7 +263,16 @@ def extract_voice_clone_prompt(audio_tuple,ref_text,use_xvector_only):
|
|
| 248 |
x_vector_only_mode=use_xvector_only
|
| 249 |
)
|
| 250 |
logger.info("参考音频特征提取完成。")
|
| 251 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
# @spaces.GPU(duration=60)
|
| 253 |
# def infer_custom_voice(model_size, part, language, speaker, instruct):
|
| 254 |
# """Single segment inference for Custom Voice."""
|
|
@@ -324,6 +348,39 @@ def generate_voice_clone(ref_audio, ref_text, target_text, language, use_xvector
|
|
| 324 |
logger.error(f"Voice Clone 生成失败: {str(e)}", exc_info=True)
|
| 325 |
return None, f"错误: {type(e).__name__}: {e}"
|
| 326 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 327 |
|
| 328 |
# def generate_custom_voice(text, language, speaker, instruct, model_size, progress=gr.Progress(track_tqdm=True)):
|
| 329 |
# """Generate speech using CustomVoice model with segment-based GPU allocation."""
|
|
@@ -415,47 +472,116 @@ Built with [Qwen3-TTS](https://github.com/QwenLM/Qwen3-TTS) by Alibaba Qwen Team
|
|
| 415 |
|
| 416 |
# Tab 2: Voice Clone (Base)
|
| 417 |
with gr.Tab("Voice Clone (Base)"):
|
| 418 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 419 |
with gr.Row():
|
| 420 |
with gr.Column(scale=2):
|
| 421 |
clone_ref_audio = gr.Audio(
|
| 422 |
-
label="
|
| 423 |
type="numpy",
|
| 424 |
)
|
| 425 |
clone_ref_text = gr.Textbox(
|
| 426 |
-
label="
|
| 427 |
lines=2,
|
| 428 |
-
placeholder="
|
| 429 |
)
|
| 430 |
clone_xvector = gr.Checkbox(
|
| 431 |
-
label="
|
| 432 |
value=False,
|
| 433 |
)
|
| 434 |
|
| 435 |
with gr.Column(scale=2):
|
| 436 |
clone_target_text = gr.Textbox(
|
| 437 |
-
label="
|
| 438 |
lines=4,
|
| 439 |
-
placeholder="
|
| 440 |
)
|
| 441 |
with gr.Row():
|
| 442 |
clone_language = gr.Dropdown(
|
| 443 |
-
label="
|
| 444 |
choices=LANGUAGES,
|
| 445 |
value="Auto",
|
| 446 |
interactive=True,
|
| 447 |
)
|
| 448 |
clone_model_size = gr.Dropdown(
|
| 449 |
-
label="
|
| 450 |
choices=MODEL_SIZES,
|
| 451 |
value="1.7B",
|
| 452 |
interactive=True,
|
| 453 |
)
|
| 454 |
-
clone_btn = gr.Button("
|
| 455 |
|
| 456 |
with gr.Row():
|
| 457 |
-
clone_audio_out = gr.Audio(label="
|
| 458 |
-
clone_status = gr.Textbox(label="
|
| 459 |
|
| 460 |
clone_btn.click(
|
| 461 |
generate_voice_clone,
|
|
|
|
| 13 |
from huggingface_hub import snapshot_download, login
|
| 14 |
from qwen_tts import Qwen3TTSModel
|
| 15 |
import functools
|
| 16 |
+
import uuid
|
| 17 |
|
| 18 |
# 配置日志
|
| 19 |
logging.basicConfig(
|
|
|
|
| 222 |
return wavs[0], sr
|
| 223 |
|
| 224 |
|
| 225 |
+
|
| 226 |
@spaces.GPU
|
| 227 |
def infer_voice_clone( part, language,audio_tuple,ref_text,use_xvector_only):
|
| 228 |
+
"""Single segment inference for Voice Clone using reference audio."""
|
| 229 |
# tts = BASE_MODELS[model_size]
|
| 230 |
tts = load_model("Base", "0.6B")
|
| 231 |
voice_clone_prompt = tts.create_voice_clone_prompt(
|
|
|
|
| 241 |
)
|
| 242 |
return wavs[0], sr
|
| 243 |
|
| 244 |
+
@spaces.GPU
def infer_voice_clone_from_prompt(part, language, voice_clone_prompt):
    """Synthesize a single text segment using a pre-extracted voice-clone prompt.

    Args:
        part: The text segment to synthesize.
        language: Language value, forwarded unchanged to the model.
        voice_clone_prompt: Pre-extracted prompt, forwarded unchanged to the
            model's ``generate_voice_clone`` call.

    Returns:
        A ``(waveform, sample_rate)`` tuple: the first waveform returned by
        the model together with the sample rate it reported.
    """
    model = load_model("Base", "0.6B")

    # Gather the generation arguments first so the call itself stays short.
    generation_kwargs = {
        "text": part,
        "language": language,
        "voice_clone_prompt": voice_clone_prompt,
        "max_new_tokens": 2048,
    }
    waveforms, sample_rate = model.generate_voice_clone(**generation_kwargs)

    # Only the first returned waveform is handed back to the caller.
    return waveforms[0], sample_rate
|
| 255 |
+
|
| 256 |
+
@spaces.GPU
|
| 257 |
def extract_voice_clone_prompt(audio_tuple,ref_text,use_xvector_only):
|
| 258 |
logger.info("正在提取参考音频特征(仅执行一次)...")
|
| 259 |
tts = load_model("Base", "0.6B")
|
|
|
|
| 263 |
x_vector_only_mode=use_xvector_only
|
| 264 |
)
|
| 265 |
logger.info("参考音频特征提取完成。")
|
| 266 |
+
|
| 267 |
+
# 生成唯一的文件名
|
| 268 |
+
file_id = str(uuid.uuid4())[:8]
|
| 269 |
+
file_path = f"voice_clone_prompt_{file_id}.pt"
|
| 270 |
+
|
| 271 |
+
# 保存到文件
|
| 272 |
+
torch.save(voice_clone_prompt, file_path)
|
| 273 |
+
logger.info(f"voice_clone_prompt 已保存到: {file_path}")
|
| 274 |
+
|
| 275 |
+
return file_path
|
| 276 |
# @spaces.GPU(duration=60)
|
| 277 |
# def infer_custom_voice(model_size, part, language, speaker, instruct):
|
| 278 |
# """Single segment inference for Custom Voice."""
|
|
|
|
| 348 |
logger.error(f"Voice Clone 生成失败: {str(e)}", exc_info=True)
|
| 349 |
return None, f"错误: {type(e).__name__}: {e}"
|
| 350 |
|
| 351 |
+
def generate_voice_clone_from_prompt_file(prompt_file_path, target_text, language, progress=gr.Progress(track_tqdm=True)):
    """Generate speech with the Base (Voice Clone) model from a saved prompt file.

    The target text is split into segments, each segment is synthesized with
    the prompt loaded from ``prompt_file_path``, and the per-segment waveforms
    are concatenated into a single clip.

    Args:
        prompt_file_path: Location of the saved voice-clone prompt; passed
            directly to ``torch.load``.
        target_text: Text to synthesize. Must contain non-whitespace characters.
        language: Language value forwarded to the per-segment inference call.
        progress: Gradio progress tracker used to report per-segment progress.

    Returns:
        ``((sample_rate, waveform), status_message)`` on success, or
        ``(None, error_message)`` when validation or generation fails. No
        exception escapes; failures are logged and reported in the message.
    """
    if not target_text or not target_text.strip():
        return None, "错误:目标文本不能为空。"

    if not prompt_file_path:
        return None, "错误:需要提供音频特征文件。"

    logger.info(f"开始 Voice Clone 生成任务(使用特征文件)。语言: {language}, 目标文本长度: {len(target_text)}, 特征文件: {prompt_file_path}")
    try:
        # Load the pre-extracted voice features.
        logger.info("正在加载音频特征文件...")
        # NOTE(security): torch.load unpickles the file, and this path comes
        # from a user upload in the UI. A crafted file can execute arbitrary
        # code during loading. Consider ``weights_only=True`` (or a non-pickle
        # format) if the saved prompt only contains tensors and plain
        # containers -- TODO confirm the prompt's type before switching.
        voice_clone_prompt = torch.load(prompt_file_path, map_location='cpu')
        logger.info("音频特征文件加载成功。")

        text_parts = split_text(target_text.strip())
        logger.info(f"目标文本已切分为 {len(text_parts)} 段。")
        if not text_parts:
            # np.concatenate raises an opaque ValueError on an empty list;
            # report the condition explicitly instead.
            return None, "错误:目标文本切分后没有可生成的内容。"

        all_wavs = []
        sr = 24000  # overwritten below by the rate the model reports

        for i, part in enumerate(progress.tqdm(text_parts, desc="正在生成分段")):
            logger.info(f"正在处理第 {i+1}/{len(text_parts)} 段文本...")
            wav, current_sr = infer_voice_clone_from_prompt(part, language, voice_clone_prompt)
            all_wavs.append(wav)
            sr = current_sr

        # Log before merging so the message matches what is happening.
        logger.info("Voice Clone 生成任务完成,正在合并音频...")
        combined_wav = np.concatenate(all_wavs)
        return (sr, combined_wav), "语音克隆生成成功(使用特征文件)!"
    except Exception as e:
        # UI boundary: log the traceback and surface the error as a status
        # string rather than letting it propagate to Gradio.
        logger.exception(f"Voice Clone 生成失败: {str(e)}")
        return None, f"错误: {type(e).__name__}: {e}"
|
| 383 |
+
|
| 384 |
|
| 385 |
# def generate_custom_voice(text, language, speaker, instruct, model_size, progress=gr.Progress(track_tqdm=True)):
|
| 386 |
# """Generate speech using CustomVoice model with segment-based GPU allocation."""
|
|
|
|
| 472 |
|
| 473 |
# Tab 2: Voice Clone (Base)
|
| 474 |
with gr.Tab("Voice Clone (Base)"):
|
| 475 |
+
# Section 1: Extract Voice Features
|
| 476 |
+
gr.Markdown("### 1. 提取音频特征")
|
| 477 |
+
gr.Markdown("上传参考音频并提取特征,保存为文件供后续使用。")
|
| 478 |
+
with gr.Row():
|
| 479 |
+
with gr.Column(scale=2):
|
| 480 |
+
extract_ref_audio = gr.Audio(
|
| 481 |
+
label="参考音频",
|
| 482 |
+
type="numpy",
|
| 483 |
+
)
|
| 484 |
+
extract_ref_text = gr.Textbox(
|
| 485 |
+
label="参考文本(参考音频的文字内容)",
|
| 486 |
+
lines=2,
|
| 487 |
+
placeholder="输入参考音频中的确切文字...",
|
| 488 |
+
)
|
| 489 |
+
extract_xvector = gr.Checkbox(
|
| 490 |
+
label="仅使用 x-vector(无需参考文本,但质量较低)",
|
| 491 |
+
value=False,
|
| 492 |
+
)
|
| 493 |
+
extract_btn = gr.Button("提取音频特征", variant="primary")
|
| 494 |
+
|
| 495 |
+
with gr.Column(scale=2):
|
| 496 |
+
extract_file_out = gr.File(label="下载特征文件 (.pt)")
|
| 497 |
+
extract_status = gr.Textbox(label="状态", lines=2, interactive=False)
|
| 498 |
+
|
| 499 |
+
extract_btn.click(
|
| 500 |
+
extract_voice_clone_prompt,
|
| 501 |
+
inputs=[extract_ref_audio, extract_ref_text, extract_xvector],
|
| 502 |
+
outputs=[extract_file_out],
|
| 503 |
+
api_name="extract_voice_clone_prompt"
|
| 504 |
+
)
|
| 505 |
+
|
| 506 |
+
gr.Markdown("---")
|
| 507 |
+
|
| 508 |
+
# Section 2: Generate Voice from Features
|
| 509 |
+
gr.Markdown("### 2. 使用特征文件生成语音")
|
| 510 |
+
gr.Markdown("上传之前提取的特征文件,快速生成语音(无需重复提取特征)。")
|
| 511 |
+
with gr.Row():
|
| 512 |
+
with gr.Column(scale=2):
|
| 513 |
+
prompt_file = gr.File(
|
| 514 |
+
label="音频特征文件 (.pt)",
|
| 515 |
+
)
|
| 516 |
+
prompt_target_text = gr.Textbox(
|
| 517 |
+
label="目标文本(要用克隆音色合成的文字)",
|
| 518 |
+
lines=4,
|
| 519 |
+
placeholder="输入要让克隆音色说话的文字...",
|
| 520 |
+
)
|
| 521 |
+
prompt_language = gr.Dropdown(
|
| 522 |
+
label="语言",
|
| 523 |
+
choices=LANGUAGES,
|
| 524 |
+
value="Auto",
|
| 525 |
+
interactive=True,
|
| 526 |
+
)
|
| 527 |
+
prompt_btn = gr.Button("使用特征文件生成", variant="primary")
|
| 528 |
+
|
| 529 |
+
with gr.Column(scale=2):
|
| 530 |
+
prompt_audio_out = gr.Audio(label="生成的音频", type="numpy")
|
| 531 |
+
prompt_status = gr.Textbox(label="状态", lines=2, interactive=False)
|
| 532 |
+
|
| 533 |
+
prompt_btn.click(
|
| 534 |
+
generate_voice_clone_from_prompt_file,
|
| 535 |
+
inputs=[prompt_file, prompt_target_text, prompt_language],
|
| 536 |
+
outputs=[prompt_audio_out, prompt_status],
|
| 537 |
+
api_name="generate_voice_clone_from_prompt"
|
| 538 |
+
)
|
| 539 |
+
|
| 540 |
+
gr.Markdown("---")
|
| 541 |
+
|
| 542 |
+
# Section 3: Traditional Voice Clone (Original)
|
| 543 |
+
gr.Markdown("### 3. 传统音色克隆(直接使用参考音频)")
|
| 544 |
+
gr.Markdown("直接上传参考音频生成语音(每次都需要提取特征)。")
|
| 545 |
with gr.Row():
|
| 546 |
with gr.Column(scale=2):
|
| 547 |
clone_ref_audio = gr.Audio(
|
| 548 |
+
label="参考音频",
|
| 549 |
type="numpy",
|
| 550 |
)
|
| 551 |
clone_ref_text = gr.Textbox(
|
| 552 |
+
label="参考文本",
|
| 553 |
lines=2,
|
| 554 |
+
placeholder="输入参考音频中的确切文字...",
|
| 555 |
)
|
| 556 |
clone_xvector = gr.Checkbox(
|
| 557 |
+
label="仅使用 x-vector",
|
| 558 |
value=False,
|
| 559 |
)
|
| 560 |
|
| 561 |
with gr.Column(scale=2):
|
| 562 |
clone_target_text = gr.Textbox(
|
| 563 |
+
label="目标文本",
|
| 564 |
lines=4,
|
| 565 |
+
placeholder="输入要让克隆音色说话的文字...",
|
| 566 |
)
|
| 567 |
with gr.Row():
|
| 568 |
clone_language = gr.Dropdown(
|
| 569 |
+
label="语言",
|
| 570 |
choices=LANGUAGES,
|
| 571 |
value="Auto",
|
| 572 |
interactive=True,
|
| 573 |
)
|
| 574 |
clone_model_size = gr.Dropdown(
|
| 575 |
+
label="模型大小",
|
| 576 |
choices=MODEL_SIZES,
|
| 577 |
value="1.7B",
|
| 578 |
interactive=True,
|
| 579 |
)
|
| 580 |
+
clone_btn = gr.Button("克隆并生成", variant="primary")
|
| 581 |
|
| 582 |
with gr.Row():
|
| 583 |
+
clone_audio_out = gr.Audio(label="生成的音频", type="numpy")
|
| 584 |
+
clone_status = gr.Textbox(label="状态", lines=2, interactive=False)
|
| 585 |
|
| 586 |
clone_btn.click(
|
| 587 |
generate_voice_clone,
|