Spaces:
Build error
Build error
积极的屁孩 committed on
Commit ·
65b6204
1
Parent(s): fbb3473
add links and style ref text for vevo-tts
Browse files
app.py
CHANGED
|
@@ -525,7 +525,7 @@ def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
|
|
| 525 |
traceback.print_exc()
|
| 526 |
raise e
|
| 527 |
|
| 528 |
-
def vevo_tts(text, ref_wav, timbre_ref_wav=None, src_language="en", ref_language="en"):
|
| 529 |
temp_ref_path = "wav/temp_ref.wav"
|
| 530 |
temp_timbre_path = "wav/temp_timbre.wav"
|
| 531 |
output_path = "wav/output_vevotts.wav"
|
|
@@ -560,6 +560,8 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, src_language="en", ref_language
|
|
| 560 |
|
| 561 |
# 打印debug信息
|
| 562 |
print(f"Reference audio shape: {ref_tensor.shape}, sample rate: {ref_sr}")
|
|
|
|
|
|
|
| 563 |
|
| 564 |
# 保存上传的音频
|
| 565 |
torchaudio.save(temp_ref_path, ref_tensor, ref_sr)
|
|
@@ -603,7 +605,7 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, src_language="en", ref_language
|
|
| 603 |
src_text=text,
|
| 604 |
style_ref_wav_path=temp_ref_path,
|
| 605 |
timbre_ref_wav_path=temp_timbre_path,
|
| 606 |
-
style_ref_wav_text=
|
| 607 |
src_text_language=src_language,
|
| 608 |
style_ref_wav_text_language=ref_language,
|
| 609 |
)
|
|
@@ -626,9 +628,39 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, src_language="en", ref_language
|
|
| 626 |
raise e
|
| 627 |
|
| 628 |
# 创建Gradio界面
|
| 629 |
-
with gr.Blocks(title="VEVO DEMO") as demo:
|
| 630 |
-
gr.Markdown("#
|
| 631 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 632 |
with gr.Tab("Vevo-Timbre"):
|
| 633 |
gr.Markdown("### Vevo-Timbre: Maintain style but transfer timbre")
|
| 634 |
with gr.Row():
|
|
@@ -674,13 +706,14 @@ with gr.Blocks(title="VEVO DEMO") as demo:
|
|
| 674 |
tts_reference = gr.Audio(label="Style Reference", type="numpy")
|
| 675 |
tts_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
|
| 676 |
tts_ref_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Reference Audio Language", value="en")
|
|
|
|
| 677 |
tts_button = gr.Button("Generate")
|
| 678 |
with gr.Column():
|
| 679 |
tts_output = gr.Audio(label="Result")
|
| 680 |
|
| 681 |
tts_button.click(
|
| 682 |
vevo_tts,
|
| 683 |
-
inputs=[tts_text, tts_reference, tts_timbre_reference, tts_src_language, tts_ref_language],
|
| 684 |
outputs=tts_output
|
| 685 |
)
|
| 686 |
|
|
|
|
| 525 |
traceback.print_exc()
|
| 526 |
raise e
|
| 527 |
|
| 528 |
+
def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_language="en", ref_language="en"):
|
| 529 |
temp_ref_path = "wav/temp_ref.wav"
|
| 530 |
temp_timbre_path = "wav/temp_timbre.wav"
|
| 531 |
output_path = "wav/output_vevotts.wav"
|
|
|
|
| 560 |
|
| 561 |
# 打印debug信息
|
| 562 |
print(f"Reference audio shape: {ref_tensor.shape}, sample rate: {ref_sr}")
|
| 563 |
+
if style_ref_text:
|
| 564 |
+
print(f"Style reference text: {style_ref_text}")
|
| 565 |
|
| 566 |
# 保存上传的音频
|
| 567 |
torchaudio.save(temp_ref_path, ref_tensor, ref_sr)
|
|
|
|
| 605 |
src_text=text,
|
| 606 |
style_ref_wav_path=temp_ref_path,
|
| 607 |
timbre_ref_wav_path=temp_timbre_path,
|
| 608 |
+
style_ref_wav_text=style_ref_text,
|
| 609 |
src_text_language=src_language,
|
| 610 |
style_ref_wav_text_language=ref_language,
|
| 611 |
)
|
|
|
|
| 628 |
raise e
|
| 629 |
|
| 630 |
# 创建Gradio界面
|
| 631 |
+
with gr.Blocks(title="Vevo DEMO") as demo:
|
| 632 |
+
gr.Markdown("# Vevo DEMO")
|
| 633 |
+
# 添加链接标签行
|
| 634 |
+
with gr.Row(elem_id="links_row"):
|
| 635 |
+
gr.HTML("""
|
| 636 |
+
<div style="display: flex; justify-content: center; gap: 10px; margin-bottom: 20px;">
|
| 637 |
+
<a href="https://arxiv.org/abs/2502.07243" target="_blank" style="text-decoration: none;">
|
| 638 |
+
<div style="background-color: #4a4a4a; color: white; padding: 5px 10px; border-radius: 5px; display: flex; align-items: center;">
|
| 639 |
+
<span style="background-color: #c44e52; padding: 5px 10px; border-radius: 0 5px 5px 0; margin-left: 5px;">Paper</span>
|
| 640 |
+
<span style="padding: 5px 10px;">arXiv</span>
|
| 641 |
+
</div>
|
| 642 |
+
</a>
|
| 643 |
+
<a href="https://openreview.net/pdf?id=anQDiQZhDP" target="_blank" style="text-decoration: none;">
|
| 644 |
+
<div style="background-color: #4a4a4a; color: white; padding: 5px 10px; border-radius: 5px; display: flex; align-items: center;">
|
| 645 |
+
<span style="background-color: #55a868; padding: 5px 10px; border-radius: 0 5px 5px 0; margin-left: 5px;">Paper</span>
|
| 646 |
+
<span style="padding: 5px 10px;">ICLR</span>
|
| 647 |
+
</div>
|
| 648 |
+
</a>
|
| 649 |
+
<a href="https://huggingface.co/amphion/Vevo" target="_blank" style="text-decoration: none;">
|
| 650 |
+
<div style="background-color: #4a4a4a; color: white; padding: 5px 10px; border-radius: 5px; display: flex; align-items: center;">
|
| 651 |
+
<span style="background-color: #eeca3b; padding: 5px 10px; border-radius: 0 5px 5px 0; margin-left: 5px;">Model</span>
|
| 652 |
+
<span style="padding: 5px 10px;">HuggingFace</span>
|
| 653 |
+
</div>
|
| 654 |
+
</a>
|
| 655 |
+
<a href="https://github.com/open-mmlab/Amphion/tree/main/models/vc/vevo" target="_blank" style="text-decoration: none;">
|
| 656 |
+
<div style="background-color: #4a4a4a; color: white; padding: 5px 10px; border-radius: 5px; display: flex; align-items: center;">
|
| 657 |
+
<span style="background-color: #4c72b0; padding: 5px 10px; border-radius: 0 5px 5px 0; margin-left: 5px;">Repo</span>
|
| 658 |
+
<span style="padding: 5px 10px;">GitHub</span>
|
| 659 |
+
</div>
|
| 660 |
+
</a>
|
| 661 |
+
</div>
|
| 662 |
+
""")
|
| 663 |
+
|
| 664 |
with gr.Tab("Vevo-Timbre"):
|
| 665 |
gr.Markdown("### Vevo-Timbre: Maintain style but transfer timbre")
|
| 666 |
with gr.Row():
|
|
|
|
| 706 |
tts_reference = gr.Audio(label="Style Reference", type="numpy")
|
| 707 |
tts_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
|
| 708 |
tts_ref_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Reference Audio Language", value="en")
|
| 709 |
+
tts_style_ref_text = gr.Textbox(label="Style Reference Text", placeholder="Enter style reference text...", lines=3)
|
| 710 |
tts_button = gr.Button("Generate")
|
| 711 |
with gr.Column():
|
| 712 |
tts_output = gr.Audio(label="Result")
|
| 713 |
|
| 714 |
tts_button.click(
|
| 715 |
vevo_tts,
|
| 716 |
+
inputs=[tts_text, tts_reference, tts_timbre_reference, tts_style_ref_text, tts_src_language, tts_ref_language],
|
| 717 |
outputs=tts_output
|
| 718 |
)
|
| 719 |
|