Spaces:

Difficult-Burger
/

vevo-test

Build error

App Files Community

积极的屁孩 committed on Apr 15, 2025

Commit

cc7434e

1 Parent(s): 29b1e08

adjustments

Browse files

Files changed (1) hide show

app.py +121 -93

app.py CHANGED Viewed

@@ -236,7 +236,7 @@ def vevo_style(content_wav, style_wav):
     # 检查并处理音频数据
     if content_wav is None or style_wav is None:
-        raise ValueError("请上传音频文件")
     # 处理音频格式
     if isinstance(content_wav, tuple) and len(content_wav) == 2:
@@ -260,7 +260,7 @@ def vevo_style(content_wav, style_wav):
         # 归一化音量
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
     else:
-        raise ValueError("内容音频格式不正确")
     if isinstance(style_wav, tuple) and len(style_wav) == 2:
         # 确保正确的顺序 (data, sample_rate)
@@ -272,11 +272,11 @@ def vevo_style(content_wav, style_wav):
         if style_tensor.ndim == 1:
             style_tensor = style_tensor.unsqueeze(0)  # 添加通道维度
     else:
-        raise ValueError("风格音频格式不正确")
     # 打印debug信息
-    print(f"内容音频形状: {content_tensor.shape}, 采样率: {content_sr}")
-    print(f"风格音频形状: {style_tensor.shape}, 采样率: {style_sr}")
     # 保存音频
     torchaudio.save(temp_content_path, content_tensor, content_sr)
@@ -296,17 +296,17 @@ def vevo_style(content_wav, style_wav):
         # 检查生成音频是否为数值异常
         if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
-            print("警告：生成的音频包含NaN或Inf值")
             gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
-        print(f"生成音频形状: {gen_audio.shape}, 最大值: {torch.max(gen_audio)}, 最小值: {torch.min(gen_audio)}")
         # 保存生成的音频
         save_audio(gen_audio, output_path=output_path)
         return output_path
     except Exception as e:
-        print(f"处理过程中出错: {e}")
         import traceback
         traceback.print_exc()
         raise e
@@ -318,7 +318,7 @@ def vevo_timbre(content_wav, reference_wav):
     # 检查并处理音频数据
     if content_wav is None or reference_wav is None:
-        raise ValueError("请上传音频文件")
     # 处理内容音频格式
     if isinstance(content_wav, tuple) and len(content_wav) == 2:
@@ -342,7 +342,7 @@ def vevo_timbre(content_wav, reference_wav):
         # 归一化音量
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
     else:
-        raise ValueError("内容音频格式不正确")
     # 处理参考音频格式
     if isinstance(reference_wav, tuple) and len(reference_wav) == 2:
@@ -366,11 +366,11 @@ def vevo_timbre(content_wav, reference_wav):
         # 归一化音量
         reference_tensor = reference_tensor / (torch.max(torch.abs(reference_tensor)) + 1e-6) * 0.95
     else:
-        raise ValueError("参考音频格式不正确")
     # 打印debug信息
-    print(f"内容音频形状: {content_tensor.shape}, 采样率: {content_sr}")
-    print(f"参考音频形状: {reference_tensor.shape}, 采样率: {reference_sr}")
     # 保存上传的音频
     torchaudio.save(temp_content_path, content_tensor, content_sr)
@@ -389,29 +389,30 @@ def vevo_timbre(content_wav, reference_wav):
         # 检查生成音频是否为数值异常
         if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
-            print("警告：生成的音频包含NaN或Inf值")
             gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
-        print(f"生成音频形状: {gen_audio.shape}, 最大值: {torch.max(gen_audio)}, 最小值: {torch.min(gen_audio)}")
         # 保存生成的音频
         save_audio(gen_audio, output_path=output_path)
         return output_path
     except Exception as e:
-        print(f"处理过程中出错: {e}")
         import traceback
         traceback.print_exc()
         raise e
-def vevo_voice(content_wav, reference_wav):
     temp_content_path = "wav/temp_content.wav"
-    temp_reference_path = "wav/temp_reference.wav"
     output_path = "wav/output_vevovoice.wav"
     # 检查并处理音频数据
-    if content_wav is None or reference_wav is None:
-        raise ValueError("请上传音频文件")
     # 处理内容音频格式
     if isinstance(content_wav, tuple) and len(content_wav) == 2:
@@ -435,39 +436,65 @@ def vevo_voice(content_wav, reference_wav):
         # 归一化音量
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
     else:
-        raise ValueError("内容音频格式不正确")
-    # 处理参考音频格式
-    if isinstance(reference_wav, tuple) and len(reference_wav) == 2:
-        if isinstance(reference_wav[0], np.ndarray):
-            reference_data, reference_sr = reference_wav
         else:
-            reference_sr, reference_data = reference_wav
         # 确保是单声道
-        if len(reference_data.shape) > 1 and reference_data.shape[1] > 1:
-            reference_data = np.mean(reference_data, axis=1)
         # 重采样到24kHz
-        if reference_sr != 24000:
-            reference_tensor = torch.FloatTensor(reference_data).unsqueeze(0)
-            reference_tensor = torchaudio.functional.resample(reference_tensor, reference_sr, 24000)
-            reference_sr = 24000
         else:
-            reference_tensor = torch.FloatTensor(reference_data).unsqueeze(0)
         # 归一化音量
-        reference_tensor = reference_tensor / (torch.max(torch.abs(reference_tensor)) + 1e-6) * 0.95
     else:
-        raise ValueError("参考音频格式不正确")
     # 打印debug信息
-    print(f"内容音频形状: {content_tensor.shape}, 采样率: {content_sr}")
-    print(f"参考音频形状: {reference_tensor.shape}, 采样率: {reference_sr}")
     # 保存上传的音频
     torchaudio.save(temp_content_path, content_tensor, content_sr)
-    torchaudio.save(temp_reference_path, reference_tensor, reference_sr)
     try:
         # 获取管道
@@ -477,23 +504,23 @@ def vevo_voice(content_wav, reference_wav):
         gen_audio = pipeline.inference_ar_and_fm(
             src_wav_path=temp_content_path,
             src_text=None,
-            style_ref_wav_path=temp_reference_path,
-            timbre_ref_wav_path=temp_reference_path,
         )
         # 检查生成音频是否为数值异常
         if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
-            print("警告：生成的音频包含NaN或Inf值")
             gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
-        print(f"生成音频形状: {gen_audio.shape}, 最大值: {torch.max(gen_audio)}, 最小值: {torch.min(gen_audio)}")
         # 保存生成的音频
         save_audio(gen_audio, output_path=output_path)
         return output_path
     except Exception as e:
-        print(f"处理过程中出错: {e}")
         import traceback
         traceback.print_exc()
         raise e
@@ -505,7 +532,7 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, src_language="en", ref_language
     # 检查并处理音频数据
     if ref_wav is None:
-        raise ValueError("请上传参考音频文件")
     # 处理参考音频格式
     if isinstance(ref_wav, tuple) and len(ref_wav) == 2:
@@ -529,10 +556,10 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, src_language="en", ref_language
         # 归一化音量
         ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
     else:
-        raise ValueError("参考音频格式不正确")
     # 打印debug信息
-    print(f"参考音频形状: {ref_tensor.shape}, 采样率: {ref_sr}")
     # 保存上传的音频
     torchaudio.save(temp_ref_path, ref_tensor, ref_sr)
@@ -559,10 +586,10 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, src_language="en", ref_language
             # 归一化音量
             timbre_tensor = timbre_tensor / (torch.max(torch.abs(timbre_tensor)) + 1e-6) * 0.95
-            print(f"音色参考音频形状: {timbre_tensor.shape}, 采样率: {timbre_sr}")
             torchaudio.save(temp_timbre_path, timbre_tensor, timbre_sr)
         else:
-            raise ValueError("音色参考音频格式不正确")
     else:
         temp_timbre_path = temp_ref_path
@@ -583,74 +610,75 @@ def vevo_tts(text, ref_wav, timbre_ref_wav=None, src_language="en", ref_language
         # 检查生成音频是否为数值异常
         if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
-            print("警告：生成的音频包含NaN或Inf值")
             gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
-        print(f"生成音频形状: {gen_audio.shape}, 最大值: {torch.max(gen_audio)}, 最小值: {torch.min(gen_audio)}")
         # 保存生成的音频
         save_audio(gen_audio, output_path=output_path)
         return output_path
     except Exception as e:
-        print(f"处理过程中出错: {e}")
         import traceback
         traceback.print_exc()
         raise e
 # 创建Gradio界面
-with gr.Blocks(title="VEVO Demo") as demo:
-    gr.Markdown("# VEVO: 多功能语音合成模型演示")
-    gr.Markdown("## 可控零样本声音模仿与风格转换")
-    with gr.Tab("风格转换 (Style)"):
-        gr.Markdown("### Vevo-Style: 保持音色但转换风格（如口音、情感等）")
         with gr.Row():
             with gr.Column():
-                style_content = gr.Audio(label="内容音频", type="numpy")
-                style_reference = gr.Audio(label="风格音频", type="numpy")
-                style_button = gr.Button("生成")
             with gr.Column():
-                style_output = gr.Audio(label="生成结果")
-        style_button.click(vevo_style, inputs=[style_content, style_reference], outputs=style_output)
-    with gr.Tab("音色转换 (Timbre)"):
-        gr.Markdown("### Vevo-Timbre: 保持风格但转换音色")
         with gr.Row():
             with gr.Column():
-                timbre_content = gr.Audio(label="内容音频", type="numpy")
-                timbre_reference = gr.Audio(label="音色参考音频", type="numpy")
-                timbre_button = gr.Button("生成")
             with gr.Column():
-                timbre_output = gr.Audio(label="生成结果")
-        timbre_button.click(vevo_timbre, inputs=[timbre_content, timbre_reference], outputs=timbre_output)
-    with gr.Tab("声音转换 (Voice)"):
-        gr.Markdown("### Vevo-Voice: 同时转换风格和音色")
         with gr.Row():
             with gr.Column():
-                voice_content = gr.Audio(label="内容音频", type="numpy")
-                voice_reference = gr.Audio(label="声音参考音频", type="numpy")
-                voice_button = gr.Button("生成")
             with gr.Column():
-                voice_output = gr.Audio(label="生成结果")
-        voice_button.click(vevo_voice, inputs=[voice_content, voice_reference], outputs=voice_output)
-    with gr.Tab("文本到语音 (TTS)"):
-        gr.Markdown("### Vevo-TTS: 风格与音色可控的文本到语音转换")
         with gr.Row():
             with gr.Column():
-                tts_text = gr.Textbox(label="输入文本", placeholder="请输入要合成的文本...", lines=3)
-                tts_src_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="文本语言", value="en")
-                tts_reference = gr.Audio(label="风格参考音频", type="numpy")
-                tts_ref_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="参考音频语言", value="en")
-                with gr.Accordion("高级选项", open=False):
-                    tts_timbre_reference = gr.Audio(label="音色参考音频（可选）", type="numpy")
-                tts_button = gr.Button("生成")
             with gr.Column():
-                tts_output = gr.Audio(label="生成结果")
         tts_button.click(
             vevo_tts,
@@ -659,14 +687,14 @@ with gr.Blocks(title="VEVO Demo") as demo:
         )
     gr.Markdown("""
-    ## 关于VEVO
-    VEVO是一个多功能语音合成和转换模型，提供四种主要功能：
-    1. **Vevo-Style**: 保持音色但转换风格（如口音、情感等）
-    2. **Vevo-Timbre**: 保持风格但转换音色
-    3. **Vevo-Voice**: 同时转换风格和音色
-    4. **Vevo-TTS**: 风格与音色可控的文本到语音转换
-    更多信息请访问[Amphion项目](https://github.com/open-mmlab/Amphion)
     """)
 # 启动应用

     # 检查并处理音频数据
     if content_wav is None or style_wav is None:
+        raise ValueError("Please upload audio files")
     # 处理音频格式
     if isinstance(content_wav, tuple) and len(content_wav) == 2:
         # 归一化音量
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
     else:
+        raise ValueError("Invalid content audio format")
     if isinstance(style_wav, tuple) and len(style_wav) == 2:
         # 确保正确的顺序 (data, sample_rate)
         if style_tensor.ndim == 1:
             style_tensor = style_tensor.unsqueeze(0)  # 添加通道维度
     else:
+        raise ValueError("Invalid style audio format")
     # 打印debug信息
+    print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
+    print(f"Style audio shape: {style_tensor.shape}, sample rate: {style_sr}")
     # 保存音频
     torchaudio.save(temp_content_path, content_tensor, content_sr)
         # 检查生成音频是否为数值异常
         if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
+            print("Warning: Generated audio contains NaN or Inf values")
             gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
+        print(f"Generated audio shape: {gen_audio.shape}, max: {torch.max(gen_audio)}, min: {torch.min(gen_audio)}")
         # 保存生成的音频
         save_audio(gen_audio, output_path=output_path)
         return output_path
     except Exception as e:
+        print(f"Error during processing: {e}")
         import traceback
         traceback.print_exc()
         raise e
     # 检查并处理音频数据
     if content_wav is None or reference_wav is None:
+        raise ValueError("Please upload audio files")
     # 处理内容音频格式
     if isinstance(content_wav, tuple) and len(content_wav) == 2:
         # 归一化音量
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
     else:
+        raise ValueError("Invalid content audio format")
     # 处理参考音频格式
     if isinstance(reference_wav, tuple) and len(reference_wav) == 2:
         # 归一化音量
         reference_tensor = reference_tensor / (torch.max(torch.abs(reference_tensor)) + 1e-6) * 0.95
     else:
+        raise ValueError("Invalid reference audio format")
     # 打印debug信息
+    print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
+    print(f"Reference audio shape: {reference_tensor.shape}, sample rate: {reference_sr}")
     # 保存上传的音频
     torchaudio.save(temp_content_path, content_tensor, content_sr)
         # 检查生成音频是否为数值异常
         if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
+            print("Warning: Generated audio contains NaN or Inf values")
             gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
+        print(f"Generated audio shape: {gen_audio.shape}, max: {torch.max(gen_audio)}, min: {torch.min(gen_audio)}")
         # 保存生成的音频
         save_audio(gen_audio, output_path=output_path)
         return output_path
     except Exception as e:
+        print(f"Error during processing: {e}")
         import traceback
         traceback.print_exc()
         raise e
+def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
     temp_content_path = "wav/temp_content.wav"
+    temp_style_path = "wav/temp_style.wav"
+    temp_timbre_path = "wav/temp_timbre.wav"
     output_path = "wav/output_vevovoice.wav"
     # 检查并处理音频数据
+    if content_wav is None or style_reference_wav is None or timbre_reference_wav is None:
+        raise ValueError("Please upload all required audio files")
     # 处理内容音频格式
     if isinstance(content_wav, tuple) and len(content_wav) == 2:
         # 归一化音量
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
     else:
+        raise ValueError("Invalid content audio format")
+    # 处理风格参考音频格式
+    if isinstance(style_reference_wav, tuple) and len(style_reference_wav) == 2:
+        if isinstance(style_reference_wav[0], np.ndarray):
+            style_data, style_sr = style_reference_wav
         else:
+            style_sr, style_data = style_reference_wav
         # 确保是单声道
+        if len(style_data.shape) > 1 and style_data.shape[1] > 1:
+            style_data = np.mean(style_data, axis=1)
         # 重采样到24kHz
+        if style_sr != 24000:
+            style_tensor = torch.FloatTensor(style_data).unsqueeze(0)
+            style_tensor = torchaudio.functional.resample(style_tensor, style_sr, 24000)
+            style_sr = 24000
         else:
+            style_tensor = torch.FloatTensor(style_data).unsqueeze(0)
         # 归一化音量
+        style_tensor = style_tensor / (torch.max(torch.abs(style_tensor)) + 1e-6) * 0.95
+    else:
+        raise ValueError("Invalid style reference audio format")
+    # 处理音色参考音频格式
+    if isinstance(timbre_reference_wav, tuple) and len(timbre_reference_wav) == 2:
+        if isinstance(timbre_reference_wav[0], np.ndarray):
+            timbre_data, timbre_sr = timbre_reference_wav
+        else:
+            timbre_sr, timbre_data = timbre_reference_wav
+        # 确保是单声道
+        if len(timbre_data.shape) > 1 and timbre_data.shape[1] > 1:
+            timbre_data = np.mean(timbre_data, axis=1)
+        # 重采样到24kHz
+        if timbre_sr != 24000:
+            timbre_tensor = torch.FloatTensor(timbre_data).unsqueeze(0)
+            timbre_tensor = torchaudio.functional.resample(timbre_tensor, timbre_sr, 24000)
+            timbre_sr = 24000
+        else:
+            timbre_tensor = torch.FloatTensor(timbre_data).unsqueeze(0)
+        # 归一化音量
+        timbre_tensor = timbre_tensor / (torch.max(torch.abs(timbre_tensor)) + 1e-6) * 0.95
     else:
+        raise ValueError("Invalid timbre reference audio format")
     # 打印debug信息
+    print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
+    print(f"Style reference audio shape: {style_tensor.shape}, sample rate: {style_sr}")
+    print(f"Timbre reference audio shape: {timbre_tensor.shape}, sample rate: {timbre_sr}")
     # 保存上传的音频
     torchaudio.save(temp_content_path, content_tensor, content_sr)
+    torchaudio.save(temp_style_path, style_tensor, style_sr)
+    torchaudio.save(temp_timbre_path, timbre_tensor, timbre_sr)
     try:
         # 获取管道
         gen_audio = pipeline.inference_ar_and_fm(
             src_wav_path=temp_content_path,
             src_text=None,
+            style_ref_wav_path=temp_style_path,
+            timbre_ref_wav_path=temp_timbre_path,
         )
         # 检查生成音频是否为数值异常
         if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
+            print("Warning: Generated audio contains NaN or Inf values")
             gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
+        print(f"Generated audio shape: {gen_audio.shape}, max: {torch.max(gen_audio)}, min: {torch.min(gen_audio)}")
         # 保存生成的音频
         save_audio(gen_audio, output_path=output_path)
         return output_path
     except Exception as e:
+        print(f"Error during processing: {e}")
         import traceback
         traceback.print_exc()
         raise e
     # 检查并处理音频数据
     if ref_wav is None:
+        raise ValueError("Please upload a reference audio file")
     # 处理参考音频格式
     if isinstance(ref_wav, tuple) and len(ref_wav) == 2:
         # 归一化音量
         ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
     else:
+        raise ValueError("Invalid reference audio format")
     # 打印debug信息
+    print(f"Reference audio shape: {ref_tensor.shape}, sample rate: {ref_sr}")
     # 保存上传的音频
     torchaudio.save(temp_ref_path, ref_tensor, ref_sr)
             # 归一化音量
             timbre_tensor = timbre_tensor / (torch.max(torch.abs(timbre_tensor)) + 1e-6) * 0.95
+            print(f"Timbre reference audio shape: {timbre_tensor.shape}, sample rate: {timbre_sr}")
             torchaudio.save(temp_timbre_path, timbre_tensor, timbre_sr)
         else:
+            raise ValueError("Invalid timbre reference audio format")
     else:
         temp_timbre_path = temp_ref_path
         # 检查生成音频是否为数值异常
         if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
+            print("Warning: Generated audio contains NaN or Inf values")
             gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
+        print(f"Generated audio shape: {gen_audio.shape}, max: {torch.max(gen_audio)}, min: {torch.min(gen_audio)}")
         # 保存生成的音频
         save_audio(gen_audio, output_path=output_path)
         return output_path
     except Exception as e:
+        print(f"Error during processing: {e}")
         import traceback
         traceback.print_exc()
         raise e
 # 创建Gradio界面
+with gr.Blocks(title="VEVO DEMO") as demo:
+    gr.Markdown("# VEVO DEMO")
+    gr.Markdown("## Controllable Zero-Shot Voice Conversion and Style Transfer")
+    with gr.Tab("Vevo-Timbre"):
+        gr.Markdown("### Vevo-Timbre: Maintain style but transfer timbre")
         with gr.Row():
             with gr.Column():
+                timbre_content = gr.Audio(label="Content Audio", type="numpy")
+                timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
+                timbre_button = gr.Button("Generate")
             with gr.Column():
+                timbre_output = gr.Audio(label="Result")
+        timbre_button.click(vevo_timbre, inputs=[timbre_content, timbre_reference], outputs=timbre_output)
+    with gr.Tab("Vevo-Voice"):
+        gr.Markdown("### Vevo-Voice: Transfer both style and timbre with separate references")
         with gr.Row():
             with gr.Column():
+                voice_content = gr.Audio(label="Content Audio", type="numpy")
+                voice_style_reference = gr.Audio(label="Style Reference", type="numpy")
+                voice_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
+                voice_button = gr.Button("Generate")
             with gr.Column():
+                voice_output = gr.Audio(label="Result")
+        voice_button.click(vevo_voice, inputs=[voice_content, voice_style_reference, voice_timbre_reference], outputs=voice_output)
+    with gr.Tab("Vevo-Style"):
+        gr.Markdown("### Vevo-Style: Maintain timbre but transfer style (accent, emotion, etc.)")
         with gr.Row():
             with gr.Column():
+                style_content = gr.Audio(label="Content Audio", type="numpy")
+                style_reference = gr.Audio(label="Style Reference", type="numpy")
+                style_button = gr.Button("Generate")
             with gr.Column():
+                style_output = gr.Audio(label="Result")
+        style_button.click(vevo_style, inputs=[style_content, style_reference], outputs=style_output)
+    with gr.Tab("Vevo-TTS"):
+        gr.Markdown("### Vevo-TTS: Text-to-speech with controllable style and timbre")
         with gr.Row():
             with gr.Column():
+                tts_text = gr.Textbox(label="Input Text", placeholder="Enter text to synthesize...", lines=3)
+                tts_src_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Text Language", value="en")
+                tts_reference = gr.Audio(label="Style Reference", type="numpy")
+                tts_ref_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Reference Audio Language", value="en")
+                with gr.Accordion("Advanced Options", open=False):
+                    tts_timbre_reference = gr.Audio(label="Timbre Reference (Optional)", type="numpy")
+                tts_button = gr.Button("Generate")
             with gr.Column():
+                tts_output = gr.Audio(label="Result")
         tts_button.click(
             vevo_tts,
         )
     gr.Markdown("""
+    ## About VEVO
+    VEVO is a versatile voice synthesis and conversion model that offers four main functionalities:
+    1. **Vevo-Style**: Maintains timbre but transfers style (accent, emotion, etc.)
+    2. **Vevo-Timbre**: Maintains style but transfers timbre
+    3. **Vevo-Voice**: Transfers both style and timbre simultaneously
+    4. **Vevo-TTS**: Text-to-speech with controllable style and timbre
+    For more information, visit the [Amphion project](https://github.com/open-mmlab/Amphion)
     """)
 # 启动应用