# All-In-One-TTS / app.py
# Committed by KaixuanGuo
# Last change: "Update app.py" (commit 16c3779, verified)
import gradio as gr
from gradio_client import Client, handle_file
import warnings
import asyncio
# Silence the resource warning emitted when the async event loop is closed.
warnings.filterwarnings(
    "ignore",
    category=RuntimeWarning,
    message=".*Event loop is closed.*",
)

MODEL_SIZES = ["0.6B", "1.7B"]

# Speaker and language choices for the CustomVoice model.
SPEAKERS = [
    "Aiden",
    "Dylan",
    "Eric",
    "Ono_anna",
    "Ryan",
    "Serena",
    "Sohee",
    "Uncle_fu",
    "Vivian",
]
LANGUAGES = [
    "Auto",
    "Chinese",
    "English",
    "Japanese",
    "Korean",
    "French",
    "German",
    "Spanish",
    "Portuguese",
    "Russian",
]

S1_PROMPT_WAV = "assets/audios/female_mandarin.wav"
S2_PROMPT_WAV = "assets/audios/male_mandarin.wav"

# Names of the upstream Spaces; they double as keys of ``client_map``.
soulx_name = "SoulX-Podcast-1.7B"
qwen3tts_name = "Qwen3-TTS"
mosstts_name = "MOSS-TTS-Family"
mosstts_name1 = "MOSS-TTS"
mosstts_name2 = "MOSS-TTSD-v1.0"
mosstts_name3 = "MOSS-VoiceGenerator"
cosy_name = "Fun-CosyVoice3-0.5B"
ming_name = "ming-uniaudio-demo"

# 1. Connect to the target TTS models (Hugging Face Space-to-Space calls).
# Note: this hands the compute load to the servers hosting the original models.
client_map = {
    space_name: Client(f"{owner}/{space_name}")
    for owner, space_name in (
        ("Soul-AILab", soulx_name),
        ("Qwen", qwen3tts_name),
        ("OpenMOSS-Team", mosstts_name1),
        ("OpenMOSS-Team", mosstts_name2),
        ("OpenMOSS-Team", mosstts_name3),
        ("FunAudioLLM", cosy_name),
        ("cafe3310", ming_name),
    )
}
def generate_tts_custom(text, lang, speaker, model_size, model_name):
    """Synthesize ``text`` with a preset speaker via a remote Space.

    Calls the ``/generate_custom_voice`` endpoint of the client registered
    under ``model_name`` in ``client_map``.

    Args:
        text: Text to synthesize. An empty value short-circuits to ``None``.
        lang: Language choice forwarded to the endpoint ("Auto" when empty).
        speaker: Preset speaker name forwarded to the endpoint ("Ryan" when
            empty).
        model_size: Model size forwarded to the endpoint ("1.7B" when empty).
        model_name: Key of the target client in ``client_map``.

    Returns:
        The endpoint's return value unchanged, or ``None`` when ``text`` is
        empty or the remote call fails (the error is printed).
    """
    if not text:
        return None
    try:
        # Forward the caller's selections; the previous hard-coded values are
        # kept only as fallbacks for empty inputs.
        result = client_map[model_name].predict(
            text=text,
            language=lang or "Auto",
            speaker=speaker or "Ryan",
            instruct="",
            model_size=model_size or "1.7B",
            api_name="/generate_custom_voice",
        )
    except Exception as e:
        # Best-effort boundary: callers treat ``None`` as "generation failed".
        print(f"Error: {e}")
        return None
    print(result)
    return result
def generate_voice_design(text, lang, instruct, model_name):
    """Synthesize ``text`` in a voice described by a free-form instruction.

    Supported backends are ``qwen3tts_name`` (``/generate_voice_design``) and
    ``mosstts_name3`` (``/run_inference``). Any other model is reported as
    unsupported instead of failing on an unassigned local.

    Args:
        text: Text to synthesize. An empty value short-circuits to ``None``.
        lang: Language choice; forwarded to the Qwen3-TTS endpoint ("Auto"
            when empty). The MOSS endpoint is called without a language.
        instruct: Voice/style description passed to the backend.
        model_name: Key of the target client in ``client_map``.

    Returns:
        The endpoint's return value unchanged, or ``None`` when ``text`` is
        empty, the model is unsupported, or the remote call fails (the error
        is printed).
    """
    if not text:
        return None
    try:
        if model_name == qwen3tts_name:
            request = dict(
                text=text,
                language=lang or "Auto",
                voice_description=instruct,
                api_name="/generate_voice_design",
            )
        elif model_name == mosstts_name3:
            request = dict(
                text=text,
                instruction=instruct,
                temperature=1.5,
                top_p=0.6,
                top_k=50,
                repetition_penalty=1.1,
                max_new_tokens=4096,
                api_name="/run_inference",
            )
        else:
            # Covers ming_name (not wired up yet) and any unknown model.
            raise ValueError(
                f"voice design is not implemented for model {model_name!r}"
            )
        result = client_map[model_name].predict(**request)
        print(result)
        return result
    except Exception as e:
        # Best-effort boundary: callers treat ``None`` as "generation failed".
        print(f"Error: {e}")
        return None
def generate_voice_clone(s1_wav, s1_txt, text, lang, model_name):
    """Synthesize ``text`` in the voice of a reference recording.

    Supported backends are ``qwen3tts_name``, ``mosstts_name1`` and
    ``cosy_name``; each is called with the parameter names of its own
    endpoint.

    Args:
        s1_wav: Path of the reference audio file (required).
        s1_txt: Transcript of the reference audio. Sent to Qwen3-TTS and
            CosyVoice; the MOSS-TTS request does not include it.
        text: Text to synthesize. An empty value short-circuits to ``None``.
        lang: Language choice; forwarded to the Qwen3-TTS endpoint ("Auto"
            when empty). The other endpoints are called without a language.
        model_name: Key of the target client in ``client_map``.

    Returns:
        The endpoint's return value; for CosyVoice a bare string result is
        wrapped as ``(result, "voice clone successfully!")``. ``None`` when
        ``text`` is empty, the reference audio is missing, the model is
        unsupported, or the remote call fails (the error is printed).
    """
    if not text:
        return None
    try:
        if not s1_wav:
            raise ValueError("voice clone requires a reference audio file")

        if model_name == qwen3tts_name:
            result = client_map[model_name].predict(
                ref_audio=handle_file(s1_wav),
                ref_text=s1_txt,
                target_text=text,
                language=lang or "Auto",
                use_xvector_only=False,
                model_size="1.7B",
                api_name="/generate_voice_clone",
            )
        elif model_name == mosstts_name1:
            result = client_map[model_name].predict(
                text=text,
                reference_audio=handle_file(s1_wav),
                mode_with_reference="Clone",
                duration_control_enabled=False,
                duration_tokens=1,
                temperature=1.7,
                top_p=0.8,
                top_k=25,
                repetition_penalty=1,
                max_new_tokens=4096,
                api_name="/run_inference",
            )
        elif model_name == cosy_name:
            # The same reference file is supplied for both the "upload" and
            # the "record" inputs of the endpoint.
            result = client_map[model_name].predict(
                tts_text=text,
                mode_value="zero_shot",
                prompt_text=s1_txt,
                prompt_wav_upload=handle_file(s1_wav),
                prompt_wav_record=handle_file(s1_wav),
                instruct_text="You are a helpful assistant. 请非常开心地说一句话。<|endofprompt|>",
                seed=0,
                stream=False,
                ui_lang="Zh",
                api_name="/generate_audio",
            )
            # Normalize a bare path into the (audio, status) pair that the
            # caller unpacks.
            if isinstance(result, str):
                result = (result, "voice clone successfully!")
        else:
            raise ValueError(
                f"voice clone is not implemented for model {model_name!r}"
            )
        print(result)
        return result
    except Exception as e:
        # Best-effort boundary: callers treat ``None`` as "generation failed".
        print(f"Error: {e}")
        return None
def generate_podcast(text, s1_wav, s1_txt, s2_wav, s2_txt, lang, seed, model_name):
    """Synthesize a two-speaker dialogue from a script and two voice prompts.

    Supported backends are ``soulx_name`` (``/dialogue_synthesis_function``)
    and ``mosstts_name2`` (``/run_inference``).

    Args:
        text: Dialogue script. An empty value short-circuits to ``None``.
        s1_wav: Path of speaker 1's reference audio (required).
        s1_txt: Transcript of speaker 1's reference audio.
        s2_wav: Path of speaker 2's reference audio (required).
        s2_txt: Transcript of speaker 2's reference audio.
        lang: Accepted for signature parity with the other generators;
            neither endpoint is called with a language.
        seed: Random seed forwarded to the SoulX endpoint (1988 when
            ``None``). The MOSS-TTSD request has no seed field.
        model_name: Key of the target client in ``client_map``.

    Returns:
        A tuple result from the endpoint unchanged; any other result is
        wrapped as ``(result, status_message)``. ``None`` when ``text`` is
        empty, a reference audio is missing, the model is unsupported, or
        the remote call fails (the error is printed).
    """
    if not text:
        return None
    try:
        if not (s1_wav and s2_wav):
            raise ValueError("podcast mode requires reference audio for both speakers")

        if model_name == soulx_name:
            result = client_map[model_name].predict(
                target_text=text,
                spk1_prompt_text=s1_txt,
                spk1_prompt_audio=handle_file(s1_wav),
                spk1_dialect_prompt_text="",
                spk2_prompt_text=s2_txt,
                spk2_prompt_audio=handle_file(s2_wav),
                spk2_dialect_prompt_text="",
                # Honour the caller's seed instead of a fixed value.
                seed=1988 if seed is None else int(seed),
                api_name="/dialogue_synthesis_function",
            )
        elif model_name == mosstts_name2:
            # The endpoint exposes positional-style names (param_N).
            # NOTE(review): param_3..param_5 and param_8..param_10 look like
            # the audio/text slots of speakers 3-5, filled with placeholders
            # because speaker_count is 2 — confirm against the Space's API.
            result = client_map[model_name].predict(
                speaker_count=2,
                param_1=handle_file(s1_wav),
                param_2=handle_file(s2_wav),
                param_3=handle_file(s1_wav),
                param_4=handle_file(s1_wav),
                param_5=handle_file(s1_wav),
                param_6=s1_txt,
                param_7=s2_txt,
                param_8="Hello",
                param_9="Hello!!",
                param_10="Hello!!",
                param_11=text,
                param_12=True,
                param_13=False,
                param_14=1.1,
                param_15=0.9,
                param_16=50,
                param_17=1.1,
                param_18=2000,
                api_name="/run_inference",
            )
        else:
            raise ValueError(
                f"podcast generation is not implemented for model {model_name!r}"
            )
        print(result)
        # Normalize to the (audio, status) pair that the caller unpacks.
        if not isinstance(result, tuple):
            result = (result, "podcast generation completed successfully!")
        return result
    except Exception as e:
        # Best-effort boundary: callers treat ``None`` as "generation failed".
        print(f"Error: {e}")
        return None
def build_ui():
    """Assemble the All-In-One-TTS Gradio interface.

    The page has two tab strips (feature mode and backend model) whose
    selection is mirrored into two textboxes, four input columns, a single
    generate button that is disabled for unsupported mode/model pairs, and a
    shared audio/status output area.

    Returns:
        The constructed ``gr.Blocks`` app. It is not launched here.
    """
    default_button_label = "🚀 Start Generating Audio"

    # Mode ids exactly as the tab handlers write them into "Current Mode".
    known_modes = {
        "voice clone",
        "customvoice",
        "podcast",
        "style instruction",
        "tta",
        "speech with bgm",
    }

    # For every backend: the modes it cannot serve.
    unsupported_modes = {
        soulx_name: {"customvoice", "voice clone", "style instruction", "tta", "speech with bgm"},
        qwen3tts_name: {"podcast", "tta", "speech with bgm"},
        cosy_name: {"customvoice", "style instruction", "podcast", "tta", "speech with bgm"},
        mosstts_name1: {"customvoice", "style instruction", "podcast", "tta", "speech with bgm"},
        mosstts_name2: {"customvoice", "voice clone", "style instruction", "tta", "speech with bgm"},
        mosstts_name3: {"customvoice", "voice clone", "podcast", "tta", "speech with bgm"},
        ming_name: {"customvoice", "podcast"},
    }

    # Custom CSS for the tab strips. Notes inside the string use CSS comment
    # syntax (``/* ... */``); ``#`` is not a comment marker in CSS. The panel
    # rules stay disabled, as before.
    custom_css = """
    /* 1. 基础 Tab 样式:加粗、增大字号 */
    .mode-tabs button, .model-tabs button {
        font-weight: 700 !important; /* 强制加粗 */
        font-size: 16px !important; /* 稍微放大字号 */
        color: #4b5563 !important; /* 默认灰色 */
    }
    /* 3. 背景与面板美化 */
    /*
    .mode-tabs { background: #f0f2f5; border-radius: 10px; padding: 10px; margin-bottom: 10px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); }
    .model-tabs { background: #e6f7ff; border-radius: 10px; padding: 10px; margin-bottom: 20px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); }
    .column-panel { background: #ffffff; border: 1px solid #e5e7eb; border-radius: 8px; padding: 15px !important; }
    */
    """

    with gr.Blocks(title="All-In-One-TTS", css=custom_css) as demo:
        gr.Markdown("# 🎛️ TTS All-In-One Unified Panel")

        # --- State relay textboxes ---
        # Tab clicks write here; validation and generation read from here.
        # Visible while debugging; can be switched to visible=False later.
        mode_selection = gr.Textbox(value="voice clone", visible=True, label="Current Mode")
        model_selection = gr.Textbox(value=qwen3tts_name, visible=True, label="Current Model")

        # --- First tab strip: feature mode ---
        # The tabs are intentionally empty; they only act as selectors.
        with gr.Tabs(elem_classes="mode-tabs"):
            with gr.Tab("Voice Clone", id="voice clone") as tab_mode_2:
                pass
            with gr.Tab("CustomVoice", id="customvoice") as tab_mode_3:
                pass
            with gr.Tab("Podcast", id="podcast") as tab_mode_4:
                pass
            with gr.Tab("Style Instruction", id="style instruction") as tab_mode_1:
                pass
            with gr.Tab("TTA", id="tta") as tab_mode_5:
                pass
            with gr.Tab("Speech with BGM", id="speech with bgm") as tab_mode_6:
                pass

        # --- Second tab strip: backend model ---
        with gr.Tabs(elem_classes="model-tabs"):
            with gr.Tab(qwen3tts_name, id=qwen3tts_name) as tab_model_1:
                pass
            with gr.Tab(cosy_name, id=cosy_name) as tab_model_3:
                pass
            with gr.Tab(soulx_name, id=soulx_name) as tab_model_2:
                pass
            with gr.Tab(mosstts_name1, id=mosstts_name1) as tab_model_4:
                pass
            with gr.Tab(mosstts_name2, id=mosstts_name2) as tab_model_5:
                pass
            with gr.Tab(mosstts_name3, id=mosstts_name3) as tab_model_6:
                pass
            with gr.Tab("Ming-Omni-TTS", id=ming_name) as tab_model_7:
                pass

        gr.Markdown("---")

        with gr.Row():
            # --- Column 1: style instruction ---
            with gr.Column(scale=1, variant="panel", elem_classes="column-panel"):
                gr.Markdown("### 🎨 Style & Design")
                design_instruct = gr.Textbox(
                    label="Style Instruction / Voice Description",
                    lines=18,
                    placeholder="Describe voice style or custom voice...",
                    value="展现出悲苦沙哑的声音质感,语速偏慢,情绪浓烈且带有哭腔,以标准普通话缓慢诉说,情感强烈,语调哀怨高亢,音高起伏大。",
                )

            # --- Column 2: speaker 1 reference material ---
            with gr.Column(scale=1, variant="panel", elem_classes="column-panel"):
                gr.Markdown("### 👤 SPK1 Clone")
                spk1_ref_audio = gr.Audio(label="SPK1 Audio", type="filepath")
                spk1_ref_text = gr.Textbox(label="SPK1 Text", lines=3)

            # --- Column 3: speaker 2 reference material ---
            with gr.Column(scale=1, variant="panel", elem_classes="column-panel"):
                gr.Markdown("### 👥 SPK2 Clone")
                spk2_ref_audio = gr.Audio(label="SPK2 Audio", type="filepath")
                spk2_ref_text = gr.Textbox(label="SPK2 Text", lines=3)

            # --- Column 4: target text and configuration ---
            with gr.Column(scale=1, variant="panel", elem_classes="column-panel"):
                gr.Markdown("### 📝 Synthesis")
                shared_text = gr.Textbox(
                    label="Target Text / Script",
                    lines=5,
                    value="[S1] 哈喽,AI时代的冲浪先锋们!欢迎! [S2] 欢迎啊!",
                )
                with gr.Row():
                    shared_language = gr.Dropdown(label="Language", choices=LANGUAGES, value="Auto")
                    tts_speaker = gr.Dropdown(label="Speaker", choices=SPEAKERS, value="Ryan")
                with gr.Row():
                    shared_model_size = gr.Dropdown(label="Model Size", choices=MODEL_SIZES, value="1.7B")
                    seed_input = gr.Number(label="Seed", value=1988, precision=0)

        # --- Single full-width generate button ---
        gr.Markdown("---")
        with gr.Row():
            generate_btn = gr.Button(default_button_label, variant="primary", size="lg")

        # --- Global output ---
        with gr.Row():
            with gr.Column():
                shared_audio_out = gr.Audio(label="Final Generated Audio", type="numpy")
                shared_status = gr.Textbox(label="System Status", lines=2, interactive=False)

        # --- 1. Validation of the mode/model combination ---
        def validate_combination(mode, model):
            """Enable or disable the generate button for a mode/model pair."""
            # Compare case-insensitively so the check cannot be bypassed by a
            # differently-cased mode string.
            mode_key = (mode or "").strip().lower()
            is_invalid = (
                model not in unsupported_modes
                or mode_key not in known_modes
                or mode_key in unsupported_modes[model]
            )
            if is_invalid:
                # Disabled state plus a warning as the button label.
                return gr.update(interactive=False, value=f"🚫 {model} 不支持 {mode} 模式")
            # Normal state with the original label.
            return gr.update(interactive=True, value=default_button_label)

        # --- 2. Re-validate whenever either relay textbox changes ---
        mode_selection.change(
            fn=validate_combination,
            inputs=[mode_selection, model_selection],
            outputs=generate_btn,
        )
        model_selection.change(
            fn=validate_combination,
            inputs=[mode_selection, model_selection],
            outputs=generate_btn,
        )

        # --- Tab selection writes the matching id into the relay textbox ---
        # Mode ids must match the lowercase ids used by validation and by the
        # dispatcher below.
        tab_mode_1.select(fn=lambda: "style instruction", outputs=mode_selection)
        tab_mode_2.select(fn=lambda: "voice clone", outputs=mode_selection)
        tab_mode_3.select(fn=lambda: "customvoice", outputs=mode_selection)
        tab_mode_4.select(fn=lambda: "podcast", outputs=mode_selection)
        tab_mode_5.select(fn=lambda: "tta", outputs=mode_selection)
        tab_mode_6.select(fn=lambda: "speech with bgm", outputs=mode_selection)

        tab_model_1.select(fn=lambda: qwen3tts_name, outputs=model_selection)
        tab_model_2.select(fn=lambda: soulx_name, outputs=model_selection)
        tab_model_3.select(fn=lambda: cosy_name, outputs=model_selection)
        tab_model_4.select(fn=lambda: mosstts_name1, outputs=model_selection)
        tab_model_5.select(fn=lambda: mosstts_name2, outputs=model_selection)
        tab_model_6.select(fn=lambda: mosstts_name3, outputs=model_selection)
        tab_model_7.select(fn=lambda: ming_name, outputs=model_selection)

        # --- Unified dispatch ---
        def unified_generation(
            mode, model_name,
            text, lang, model_size, speaker, instruct,
            s1_wav, s1_txt, s2_wav, s2_txt, seed
        ):
            """Route the request to the generator for ``mode``.

            Returns:
                ``(audio, status_text)`` for the two output components.

            Raises:
                gr.Error: When a required input is missing, the mode is not
                    implemented, or the backend produced no result.
            """
            # --- Debug dump of every input ---
            print("="*30)
            print(f"【Mode Selected】: {mode}")
            print(f"【Model Selected】: {model_name}")
            print(f"【Target Text】: {text}")
            print(f"【Language】: {lang}")
            print(f"【Speaker】: {speaker}")
            print(f"【Style Instruct】: {instruct}")
            print(f"【SPK1 Audio Path】: {s1_wav}")
            print(f"【SPK1 Text】: {s1_txt}")
            print(f"【SPK2 Audio Path】: {s2_wav}")
            print(f"【SPK2 Text】: {s2_txt}")
            print(f"【Seed】: {seed}")
            print("="*30)

            mode_key = (mode or "").strip().lower()

            if mode_key == "podcast":
                # Podcast needs a reference recording for both speakers.
                if not (s1_wav and s2_wav):
                    raise gr.Error("Podcast mode requires both SPK1 and SPK2 reference audio.")
                status = "Detected: Podcast Mode (Multi-Speaker)"
                result = generate_podcast(text, s1_wav, s1_txt, s2_wav, s2_txt, lang, seed, model_name)
            elif mode_key == "voice clone":
                # Voice clone needs speaker 1's reference recording.
                if not s1_wav:
                    raise gr.Error("Voice Clone mode requires SPK1 reference audio.")
                status = "Detected: Voice Clone Mode (Single Speaker)"
                result = generate_voice_clone(s1_wav, s1_txt, text, lang, model_name)
            elif mode_key == "style instruction":
                if not instruct:
                    raise gr.Error("Style Instruction mode requires a voice description.")
                status = "Detected: Voice Design Mode"
                result = generate_voice_design(text, lang, instruct, model_name)
            elif mode_key == "customvoice":
                status = f"Detected: Standard TTS Mode (Speaker: {speaker})"
                result = generate_tts_custom(text, lang, speaker, model_size, model_name)
            else:
                print(f"{mode} not supported yet!")
                raise gr.Error(f"{mode} is not supported yet!")

            # The generators return None for empty text, unsupported models
            # and failed remote calls; surface that instead of indexing None.
            if result is None:
                raise gr.Error(
                    f"{model_name} produced no audio in {mode} mode. Make sure the "
                    "target text is not empty and check the server log for details."
                )

            # NOTE(review): backends are expected to return (audio, status);
            # a bare string is treated as the audio path — confirm per Space.
            if isinstance(result, str):
                audio, detail = result, ""
            else:
                audio = result[0]
                detail = result[1] if len(result) > 1 else ""
            return audio, f"{status}\n{detail}"

        # --- Button wiring: clear the outputs first, then generate ---
        def clear_output():
            """Reset the outputs while generation is running."""
            return None, "⏳ Analysis in progress... deciding mode..."

        generate_btn.click(
            fn=clear_output,
            outputs=[shared_audio_out, shared_status],
        ).then(
            fn=unified_generation,
            inputs=[
                mode_selection,
                model_selection,
                shared_text, shared_language, shared_model_size, tts_speaker, design_instruct,
                spk1_ref_audio, spk1_ref_text, spk2_ref_audio, spk2_ref_text, seed_input,
            ],
            outputs=[shared_audio_out, shared_status],
        )

    return demo
# Start the service: build the interface and launch it when this file is run
# as a script.
if __name__ == "__main__":
    demo = build_ui()
    demo.launch()