"""All-in-one Gradio front-end that proxies several hosted TTS Spaces.

Every generation request is forwarded through ``gradio_client`` to the
original model's Hugging Face Space, so the heavy compute runs on the
remote servers rather than locally.
"""

import gradio as gr
from gradio_client import Client, handle_file
import warnings
import asyncio  # noqa: F401 — kept so the asyncio machinery is importable alongside the filter below

# Suppress noisy resource warnings emitted when async event loops are closed.
warnings.filterwarnings(
    "ignore", category=RuntimeWarning, message=".*Event loop is closed.*"
)

MODEL_SIZES = ["0.6B", "1.7B"]

# Speaker and language choices for the CustomVoice model.
SPEAKERS = [
    "Aiden", "Dylan", "Eric", "Ono_anna", "Ryan",
    "Serena", "Sohee", "Uncle_fu", "Vivian",
]
LANGUAGES = [
    "Auto", "Chinese", "English", "Japanese", "Korean",
    "French", "German", "Spanish", "Portuguese", "Russian",
]

# Default reference prompt assets (not wired into the UI yet).
S1_PROMPT_WAV = "assets/audios/female_mandarin.wav"
S2_PROMPT_WAV = "assets/audios/male_mandarin.wav"

# Remote Space identifiers.
# NOTE: connecting here delegates all compute to the servers hosting the
# original models (Hugging Face Spaces interconnection).
soulx_name = "SoulX-Podcast-1.7B"
qwen3tts_name = "Qwen3-TTS"
mosstts_name = "MOSS-TTS-Family"
mosstts_name1 = "MOSS-TTS"
mosstts_name2 = "MOSS-TTSD-v1.0"
mosstts_name3 = "MOSS-VoiceGenerator"
cosy_name = "Fun-CosyVoice3-0.5B"
ming_name = "ming-uniaudio-demo"

# One gradio_client.Client per backend Space, keyed by display name.
client_map = {
    soulx_name: Client(f"Soul-AILab/{soulx_name}"),
    qwen3tts_name: Client(f"Qwen/{qwen3tts_name}"),
    mosstts_name1: Client(f"OpenMOSS-Team/{mosstts_name1}"),
    mosstts_name2: Client(f"OpenMOSS-Team/{mosstts_name2}"),
    mosstts_name3: Client(f"OpenMOSS-Team/{mosstts_name3}"),
    cosy_name: Client(f"FunAudioLLM/{cosy_name}"),
    ming_name: Client(f"cafe3310/{ming_name}"),
}


def generate_tts_custom(text, lang, speaker, model_size, model_name):
    """Standard TTS with a predefined speaker (CustomVoice mode).

    Args:
        text: Target text to synthesize; ``None`` is returned when empty.
        lang: Language name from LANGUAGES (e.g. "Auto").
        speaker: Speaker name from SPEAKERS.
        model_size: Backend model size from MODEL_SIZES.
        model_name: Key into ``client_map`` selecting the backend Space.

    Returns:
        Whatever the remote endpoint returns (typically an audio file path,
        possibly paired with an info string), or ``None`` on error.
    """
    if not text:
        return None
    try:
        # BUGFIX: the original hard-coded language="Auto", speaker="Ryan" and
        # model_size="1.7B", silently ignoring the UI selections. Forward the
        # caller's values instead.
        result = client_map[model_name].predict(
            text=text,
            language=lang,
            speaker=speaker,
            instruct="",
            model_size=model_size,
            api_name="/generate_custom_voice",
        )
        print(result)
        return result
    except Exception as e:
        print(f"Error: {e}")
        return None


def generate_voice_design(text, lang, instruct, model_name):
    """Synthesize speech whose voice is described by a free-text instruction.

    Dispatches to the backend selected by ``model_name``; the ``lang``
    parameter is currently only honored implicitly (Qwen uses "Auto").

    Returns the remote endpoint's result (audio path or tuple), or ``None``
    on error or empty ``text``.
    """
    if not text:
        return None
    try:
        if model_name == qwen3tts_name:
            result = client_map[model_name].predict(
                text=text,
                language="Auto",
                voice_description=instruct,
                api_name="/generate_voice_design",
            )
        elif model_name == mosstts_name3:
            result = client_map[model_name].predict(
                text=text,
                instruction=instruct,
                temperature=1.5,
                top_p=0.6,
                top_k=50,
                repetition_penalty=1.1,
                max_new_tokens=4096,
                api_name="/run_inference",
            )
        elif model_name == ming_name:
            # BUGFIX: the original fell through with `result` unbound
            # (UnboundLocalError); fail explicitly instead.
            raise NotImplementedError(f"voice design not wired up for {model_name}")
        else:
            raise ValueError(f"unsupported model for voice design: {model_name}")
        print(result)
        return result
    except Exception as e:
        print(f"Error: {e}")
        return None


def generate_voice_clone(s1_wav, s1_txt, text, lang, model_name):
    """Zero-shot voice cloning from a single reference recording.

    Args:
        s1_wav: Path to the reference audio of the target speaker.
        s1_txt: Transcript of the reference audio.
        text: Target text to synthesize in the cloned voice.
        lang: Language name (currently forwarded as "Auto" to Qwen).
        model_name: Key into ``client_map``.

    Returns:
        An ``(audio, status)`` tuple or the backend's raw result; ``None``
        on error or empty ``text``.
    """
    if not text:
        return None
    try:
        # Use the module-level name constants so renames stay in one place.
        if model_name == qwen3tts_name:
            result = client_map[model_name].predict(
                ref_audio=handle_file(s1_wav),
                ref_text=s1_txt,
                target_text=text,
                language="Auto",
                use_xvector_only=False,
                model_size="1.7B",
                api_name="/generate_voice_clone",
            )
        elif model_name == mosstts_name1:
            result = client_map[model_name].predict(
                text=text,
                reference_audio=handle_file(s1_wav),
                mode_with_reference="Clone",
                duration_control_enabled=False,
                duration_tokens=1,
                temperature=1.7,
                top_p=0.8,
                top_k=25,
                repetition_penalty=1,
                max_new_tokens=4096,
                api_name="/run_inference",
            )
        elif model_name == cosy_name:
            result = client_map[model_name].predict(
                tts_text=text,
                mode_value="zero_shot",
                prompt_text=s1_txt,
                prompt_wav_upload=handle_file(s1_wav),
                prompt_wav_record=handle_file(s1_wav),
                instruct_text="You are a helpful assistant. 请非常开心地说一句话。<|endofprompt|>",
                seed=0,
                stream=False,
                ui_lang="Zh",
                api_name="/generate_audio",
            )
            # CosyVoice returns a bare path; normalize to (audio, status).
            if isinstance(result, str):
                result = (result, "voice clone successfully!")
        else:
            # BUGFIX: a bare `raise` with no active exception is a RuntimeError.
            raise ValueError(f"unsupported model for voice clone: {model_name}")
        print(result)
        return result
    except Exception as e:
        print(f"Error: {e}")
        return None


def generate_podcast(text, s1_wav, s1_txt, s2_wav, s2_txt, lang, seed, model_name):
    """Two-speaker dialogue synthesis ("podcast" mode).

    Args:
        text: Dialogue script using [S1]/[S2] speaker tags.
        s1_wav, s1_txt: Reference audio + transcript for speaker 1.
        s2_wav, s2_txt: Reference audio + transcript for speaker 2.
        lang: Language name (currently unused by the backends called here).
        seed: Random seed forwarded to the SoulX backend.
        model_name: Key into ``client_map``.

    Returns:
        An ``(audio, status)`` tuple, or ``None`` on error or empty ``text``.
    """
    if not text:
        return None
    try:
        if model_name == soulx_name:
            result = client_map[model_name].predict(
                target_text=text,
                spk1_prompt_text=s1_txt,
                spk1_prompt_audio=handle_file(s1_wav),
                spk1_dialect_prompt_text="",
                spk2_prompt_text=s2_txt,
                spk2_prompt_audio=handle_file(s2_wav),
                spk2_dialect_prompt_text="",
                # BUGFIX: the seed argument was ignored (hard-coded 1988).
                seed=int(seed),
                api_name="/dialogue_synthesis_function",
            )
        elif model_name == mosstts_name2:
            # The MOSS-TTSD endpoint exposes positional params only; the
            # param_* mapping below mirrors its published API signature.
            result = client_map[model_name].predict(
                speaker_count=2,
                param_1=handle_file(s1_wav),
                param_2=handle_file(s2_wav),
                param_3=handle_file(s1_wav),
                param_4=handle_file(s1_wav),
                param_5=handle_file(s1_wav),
                param_6=s1_txt,
                param_7=s2_txt,
                param_8="Hello",
                param_9="Hello!!",
                param_10="Hello!!",
                param_11=text,
                param_12=True,
                param_13=False,
                param_14=1.1,
                param_15=0.9,
                param_16=50,
                param_17=1.1,
                param_18=2000,
                api_name="/run_inference",
            )
        else:
            raise ValueError(f"unsupported model for podcast: {model_name}")
        print(result)
        if not isinstance(result, tuple):
            # Typo fixed ("geenration" -> "generation").
            result = (result, "podcast generation completed successfully!")
        return result
    except Exception as e:
        print(f"Error: {e}")
        return None


def build_ui():
    """Build and return the unified Gradio Blocks interface."""
    # Custom CSS: bolder, larger tab labels.
    # BUGFIX: '#' is not a CSS comment character — use /* ... */ so the rules
    # actually parse.
    custom_css = """
    /* Tab buttons: bold, slightly larger */
    .mode-tabs button, .model-tabs button {
        font-weight: 700 !important;
        font-size: 16px !important;
        color: #4b5563 !important; /* default gray */
    }
    /* Optional panel styling (disabled):
    .mode-tabs { background: #f0f2f5; border-radius: 10px; padding: 10px; margin-bottom: 10px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); }
    .model-tabs { background: #e6f7ff; border-radius: 10px; padding: 10px; margin-bottom: 20px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); }
    .column-panel { background: #ffffff; border: 1px solid #e5e7eb; border-radius: 8px; padding: 15px !important; }
    */
    """

    with gr.Blocks(title="All-In-One-TTS", css=custom_css) as demo:
        gr.Markdown("# 🎛️ TTS All-In-One Unified Panel")

        # --- State relays (Textboxes) ---
        # Visible for debugging; set visible=False once the flow is stable.
        mode_selection = gr.Textbox(value="voice clone", visible=True, label="Current Mode")
        model_selection = gr.Textbox(value=qwen3tts_name, visible=True, label="Current Model")

        # --- Row 1 tabs: functional mode ---
        with gr.Tabs(elem_classes="mode-tabs"):
            with gr.Tab("Voice Clone", id="voice clone") as tab_mode_2:
                pass
            with gr.Tab("CustomVoice", id="customvoice") as tab_mode_3:
                pass
            with gr.Tab("Podcast", id="podcast") as tab_mode_4:
                pass
            with gr.Tab("Style Instruction", id="style instruction") as tab_mode_1:
                pass
            with gr.Tab("TTA", id="tta") as tab_mode_5:
                pass
            # Typo fixed: id was "speed with bgm".
            with gr.Tab("Speech with BGM", id="speech with bgm") as tab_mode_6:
                pass

        # --- Row 2 tabs: model backend ---
        with gr.Tabs(elem_classes="model-tabs"):
            with gr.Tab(qwen3tts_name, id=qwen3tts_name) as tab_model_1:
                pass
            with gr.Tab(cosy_name, id=cosy_name) as tab_model_3:
                pass
            with gr.Tab(soulx_name, id=soulx_name) as tab_model_2:
                pass
            with gr.Tab(mosstts_name1, id=mosstts_name1) as tab_model_4:
                pass
            with gr.Tab(mosstts_name2, id=mosstts_name2) as tab_model_5:
                pass
            with gr.Tab(mosstts_name3, id=mosstts_name3) as tab_model_6:
                pass
            with gr.Tab("Ming-Omni-TTS", id=ming_name) as tab_model_7:
                pass

        gr.Markdown("---")
        with gr.Row():
            # --- Column 1: Style Instruction ---
            with gr.Column(scale=1, variant="panel", elem_classes="column-panel"):
                gr.Markdown("### 🎨 Style & Design")
                design_instruct = gr.Textbox(
                    label="Style Instruction / Voice Description",
                    lines=18,
                    placeholder="Describe voice style or custom voice...",
                    value="展现出悲苦沙哑的声音质感,语速偏慢,情绪浓烈且带有哭腔,以标准普通话缓慢诉说,情感强烈,语调哀怨高亢,音高起伏大。",
                )
            # --- Column 2: SPK1 material ---
            with gr.Column(scale=1, variant="panel", elem_classes="column-panel"):
                gr.Markdown("### 👤 SPK1 Clone")
                spk1_ref_audio = gr.Audio(label="SPK1 Audio", type="filepath")
                spk1_ref_text = gr.Textbox(label="SPK1 Text", lines=3)
            # --- Column 3: SPK2 material ---
            with gr.Column(scale=1, variant="panel", elem_classes="column-panel"):
                gr.Markdown("### 👥 SPK2 Clone")
                spk2_ref_audio = gr.Audio(label="SPK2 Audio", type="filepath")
                spk2_ref_text = gr.Textbox(label="SPK2 Text", lines=3)
            # --- Column 4: target text & config ---
            with gr.Column(scale=1, variant="panel", elem_classes="column-panel"):
                gr.Markdown("### 📝 Synthesis")
                shared_text = gr.Textbox(
                    label="Target Text / Script",
                    lines=5,
                    value="[S1] 哈喽,AI时代的冲浪先锋们!欢迎! [S2] 欢迎啊!",
                )
                with gr.Row():
                    shared_language = gr.Dropdown(label="Language", choices=LANGUAGES, value="Auto")
                    tts_speaker = gr.Dropdown(label="Speaker", choices=SPEAKERS, value="Ryan")
                with gr.Row():
                    shared_model_size = gr.Dropdown(label="Model Size", choices=MODEL_SIZES, value="1.7B")
                    seed_input = gr.Number(label="Seed", value=1988, precision=0)

        # --- Single generate button ---
        gr.Markdown("---")
        with gr.Row():
            generate_btn = gr.Button("🚀 Start Generating Audio", variant="primary", size="lg")

        # --- 1. Mode/model combination validation ---
        # Each backend supports only a subset of modes; disable the button
        # (with an explanatory label) for unsupported combinations.
        _UNSUPPORTED_MODES = {
            soulx_name: {"customvoice", "voice clone", "style instruction", "tta", "speech with bgm"},
            qwen3tts_name: {"podcast", "tta", "speech with bgm"},
            cosy_name: {"customvoice", "style instruction", "podcast", "tta", "speech with bgm"},
            mosstts_name1: {"customvoice", "style instruction", "podcast", "tta", "speech with bgm"},
            mosstts_name2: {"customvoice", "voice clone", "style instruction", "tta", "speech with bgm"},
            mosstts_name3: {"customvoice", "voice clone", "podcast", "tta", "speech with bgm"},
            ming_name: {"customvoice", "podcast"},
        }

        def validate_combination(mode, model):
            if model not in _UNSUPPORTED_MODES:
                raise ValueError(f"unknown model: {model}")
            if mode in _UNSUPPORTED_MODES[model]:
                # Disabled state with a warning label.
                return gr.update(interactive=False, value=f"🚫 {model} 不支持 {mode} 模式")
            # Normal state with the original label.
            return gr.update(interactive=True, value="🚀 Start Generating Audio")

        # --- 2. Wire validation to the state relays ---
        mode_selection.change(
            fn=validate_combination,
            inputs=[mode_selection, model_selection],
            outputs=generate_btn,
        )
        model_selection.change(
            fn=validate_combination,
            inputs=[mode_selection, model_selection],
            outputs=generate_btn,
        )

        # --- Tab selection -> state relay bindings ---
        # BUGFIX: the TTA / BGM lambdas emitted "TTA" / "Speech with BGM",
        # which never matched the lowercase keys used by validation and
        # dispatch; emit the canonical lowercase mode strings.
        tab_mode_1.select(fn=lambda: "style instruction", outputs=mode_selection)
        tab_mode_2.select(fn=lambda: "voice clone", outputs=mode_selection)
        tab_mode_3.select(fn=lambda: "customvoice", outputs=mode_selection)
        tab_mode_4.select(fn=lambda: "podcast", outputs=mode_selection)
        tab_mode_5.select(fn=lambda: "tta", outputs=mode_selection)
        tab_mode_6.select(fn=lambda: "speech with bgm", outputs=mode_selection)

        tab_model_1.select(fn=lambda: qwen3tts_name, outputs=model_selection)
        tab_model_2.select(fn=lambda: soulx_name, outputs=model_selection)
        tab_model_3.select(fn=lambda: cosy_name, outputs=model_selection)
        tab_model_4.select(fn=lambda: mosstts_name1, outputs=model_selection)
        tab_model_5.select(fn=lambda: mosstts_name2, outputs=model_selection)
        tab_model_6.select(fn=lambda: mosstts_name3, outputs=model_selection)
        tab_model_7.select(fn=lambda: ming_name, outputs=model_selection)

        # --- Global output ---
        with gr.Row():
            with gr.Column():
                shared_audio_out = gr.Audio(label="Final Generated Audio", type="numpy")
                shared_status = gr.Textbox(label="System Status", lines=2, interactive=False)

        # --- Unified dispatch ---
        def _unpack(result, status):
            """Normalize a backend result to an (audio, status_text) pair."""
            if result is None:
                return None, f"{status}\n❌ Generation failed — see server log."
            if isinstance(result, (tuple, list)) and len(result) >= 2:
                return result[0], f"{status}\n{result[1]}"
            # Bare audio path/string: pass it through whole (the original
            # indexed [0], which for a string returned its first character).
            return result, status

        def unified_generation(
            mode, model_name, text, lang, model_size, speaker,
            instruct, s1_wav, s1_txt, s2_wav, s2_txt, seed,
        ):
            # --- Debug dump of every incoming field ---
            print("=" * 30)
            print(f"【Mode Selected】: {mode}")
            print(f"【Model Selected】: {model_name}")
            print(f"【Target Text】: {text}")
            print(f"【Language】: {lang}")
            print(f"【Speaker】: {speaker}")
            print(f"【Style Instruct】: {instruct}")
            print(f"【SPK1 Audio Path】: {s1_wav}")
            print(f"【SPK1 Text】: {s1_txt}")
            print(f"【SPK2 Audio Path】: {s2_wav}")
            print(f"【SPK2 Text】: {s2_txt}")
            print(f"【Seed】: {seed}")
            print("=" * 30)

            # Input validation uses explicit returns instead of `assert`
            # (asserts are stripped under `python -O`).
            if mode == "podcast":
                if not (s1_wav and s2_wav):
                    return None, "❌ Podcast mode needs reference audio for both speakers."
                status = "Detected: Podcast Mode (Multi-Speaker)"
                result = generate_podcast(text, s1_wav, s1_txt, s2_wav, s2_txt, lang, seed, model_name)
                return _unpack(result, status)
            elif mode == "voice clone":
                if not s1_wav:
                    return None, "❌ Voice clone mode needs SPK1 reference audio."
                status = "Detected: Voice Clone Mode (Single Speaker)"
                result = generate_voice_clone(s1_wav, s1_txt, text, lang, model_name)
                return _unpack(result, status)
            elif mode == "style instruction":
                if not instruct:
                    return None, "❌ Style instruction mode needs a voice description."
                status = "Detected: Voice Design Mode"
                result = generate_voice_design(text, lang, instruct, model_name)
                return _unpack(result, status)
            elif mode == "customvoice":
                status = f"Detected: Standard TTS Mode (Speaker: {speaker})"
                result = generate_tts_custom(text, lang, speaker, model_size, model_name)
                return _unpack(result, status)
            else:
                # Typo fixed ("sopported"); raise a real exception instead of
                # a bare `raise`.
                print(f"{mode} not supported yet!")
                raise ValueError(f"mode not supported yet: {mode}")

        # --- Button wiring: clear outputs, then generate ---
        def clear_output():
            return None, "⏳ Analysis in progress... deciding mode..."

        generate_btn.click(
            fn=clear_output,
            outputs=[shared_audio_out, shared_status],
        ).then(
            fn=unified_generation,
            inputs=[
                mode_selection, model_selection, shared_text, shared_language,
                shared_model_size, tts_speaker, design_instruct,
                spk1_ref_audio, spk1_ref_text, spk2_ref_audio, spk2_ref_text,
                seed_input,
            ],
            outputs=[shared_audio_out, shared_status],
        )

    return demo


# Launch the service.
if __name__ == "__main__":
    demo = build_ui()
    demo.launch()