# NOTE: the three lines below were Hugging Face Space page residue
# ("Spaces: / Sleeping / Sleeping") accidentally pasted into the source;
# kept here as a comment so the file stays valid Python.
import gradio as gr
from gradio_client import Client, handle_file
import warnings
import asyncio

# Suppress resource warnings emitted when the async event loop is closed.
warnings.filterwarnings("ignore", category=RuntimeWarning, message=".*Event loop is closed.*")

MODEL_SIZES = ["0.6B", "1.7B"]

# Speaker and language choices for the CustomVoice model.
SPEAKERS = [
    "Aiden", "Dylan", "Eric", "Ono_anna", "Ryan", "Serena", "Sohee", "Uncle_fu", "Vivian"
]
LANGUAGES = ["Auto", "Chinese", "English", "Japanese", "Korean", "French", "German", "Spanish", "Portuguese", "Russian"]

S1_PROMPT_WAV = "assets/audios/female_mandarin.wav"
S2_PROMPT_WAV = "assets/audios/male_mandarin.wav"

# Backend model display names (also used as Hugging Face Space names).
soulx_name = "SoulX-Podcast-1.7B"
qwen3tts_name = "Qwen3-TTS"
mosstts_name = "MOSS-TTS-Family"
mosstts_name1 = "MOSS-TTS"
mosstts_name2 = "MOSS-TTSD-v1.0"
mosstts_name3 = "MOSS-VoiceGenerator"
cosy_name = "Fun-CosyVoice3-0.5B"
ming_name = "ming-uniaudio-demo"

# 1. Connect to the upstream TTS Spaces (Hugging Face Space-to-Space calls).
# Note: this offloads the compute to the servers hosting the original models.
_SPACE_REPOS = {
    soulx_name: f"Soul-AILab/{soulx_name}",
    qwen3tts_name: f"Qwen/{qwen3tts_name}",
    mosstts_name1: f"OpenMOSS-Team/{mosstts_name1}",
    mosstts_name2: f"OpenMOSS-Team/{mosstts_name2}",
    mosstts_name3: f"OpenMOSS-Team/{mosstts_name3}",
    cosy_name: f"FunAudioLLM/{cosy_name}",
    ming_name: f"cafe3310/{ming_name}",
}
# model display name -> connected gradio_client.Client for that Space.
client_map = {name: Client(repo) for name, repo in _SPACE_REPOS.items()}
def generate_tts_custom(text, lang, speaker, model_size, model_name):
    """Synthesize `text` with a preset speaker via the remote CustomVoice API.

    Args:
        text: target text to speak; empty/None short-circuits to None.
        lang: language choice (one of LANGUAGES, e.g. "Auto").
        speaker: preset speaker name (one of SPEAKERS).
        model_size: backend model size (one of MODEL_SIZES).
        model_name: key into client_map selecting the backend Space.

    Returns:
        The remote predict() result (typically an audio file path or an
        (audio, message) tuple — depends on the Space), or None on error.
    """
    if not text:
        return None
    try:
        # 2. Call the upstream Space API. Forward the user's selections;
        # the original hard-coded "Auto"/"Ryan"/"1.7B" and ignored the
        # lang/speaker/model_size parameters entirely.
        result = client_map[model_name].predict(
            text=text,
            language=lang,
            speaker=speaker,
            instruct="",
            model_size=model_size,
            api_name="/generate_custom_voice"
        )
        print(result)
        return result
    except Exception as e:
        # Best-effort: log and return None so the UI shows a soft failure.
        print(f"Error: {e}")
        return None
def generate_voice_design(text, lang, instruct, model_name):
    """Synthesize `text` in a voice described by a free-text instruction.

    Args:
        text: target text to speak; empty/None short-circuits to None.
        lang: language choice (one of LANGUAGES).
        instruct: natural-language description of the desired voice/style.
        model_name: key into client_map selecting the backend Space.

    Returns:
        The remote predict() result (audio path or (audio, message) tuple,
        Space-dependent), or None on unsupported model or error.
    """
    if not text:
        return None
    try:
        # 2. Call the model-specific upstream API.
        if model_name == qwen3tts_name:
            result = client_map[model_name].predict(
                text=text,
                language=lang,  # forward the UI choice (was hard-coded "Auto")
                voice_description=instruct,
                api_name="/generate_voice_design"
            )
        elif model_name == mosstts_name3:
            result = client_map[model_name].predict(
                text=text,
                instruction=instruct,
                temperature=1.5,
                top_p=0.6,
                top_k=50,
                repetition_penalty=1.1,
                max_new_tokens=4096,
                api_name="/run_inference"
            )
        else:
            # ming_name (and any other backend) is not wired up yet; the
            # original `pass` here left `result` unbound and crashed below.
            raise ValueError(f"voice design not supported for model: {model_name}")
        print(result)
        return result
    except Exception as e:
        print(f"Error: {e}")
        return None
def generate_voice_clone(s1_wav, s1_txt, text, lang, model_name):
    """Clone the voice in `s1_wav` (transcript `s1_txt`) and speak `text`.

    Args:
        s1_wav: path to the reference audio of the speaker to clone.
        s1_txt: transcript of the reference audio.
        text: target text to speak; empty/None short-circuits to None.
        lang: language choice (one of LANGUAGES).
        model_name: key into client_map selecting the backend Space.

    Returns:
        The remote result, normalized to (audio, message) for CosyVoice,
        or None on unsupported model / error.
    """
    if not text:
        return None
    try:
        # 2. Call the model-specific upstream API. Model names use the
        # module constants (the original compared against string literals).
        if model_name == qwen3tts_name:
            result = client_map[model_name].predict(
                ref_audio=handle_file(s1_wav),
                ref_text=s1_txt,
                target_text=text,
                language=lang,  # forward the UI choice (was hard-coded "Auto")
                use_xvector_only=False,
                model_size="1.7B",
                api_name="/generate_voice_clone"
            )
        elif model_name == mosstts_name1:
            result = client_map[model_name].predict(
                text=text,
                reference_audio=handle_file(s1_wav),
                mode_with_reference="Clone",
                duration_control_enabled=False,
                duration_tokens=1,
                temperature=1.7,
                top_p=0.8,
                top_k=25,
                repetition_penalty=1,
                max_new_tokens=4096,
                api_name="/run_inference"
            )
        elif model_name == cosy_name:
            result = client_map[model_name].predict(
                tts_text=text,
                mode_value="zero_shot",
                prompt_text=s1_txt,
                prompt_wav_upload=handle_file(s1_wav),
                prompt_wav_record=handle_file(s1_wav),
                instruct_text="You are a helpful assistant. 请非常开心地说一句话。<|endofprompt|>",
                seed=0,
                stream=False,
                ui_lang="Zh",
                api_name="/generate_audio"
            )
            if isinstance(result, str):
                # CosyVoice returns a bare file path; normalize to the
                # (audio, message) shape the other backends produce.
                result = (result, "voice clone successfully!")
            else:
                # Original used a bare `raise` (no active exception);
                # raise a meaningful error instead.
                raise ValueError(f"unexpected CosyVoice result: {result!r}")
        else:
            raise ValueError(f"voice clone not supported for model: {model_name}")
        print(result)
        return result
    except Exception as e:
        print(f"Error: {e}")
        return None
def generate_podcast(text, s1_wav, s1_txt, s2_wav, s2_txt, lang, seed, model_name):
    """Synthesize a two-speaker podcast from a [S1]/[S2]-tagged script.

    Args:
        text: dialogue script with speaker tags (e.g. "[S1] ... [S2] ...").
        s1_wav / s1_txt: speaker-1 reference audio path and its transcript.
        s2_wav / s2_txt: speaker-2 reference audio path and its transcript.
        lang: language choice (currently unused by both backends' APIs).
        seed: random seed for synthesis (SoulX backend only).
        model_name: key into client_map selecting the backend Space.

    Returns:
        An (audio, status_message) tuple, or None on unsupported model / error.
    """
    if not text:
        return None
    try:
        # 2. Call the model-specific upstream API.
        if model_name == soulx_name:
            result = client_map[model_name].predict(
                target_text=text,
                spk1_prompt_text=s1_txt,
                spk1_prompt_audio=handle_file(s1_wav),
                spk1_dialect_prompt_text="",
                spk2_prompt_text=s2_txt,
                spk2_prompt_audio=handle_file(s2_wav),
                spk2_dialect_prompt_text="",
                seed=int(seed),  # forward the UI seed (was hard-coded 1988)
                api_name="/dialogue_synthesis_function"
            )
        elif model_name == mosstts_name2:
            # MOSS-TTSD exposes positional params only; param_3..param_5
            # reuse s1_wav as placeholders for the unused speaker slots.
            result = client_map[model_name].predict(
                speaker_count=2,
                param_1=handle_file(s1_wav),
                param_2=handle_file(s2_wav),
                param_3=handle_file(s1_wav),
                param_4=handle_file(s1_wav),
                param_5=handle_file(s1_wav),
                param_6=s1_txt,
                param_7=s2_txt,
                param_8="Hello",
                param_9="Hello!!",
                param_10="Hello!!",
                param_11=text,
                param_12=True,
                param_13=False,
                param_14=1.1,
                param_15=0.9,
                param_16=50,
                param_17=1.1,
                param_18=2000,
                api_name="/run_inference"
            )
        else:
            # Original used a bare `raise` (no active exception).
            raise ValueError(f"podcast not supported for model: {model_name}")
        print(result)
        if not isinstance(result, tuple):
            # Typo fixed: was "podcast geenration completed successfully!".
            result = (result, "podcast generation completed successfully!")
        return result
    except Exception as e:
        print(f"Error: {e}")
        return None
def build_ui():
    """Build and return the unified multi-mode / multi-model TTS Gradio app."""
    # 3. Custom CSS for the two tab rows. The original embedded Python-style
    # '#' comments inside the CSS string, which is invalid CSS — replaced
    # with /* */ comments.
    custom_css = """
    /* Base tab style: bold, slightly larger labels */
    .mode-tabs button, .model-tabs button {
        font-weight: 700 !important; /* force bold */
        font-size: 16px !important;  /* slightly larger font */
        color: #4b5563 !important;   /* default grey */
    }
    """
    with gr.Blocks(title="All-In-One-TTS", css=custom_css) as demo:
        gr.Markdown("# 🎛️ TTS All-In-One Unified Panel")

        # --- State relay textboxes ---
        # Visible for debugging; set visible=False once the flow is stable.
        mode_selection = gr.Textbox(value="voice clone", visible=True, label="Current Mode")
        model_selection = gr.Textbox(value=qwen3tts_name, visible=True, label="Current Model")

        # --- Tab row 1: generation mode ---
        # Tab ids double as the canonical lowercase mode keys used by
        # validate_combination() and unified_generation() below.
        with gr.Tabs(elem_classes="mode-tabs") as mode_tabs:
            with gr.Tab("Voice Clone", id="voice clone") as tab_mode_2:
                pass
            with gr.Tab("CustomVoice", id="customvoice") as tab_mode_3:
                pass
            with gr.Tab("Podcast", id="podcast") as tab_mode_4:
                pass
            with gr.Tab("Style Instruction", id="style instruction") as tab_mode_1:
                pass
            with gr.Tab("TTA", id="tta") as tab_mode_5:
                pass
            # id fixed: was "speed with bgm", which never matched the
            # "speech with bgm" key used by the validation rules.
            with gr.Tab("Speech with BGM", id="speech with bgm") as tab_mode_6:
                pass

        # --- Tab row 2: backend model ---
        with gr.Tabs(elem_classes="model-tabs") as model_tabs:
            with gr.Tab(qwen3tts_name, id=qwen3tts_name) as tab_model_1:
                pass
            with gr.Tab(cosy_name, id=cosy_name) as tab_model_3:
                pass
            with gr.Tab(soulx_name, id=soulx_name) as tab_model_2:
                pass
            with gr.Tab(mosstts_name1, id=mosstts_name1) as tab_model_4:
                pass
            with gr.Tab(mosstts_name2, id=mosstts_name2) as tab_model_5:
                pass
            with gr.Tab(mosstts_name3, id=mosstts_name3) as tab_model_6:
                pass
            with gr.Tab("Ming-Omni-TTS", id=ming_name) as tab_model_7:
                pass

        gr.Markdown("---")
        with gr.Row():
            # --- Column 1: Style Instruction ---
            with gr.Column(scale=1, variant="panel", elem_classes="column-panel"):
                gr.Markdown("### 🎨 Style & Design")
                design_instruct = gr.Textbox(
                    label="Style Instruction / Voice Description",
                    lines=18,
                    placeholder="Describe voice style or custom voice...",
                    value="展现出悲苦沙哑的声音质感,语速偏慢,情绪浓烈且带有哭腔,以标准普通话缓慢诉说,情感强烈,语调哀怨高亢,音高起伏大。"
                )
            # --- Column 2: SPK1 material ---
            with gr.Column(scale=1, variant="panel", elem_classes="column-panel"):
                gr.Markdown("### 👤 SPK1 Clone")
                spk1_ref_audio = gr.Audio(label="SPK1 Audio", type="filepath")
                spk1_ref_text = gr.Textbox(label="SPK1 Text", lines=3)
            # --- Column 3: SPK2 material ---
            with gr.Column(scale=1, variant="panel", elem_classes="column-panel"):
                gr.Markdown("### 👥 SPK2 Clone")
                spk2_ref_audio = gr.Audio(label="SPK2 Audio", type="filepath")
                spk2_ref_text = gr.Textbox(label="SPK2 Text", lines=3)
            # --- Column 4: target text & config ---
            with gr.Column(scale=1, variant="panel", elem_classes="column-panel"):
                gr.Markdown("### 📝 Synthesis")
                shared_text = gr.Textbox(
                    label="Target Text / Script",
                    lines=5,
                    value="[S1] 哈喽,AI时代的冲浪先锋们!欢迎! [S2] 欢迎啊!"
                )
                with gr.Row():
                    shared_language = gr.Dropdown(label="Language", choices=LANGUAGES, value="Auto")
                    tts_speaker = gr.Dropdown(label="Speaker", choices=SPEAKERS, value="Ryan")
                with gr.Row():
                    shared_model_size = gr.Dropdown(label="Model Size", choices=MODEL_SIZES, value="1.7B")
                    seed_input = gr.Number(label="Seed", value=1988, precision=0)

        # --- Single generate button spanning the full width ---
        gr.Markdown("---")
        with gr.Row():
            generate_btn = gr.Button("🚀 Start Generating Audio", variant="primary", size="lg")

        # --- 1. Mode/model compatibility rules ---
        # For each backend, the set of modes it does NOT support.
        _unsupported_modes = {
            soulx_name: {"customvoice", "voice clone", "style instruction", "tta", "speech with bgm"},
            qwen3tts_name: {"podcast", "tta", "speech with bgm"},
            cosy_name: {"customvoice", "style instruction", "podcast", "tta", "speech with bgm"},
            mosstts_name1: {"customvoice", "style instruction", "podcast", "tta", "speech with bgm"},
            mosstts_name2: {"customvoice", "voice clone", "style instruction", "tta", "speech with bgm"},
            mosstts_name3: {"customvoice", "voice clone", "podcast", "tta", "speech with bgm"},
            ming_name: {"customvoice", "podcast"},
        }

        def validate_combination(mode, model):
            """Enable/disable the generate button for the (mode, model) pair."""
            blocked = _unsupported_modes.get(model)
            # Unknown models support nothing (the original hit a bare `raise`).
            if blocked is None or mode in blocked:
                return gr.update(interactive=False, value=f"🚫 {model} 不支持 {mode} 模式")
            return gr.update(interactive=True, value="🚀 Start Generating Audio")

        # --- 2. Re-validate whenever either relay textbox changes ---
        mode_selection.change(
            fn=validate_combination,
            inputs=[mode_selection, model_selection],
            outputs=generate_btn
        )
        model_selection.change(
            fn=validate_combination,
            inputs=[mode_selection, model_selection],
            outputs=generate_btn
        )

        # --- Tab clicks push their canonical key into the relay textboxes ---
        tab_mode_1.select(fn=lambda: "style instruction", outputs=mode_selection)
        tab_mode_2.select(fn=lambda: "voice clone", outputs=mode_selection)
        tab_mode_3.select(fn=lambda: "customvoice", outputs=mode_selection)
        tab_mode_4.select(fn=lambda: "podcast", outputs=mode_selection)
        # Fixed: these emitted "TTA" / "Speech with BGM", which never matched
        # the lowercase keys checked in validate_combination().
        tab_mode_5.select(fn=lambda: "tta", outputs=mode_selection)
        tab_mode_6.select(fn=lambda: "speech with bgm", outputs=mode_selection)
        tab_model_1.select(fn=lambda: qwen3tts_name, outputs=model_selection)
        tab_model_2.select(fn=lambda: soulx_name, outputs=model_selection)
        tab_model_3.select(fn=lambda: cosy_name, outputs=model_selection)
        tab_model_4.select(fn=lambda: mosstts_name1, outputs=model_selection)
        tab_model_5.select(fn=lambda: mosstts_name2, outputs=model_selection)
        tab_model_6.select(fn=lambda: mosstts_name3, outputs=model_selection)
        tab_model_7.select(fn=lambda: ming_name, outputs=model_selection)

        # --- Shared outputs ---
        with gr.Row():
            with gr.Column():
                # NOTE(review): type="numpy" while the backends return file
                # paths — gradio usually coerces, but confirm with a real run.
                shared_audio_out = gr.Audio(label="Final Generated Audio", type="numpy")
                shared_status = gr.Textbox(label="System Status", lines=2, interactive=False)

        def _pack(result, status):
            """Normalize a backend helper's result to (audio, status_text).

            Helpers may return None (failure), a bare audio path, or an
            (audio, message) tuple; the original indexed result[0]/result[1]
            unconditionally and crashed on the first two shapes.
            """
            if result is None:
                return None, f"{status}\n❌ generation failed — see server logs"
            if isinstance(result, (tuple, list)):
                message = result[1] if len(result) > 1 else "done"
                return result[0], f"{status}\n{message}"
            return result, f"{status}\ndone"

        # --- Unified dispatch: route one click to the right backend helper ---
        def unified_generation(
            mode, model_name,
            text, lang, model_size, speaker, instruct,
            s1_wav, s1_txt, s2_wav, s2_txt, seed
        ):
            # Debug dump of everything the UI handed us.
            print("=" * 30)
            print(f"【Mode Selected】: {mode}")
            print(f"【Model Selected】: {model_name}")
            print(f"【Target Text】: {text}")
            print(f"【Language】: {lang}")
            print(f"【Speaker】: {speaker}")
            print(f"【Style Instruct】: {instruct}")
            print(f"【SPK1 Audio Path】: {s1_wav}")
            print(f"【SPK1 Text】: {s1_txt}")
            print(f"【SPK2 Audio Path】: {s2_wav}")
            print(f"【SPK2 Text】: {s2_txt}")
            print(f"【Seed】: {seed}")
            print("=" * 30)
            # Input validation returns soft errors instead of `assert`
            # (asserts are stripped under -O and crash the callback).
            if mode == "podcast":
                if not (s1_wav and s2_wav):
                    return None, "🚫 Podcast mode needs reference audio for both SPK1 and SPK2."
                status = "Detected: Podcast Mode (Multi-Speaker)"
                return _pack(generate_podcast(text, s1_wav, s1_txt, s2_wav, s2_txt, lang, seed, model_name), status)
            elif mode == "voice clone":
                if not s1_wav:
                    return None, "🚫 Voice clone mode needs SPK1 reference audio."
                status = "Detected: Voice Clone Mode (Single Speaker)"
                return _pack(generate_voice_clone(s1_wav, s1_txt, text, lang, model_name), status)
            elif mode == "style instruction":
                if not instruct:
                    return None, "🚫 Style instruction mode needs a non-empty instruction."
                status = "Detected: Voice Design Mode"
                return _pack(generate_voice_design(text, lang, instruct, model_name), status)
            elif mode == "customvoice":
                status = f"Detected: Standard TTS Mode (Speaker: {speaker})"
                return _pack(generate_tts_custom(text, lang, speaker, model_size, model_name), status)
            else:
                # "tta" / "speech with bgm" have no backend wired up yet.
                print(f"{mode} not supported yet!")
                return None, f"🚫 Mode '{mode}' is not supported yet."

        # --- Button binding: clear outputs, then generate ---
        def clear_output():
            return None, "⏳ Analysis in progress... deciding mode..."

        generate_btn.click(
            fn=clear_output,
            outputs=[shared_audio_out, shared_status]
        ).then(
            fn=unified_generation,
            inputs=[
                mode_selection,
                model_selection,
                shared_text, shared_language, shared_model_size, tts_speaker, design_instruct,
                spk1_ref_audio, spk1_ref_text, spk2_ref_audio, spk2_ref_text, seed_input
            ],
            outputs=[shared_audio_out, shared_status]
        )
    return demo
# Launch the Gradio server when executed as a script.
if __name__ == "__main__":
    demo = build_ui()
    demo.launch()