# Hugging Face Space: KaixuanGuo/All-In-One-TTS (status: Running)
# File size: 21,752 bytes

import gradio as gr
from gradio_client import Client, handle_file
import warnings
import asyncio

# Silence the RuntimeWarning emitted when the asyncio event loop is already closed.
warnings.filterwarnings(
    "ignore",
    message=".*Event loop is closed.*",
    category=RuntimeWarning,
)

# Choices for the "Model Size" dropdown.
MODEL_SIZES = [
    "0.6B",
    "1.7B",
]

# Speaker and language choices for the CustomVoice model.
SPEAKERS = [
    "Aiden",
    "Dylan",
    "Eric",
    "Ono_anna",
    "Ryan",
    "Serena",
    "Sohee",
    "Uncle_fu",
    "Vivian",
]
LANGUAGES = [
    "Auto",
    "Chinese",
    "English",
    "Japanese",
    "Korean",
    "French",
    "German",
    "Spanish",
    "Portuguese",
    "Russian",
]

# Paths to prompt recordings; judging by the file names these are a female and
# a male Mandarin sample. They are not referenced by the visible code.
S1_PROMPT_WAV = "assets/audios/female_mandarin.wav"
S2_PROMPT_WAV = "assets/audios/male_mandarin.wav"

# 1. Connect to the upstream TTS models (Hugging Face Space-to-Space calls).
# NOTE: this hands the compute load to the servers hosting the original models.
soulx_name = "SoulX-Podcast-1.7B"
qwen3tts_name = "Qwen3-TTS"
mosstts_name = "MOSS-TTS-Family"
mosstts_name1 = "MOSS-TTS"
mosstts_name2 = "MOSS-TTSD-v1.0"
mosstts_name3 = "MOSS-VoiceGenerator"
cosy_name = "Fun-CosyVoice3-0.5B"
ming_name = "ming-uniaudio-demo"

# One client per Space, keyed by the Space's short name. The (owner, name)
# pairs are listed in the order the clients are created.
client_map = {
    space: Client(f"{owner}/{space}")
    for owner, space in (
        ("Soul-AILab", soulx_name),
        ("Qwen", qwen3tts_name),
        ("OpenMOSS-Team", mosstts_name1),
        ("OpenMOSS-Team", mosstts_name2),
        ("OpenMOSS-Team", mosstts_name3),
        ("FunAudioLLM", cosy_name),
        ("cafe3310", ming_name),
    )
}

def generate_tts_custom(text, lang, speaker, model_size, model_name):
    """Synthesize ``text`` with a preset speaker via ``/generate_custom_voice``.

    Args:
        text: Text to synthesize. An empty value short-circuits to ``None``.
        lang: Language option forwarded as ``language``; "Auto" when empty.
        speaker: Preset speaker name; "Ryan" when empty.
        model_size: Model size option; "1.7B" when empty.
        model_name: Key into ``client_map`` selecting the remote client.

    Returns:
        Whatever the remote ``predict`` call returns, or ``None`` when ``text``
        is empty or the call fails (the error is printed).
    """
    if not text:
        return None
    try:
        # Forward the caller's selections; the fallbacks match the values that
        # used to be hard-coded here.
        result = client_map[model_name].predict(
            text=text,
            language=lang or "Auto",
            speaker=speaker or "Ryan",
            instruct="",
            model_size=model_size or "1.7B",
            api_name="/generate_custom_voice",
        )
        print(result)
        return result
    except Exception as e:
        # Boundary with a remote service: report the failure and return None.
        print(f"Error: {e}")
        return None

def generate_voice_design(text, lang, instruct, model_name):
    """Synthesize ``text`` in a voice described by a free-form instruction.

    Supported backends are the Qwen3-TTS client (``/generate_voice_design``)
    and the MOSS-VoiceGenerator client (``/run_inference``). Any other model
    is reported as not implemented.

    Args:
        text: Text to synthesize. An empty value short-circuits to ``None``.
        lang: Language option forwarded to the Qwen backend; "Auto" when empty.
            The MOSS call takes no language argument.
        instruct: Voice / style description.
        model_name: Key into ``client_map`` selecting the remote client.

    Returns:
        Whatever the remote ``predict`` call returns, or ``None`` when ``text``
        is empty, the model is unsupported, or the call fails (error printed).
    """
    if not text:
        return None
    try:
        if model_name == qwen3tts_name:
            result = client_map[model_name].predict(
                text=text,
                language=lang or "Auto",
                voice_description=instruct,
                api_name="/generate_voice_design",
            )
        elif model_name == mosstts_name3:
            result = client_map[model_name].predict(
                text=text,
                instruction=instruct,
                temperature=1.5,
                top_p=0.6,
                top_k=50,
                repetition_penalty=1.1,
                max_new_tokens=4096,
                api_name="/run_inference",
            )
        else:
            # Covers the Ming backend (previously a bare `pass`) and unknown
            # names, which used to fail later on an unbound `result`.
            raise NotImplementedError(
                f"voice design is not implemented for model {model_name!r}"
            )
        print(result)
        return result
    except Exception as e:
        # Boundary with a remote service: report the failure and return None.
        print(f"Error: {e}")
        return None

def generate_voice_clone(s1_wav, s1_txt, text, lang, model_name):
    """Synthesize ``text`` in the voice of a reference recording.

    Supported backends: "Qwen3-TTS" (``/generate_voice_clone``), "MOSS-TTS"
    (``/run_inference``) and the CosyVoice client (``/generate_audio``).

    Args:
        s1_wav: Path to the reference audio file.
        s1_txt: Transcript of the reference audio (not sent to MOSS-TTS).
        text: Text to synthesize. An empty value short-circuits to ``None``.
        lang: Language option forwarded to the Qwen backend; "Auto" when
            empty. The other backends take no language argument here.
        model_name: Key into ``client_map`` selecting the remote client.

    Returns:
        Whatever the remote ``predict`` call returns (a bare string from
        CosyVoice is wrapped into ``(path, message)``), or ``None`` when
        ``text`` is empty, the model is unsupported, or the call fails.
    """
    if not text:
        return None
    try:
        if model_name == 'Qwen3-TTS':
            result = client_map[model_name].predict(
                ref_audio=handle_file(s1_wav),
                ref_text=s1_txt,
                target_text=text,
                language=lang or "Auto",
                use_xvector_only=False,
                # This function has no model-size parameter, so the size stays fixed.
                model_size="1.7B",
                api_name="/generate_voice_clone",
            )
        elif model_name == 'MOSS-TTS':
            result = client_map[model_name].predict(
                text=text,
                reference_audio=handle_file(s1_wav),
                mode_with_reference="Clone",
                duration_control_enabled=False,
                duration_tokens=1,
                temperature=1.7,
                top_p=0.8,
                top_k=25,
                repetition_penalty=1,
                max_new_tokens=4096,
                api_name="/run_inference",
            )
        elif model_name == cosy_name:
            result = client_map[model_name].predict(
                tts_text=text,
                mode_value="zero_shot",
                prompt_text=s1_txt,
                # The same reference file is sent for both the upload and the record slot.
                prompt_wav_upload=handle_file(s1_wav),
                prompt_wav_record=handle_file(s1_wav),
                instruct_text="You are a helpful assistant. 请非常开心地说一句话。<|endofprompt|>",
                seed=0,
                stream=False,
                ui_lang="Zh",
                api_name="/generate_audio",
            )
            # Give the caller the same (audio, message) shape as the other backends.
            if isinstance(result, str):
                result = (result, "voice clone successfully!")
        else:
            # A bare `raise` here has no active exception and only produces
            # "No active exception to re-raise"; name the real problem instead.
            raise ValueError(
                f"voice clone is not supported for model {model_name!r}"
            )
        print(result)
        return result
    except Exception as e:
        # Boundary with a remote service: report the failure and return None.
        print(f"Error: {e}")
        return None

def generate_podcast(text, s1_wav, s1_txt, s2_wav, s2_txt, lang, seed, model_name):
    """Synthesize a two-speaker dialogue from reference recordings.

    Supported backends are the SoulX-Podcast client
    (``/dialogue_synthesis_function``) and the MOSS-TTSD client
    (``/run_inference``).

    Args:
        text: Dialogue script. An empty value short-circuits to ``None``.
        s1_wav: Path to speaker 1's reference audio.
        s1_txt: Transcript of speaker 1's reference audio.
        s2_wav: Path to speaker 2's reference audio.
        s2_txt: Transcript of speaker 2's reference audio.
        lang: Accepted for signature parity; neither backend call takes it.
        seed: Random seed forwarded to the SoulX backend; 1988 when ``None``.
            The MOSS-TTSD call takes no seed argument.
        model_name: Key into ``client_map`` selecting the remote client.

    Returns:
        A ``(audio, message)`` tuple (non-tuple results are wrapped), or
        ``None`` when ``text`` is empty, the model is unsupported, or the
        call fails (error printed).
    """
    if not text:
        return None
    try:
        if model_name == soulx_name:
            result = client_map[model_name].predict(
                target_text=text,
                spk1_prompt_text=s1_txt,
                spk1_prompt_audio=handle_file(s1_wav),
                spk1_dialect_prompt_text="",
                spk2_prompt_text=s2_txt,
                spk2_prompt_audio=handle_file(s2_wav),
                spk2_dialect_prompt_text="",
                # Honor the caller's seed; 1988 was the previously hard-coded value.
                seed=1988 if seed is None else int(seed),
                api_name="/dialogue_synthesis_function",
            )
        elif model_name == mosstts_name2:
            result = client_map[model_name].predict(
                speaker_count=2,
                param_1=handle_file(s1_wav),
                param_2=handle_file(s2_wav),
                # Slots 3-5 receive speaker 1's audio again and slots 8-10
                # fixed greetings; presumably these fill the unused speaker
                # slots while speaker_count is 2 (confirm against the Space API).
                param_3=handle_file(s1_wav),
                param_4=handle_file(s1_wav),
                param_5=handle_file(s1_wav),
                param_6=s1_txt,
                param_7=s2_txt,
                param_8="Hello",
                param_9="Hello!!",
                param_10="Hello!!",
                param_11=text,
                param_12=True,
                param_13=False,
                param_14=1.1,
                param_15=0.9,
                param_16=50,
                param_17=1.1,
                param_18=2000,
                api_name="/run_inference",
            )
        else:
            # A bare `raise` here has no active exception and only produces
            # "No active exception to re-raise"; name the real problem instead.
            raise ValueError(
                f"podcast generation is not supported for model {model_name!r}"
            )
        print(result)
        # Callers index the result as (audio, message).
        if not isinstance(result, tuple):
            result = (result, "podcast generation completed successfully!")
        return result
    except Exception as e:
        # Boundary with a remote service: report the failure and return None.
        print(f"Error: {e}")
        return None

def build_ui():
    """Assemble the Gradio Blocks UI for the unified TTS panel.

    Layout, top to bottom:
      * two Textboxes holding the currently selected mode and model,
      * a row of mode tabs and a row of model tabs that write into them,
      * four input columns (style instruction, SPK1, SPK2, synthesis options),
      * one generate button, disabled when the model does not support the mode,
      * a shared audio output and status box.

    Returns:
        The constructed ``gr.Blocks`` instance (not launched).
    """
    # Custom CSS: bolder, larger tab labels. Comments use CSS `/* */` syntax
    # (`#` is not a CSS comment marker). The panel rules at the bottom were
    # never active and are kept inside a comment for reference.
    custom_css = """
    /* 1. 基础 Tab 样式：加粗、增大字号 */
    .mode-tabs button, .model-tabs button {
        font-weight: 700 !important; /* 强制加粗 */
        font-size: 16px !important;  /* 稍微放大字号 */
        color: #4b5563 !important;   /* 默认灰色 */
    }
    /* 3. 背景与面板美化
    .mode-tabs { background: #f0f2f5; border-radius: 10px; padding: 10px; margin-bottom: 10px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); }
    .model-tabs { background: #e6f7ff; border-radius: 10px; padding: 10px; margin-bottom: 20px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); }
    .column-panel { background: #ffffff; border: 1px solid #e5e7eb; border-radius: 8px; padding: 15px !important; }
    */
    """

    # (tab label, mode id). The id is the exact string written to the mode
    # Textbox and compared by the validation and dispatch code below, so it
    # must stay lowercase and consistent everywhere.
    mode_tab_specs = [
        ("Voice Clone", "voice clone"),
        ("CustomVoice", "customvoice"),
        ("Podcast", "podcast"),
        ("Style Instruction", "style instruction"),
        ("TTA", "tta"),
        ("Speech with BGM", "speech with bgm"),
    ]

    # (tab label, model id). The id is a key of ``client_map``.
    model_tab_specs = [
        (qwen3tts_name, qwen3tts_name),
        (cosy_name, cosy_name),
        (soulx_name, soulx_name),
        (mosstts_name1, mosstts_name1),
        (mosstts_name2, mosstts_name2),
        (mosstts_name3, mosstts_name3),
        ("Ming-Omni-TTS", ming_name),
    ]

    # Modes each model does NOT support; drives the generate button state.
    unsupported_modes = {
        soulx_name: {"customvoice", "voice clone", "style instruction", "tta", "speech with bgm"},
        qwen3tts_name: {"podcast", "tta", "speech with bgm"},
        cosy_name: {"customvoice", "style instruction", "podcast", "tta", "speech with bgm"},
        mosstts_name1: {"customvoice", "style instruction", "podcast", "tta", "speech with bgm"},
        mosstts_name2: {"customvoice", "voice clone", "style instruction", "tta", "speech with bgm"},
        mosstts_name3: {"customvoice", "voice clone", "podcast", "tta", "speech with bgm"},
        ming_name: {"customvoice", "podcast"},
    }

    with gr.Blocks(title="All-In-One-TTS", css=custom_css) as demo:
        gr.Markdown("# 🎛️ TTS All-In-One Unified Panel")

        # --- Selection state, relayed through Textboxes ---
        # Visible while debugging; can be switched to visible=False once stable.
        mode_selection = gr.Textbox(value="voice clone", visible=True, label="Current Mode")
        model_selection = gr.Textbox(value=qwen3tts_name, visible=True, label="Current Model")

        # --- First tab row: feature mode ---
        mode_tabs_by_id = {}
        with gr.Tabs(elem_classes="mode-tabs"):
            for label, mode_id in mode_tab_specs:
                with gr.Tab(label, id=mode_id) as tab:
                    pass
                mode_tabs_by_id[mode_id] = tab

        # --- Second tab row: model backend ---
        model_tabs_by_id = {}
        with gr.Tabs(elem_classes="model-tabs"):
            for label, model_id in model_tab_specs:
                with gr.Tab(label, id=model_id) as tab:
                    pass
                model_tabs_by_id[model_id] = tab

        gr.Markdown("---")

        with gr.Row():
            # --- Column 1: style instruction ---
            with gr.Column(scale=1, variant="panel", elem_classes="column-panel"):
                gr.Markdown("### 🎨 Style & Design")
                design_instruct = gr.Textbox(
                    label="Style Instruction / Voice Description",
                    lines=18,
                    placeholder="Describe voice style or custom voice...",
                    value="展现出悲苦沙哑的声音质感,语速偏慢,情绪浓烈且带有哭腔,以标准普通话缓慢诉说,情感强烈,语调哀怨高亢,音高起伏大。"
                )

            # --- Column 2: speaker 1 reference material ---
            with gr.Column(scale=1, variant="panel", elem_classes="column-panel"):
                gr.Markdown("### 👤 SPK1 Clone")
                spk1_ref_audio = gr.Audio(label="SPK1 Audio", type="filepath")
                spk1_ref_text = gr.Textbox(label="SPK1 Text", lines=3)

            # --- Column 3: speaker 2 reference material ---
            with gr.Column(scale=1, variant="panel", elem_classes="column-panel"):
                gr.Markdown("### 👥 SPK2 Clone")
                spk2_ref_audio = gr.Audio(label="SPK2 Audio", type="filepath")
                spk2_ref_text = gr.Textbox(label="SPK2 Text", lines=3)

            # --- Column 4: target text and options ---
            with gr.Column(scale=1, variant="panel", elem_classes="column-panel"):
                gr.Markdown("### 📝 Synthesis")
                shared_text = gr.Textbox(
                    label="Target Text / Script",
                    lines=5,
                    value="[S1] 哈喽，AI时代的冲浪先锋们！欢迎！ [S2] 欢迎啊！"
                )
                with gr.Row():
                    shared_language = gr.Dropdown(label="Language", choices=LANGUAGES, value="Auto")
                    tts_speaker = gr.Dropdown(label="Speaker", choices=SPEAKERS, value="Ryan")
                with gr.Row():
                    shared_model_size = gr.Dropdown(label="Model Size", choices=MODEL_SIZES, value="1.7B")
                    seed_input = gr.Number(label="Seed", value=1988, precision=0)

        # --- Single full-width generate button ---
        gr.Markdown("---")
        with gr.Row():
            generate_btn = gr.Button("🚀 Start Generating Audio", variant="primary", size="lg")

        # --- 1. Validation of the (mode, model) combination ---
        def validate_combination(mode, model):
            """Return a button update that reflects whether ``model`` supports ``mode``."""
            blocked = unsupported_modes.get(model)
            if blocked is None:
                # The selection Textbox can hold a name with no known backend.
                return gr.update(interactive=False, value=f"🚫 Unknown model: {model}")
            if mode in blocked:
                # Disable the button and show the warning text.
                return gr.update(interactive=False, value=f"🚫 {model} 不支持 {mode} 模式")
            # Restore the normal state and caption.
            return gr.update(interactive=True, value="🚀 Start Generating Audio")

        # --- 2. Re-validate whenever either selection Textbox changes ---
        mode_selection.change(
            fn=validate_combination,
            inputs=[mode_selection, model_selection],
            outputs=generate_btn
        )
        model_selection.change(
            fn=validate_combination,
            inputs=[mode_selection, model_selection],
            outputs=generate_btn
        )

        # --- Selecting a tab writes its id into the matching Textbox ---
        def emit_constant(value):
            """Return a zero-argument callback that always yields ``value``."""
            return lambda: value

        for mode_id, tab in mode_tabs_by_id.items():
            tab.select(fn=emit_constant(mode_id), outputs=mode_selection)
        for model_id, tab in model_tabs_by_id.items():
            tab.select(fn=emit_constant(model_id), outputs=model_selection)

        # --- Shared outputs ---
        with gr.Row():
            with gr.Column():
                shared_audio_out = gr.Audio(label="Final Generated Audio", type="numpy")
                shared_status = gr.Textbox(label="System Status", lines=2, interactive=False)

        def split_result(result):
            """Normalize a backend result into an ``(audio, message)`` pair."""
            if isinstance(result, (tuple, list)):
                audio = result[0] if result else None
                message = result[1] if len(result) > 1 else ""
                return audio, message
            return result, ""

        # --- Unified dispatch ---
        def unified_generation(
            mode, model_name,
            text, lang, model_size, speaker, instruct,
            s1_wav, s1_txt, s2_wav, s2_txt, seed
        ):
            """Route one generate request to the backend helper for ``mode``.

            Returns:
                ``(audio, status_text)`` for the audio output and status box.
                Missing inputs, unsupported modes and backend failures yield
                ``(None, <explanation>)`` instead of raising.
            """
            # --- Debug output ---
            print("="*30)
            print(f"【Mode Selected】: {mode}")
            print(f"【Model Selected】: {model_name}")
            print(f"【Target Text】: {text}")
            print(f"【Language】: {lang}")
            print(f"【Speaker】: {speaker}")
            print(f"【Style Instruct】: {instruct}")
            print(f"【SPK1 Audio Path】: {s1_wav}")
            print(f"【SPK1 Text】: {s1_txt}")
            print(f"【SPK2 Audio Path】: {s2_wav}")
            print(f"【SPK2 Text】: {s2_txt}")
            print(f"【Seed】: {seed}")
            print("="*30)

            # The backend helpers return None for empty text; say so explicitly.
            if not text:
                return None, "❌ Target text is empty."

            # Inputs are checked with explicit messages rather than `assert`,
            # which is stripped under `python -O`.
            if mode == "podcast":
                # Podcast needs reference audio for both speakers.
                if not (s1_wav and s2_wav):
                    return None, "❌ Podcast mode needs both SPK1 and SPK2 reference audio."
                status = "Detected: Podcast Mode (Multi-Speaker)"
                result = generate_podcast(text, s1_wav, s1_txt, s2_wav, s2_txt, lang, seed, model_name)
            elif mode == "voice clone":
                # Voice clone needs speaker 1's reference audio.
                if not s1_wav:
                    return None, "❌ Voice Clone mode needs SPK1 reference audio."
                status = "Detected: Voice Clone Mode (Single Speaker)"
                result = generate_voice_clone(s1_wav, s1_txt, text, lang, model_name)
            elif mode == "style instruction":
                if not instruct:
                    return None, "❌ Style Instruction mode needs a style / voice description."
                status = "Detected: Voice Design Mode"
                result = generate_voice_design(text, lang, instruct, model_name)
            elif mode == 'customvoice':
                status = f"Detected: Standard TTS Mode (Speaker: {speaker})"
                result = generate_tts_custom(text, lang, speaker, model_size, model_name)
            else:
                print(f"{mode} not supported yet!")
                return None, f"❌ Mode '{mode}' is not supported yet."

            # The helpers print the error and return None on failure.
            if result is None:
                return None, f"{status}\n❌ Generation failed; see the server log for details."
            audio, message = split_result(result)
            return audio, f"{status}\n{message}"

        # --- Button wiring: clear the outputs first, then generate ---
        def clear_output():
            """Reset the audio output and show a progress message."""
            return None, "⏳ Analysis in progress... deciding mode..."

        generate_btn.click(
            fn=clear_output,
            outputs=[shared_audio_out, shared_status]
        ).then(
            fn=unified_generation,
            inputs=[
                mode_selection,
                model_selection,
                shared_text, shared_language, shared_model_size, tts_speaker, design_instruct,
                spk1_ref_audio, spk1_ref_text, spk2_ref_audio, spk2_ref_text, seed_input
            ],
            outputs=[shared_audio_out, shared_status]
        )
    return demo

# Start the service: build the UI and launch it when run as a script.
if __name__ == "__main__":
    demo = build_ui()
    demo.launch()