Spaces:

KaixuanGuo
/

All-In-One-TTS

Sleeping

App Files Files Community

All-In-One-TTS / app.py

KaixuanGuo

Update app.py

16c3779 verified 20 days ago

raw

history blame contribute delete

21.8 kB

	import gradio as gr
	from gradio_client import Client, handle_file
	import warnings
	import asyncio

	# Ignore the "Event loop is closed" RuntimeWarning emitted when the async loop shuts down.
	# `message` is a regular expression matched against the START of the warning text, so the
	# leading ".*" is required for the phrase to match wherever it appears in the message.
	warnings.filterwarnings("ignore", category=RuntimeWarning, message=".*Event loop is closed.*")

	# Model sizes offered to the user.
	MODEL_SIZES = [
	    "0.6B",
	    "1.7B",
	]

	# Speaker and language choices for the CustomVoice model.
	SPEAKERS = [
	    "Aiden",
	    "Dylan",
	    "Eric",
	    "Ono_anna",
	    "Ryan",
	    "Serena",
	    "Sohee",
	    "Uncle_fu",
	    "Vivian",
	]
	LANGUAGES = [
	    "Auto",
	    "Chinese",
	    "English",
	    "Japanese",
	    "Korean",
	    "French",
	    "German",
	    "Spanish",
	    "Portuguese",
	    "Russian",
	]

	# Paths of two prompt recordings (female / male Mandarin, judging by the file names).
	# NOTE(review): not referenced anywhere else in the visible code.
	S1_PROMPT_WAV = "assets/audios/female_mandarin.wav"
	S2_PROMPT_WAV = "assets/audios/male_mandarin.wav"

	# 1. Connect to the target TTS models (Hugging Face Spaces interconnect).
	# Note: this hands the compute load over to the servers hosting the original models.
	soulx_name = "SoulX-Podcast-1.7B"
	qwen3tts_name = "Qwen3-TTS"
	mosstts_name = "MOSS-TTS-Family"
	mosstts_name1 = "MOSS-TTS"
	mosstts_name2 = "MOSS-TTSD-v1.0"
	mosstts_name3 = "MOSS-VoiceGenerator"
	cosy_name = "Fun-CosyVoice3-0.5B"
	ming_name = "ming-uniaudio-demo"

	# One client per Space, keyed by the Space name; each entry is "<owner>/<space>".
	client_map = {
	    space: Client(f"{owner}/{space}")
	    for owner, space in (
	        ("Soul-AILab", soulx_name),
	        ("Qwen", qwen3tts_name),
	        ("OpenMOSS-Team", mosstts_name1),
	        ("OpenMOSS-Team", mosstts_name2),
	        ("OpenMOSS-Team", mosstts_name3),
	        ("FunAudioLLM", cosy_name),
	        ("cafe3310", ming_name),
	    )
	}

	def generate_tts_custom(text, lang, speaker, model_size, model_name):
	    """Synthesize `text` with a preset speaker through the remote custom-voice endpoint.

	    Args:
	        text: Text to synthesize. An empty value returns ``None`` immediately.
	        lang: Language forwarded to the endpoint; falls back to "Auto" when empty.
	        speaker: Speaker name forwarded to the endpoint; falls back to "Ryan" when empty.
	        model_size: Model size forwarded to the endpoint; falls back to "1.7B" when empty.
	        model_name: Key into ``client_map`` selecting which remote Space to call.

	    Returns:
	        Whatever the remote ``/generate_custom_voice`` call returns, or ``None`` when
	        `text` is empty or the remote call fails.
	    """
	    if not text:
	        return None
	    try:
	        # 2. Call the upstream model's API. The user's selections are forwarded;
	        # the fallbacks are the values that used to be hard-coded here.
	        result = client_map[model_name].predict(
	            text=text,
	            language=lang or "Auto",
	            speaker=speaker or "Ryan",
	            instruct="",
	            model_size=model_size or "1.7B",
	            api_name="/generate_custom_voice",
	        )
	        print(result)
	        return result
	    except Exception as e:
	        # Best-effort boundary: log the failure and report it to the caller as None.
	        print(f"Error: {e}")
	        return None

	def generate_voice_design(text, lang, instruct, model_name):
	    """Synthesize `text` in a voice described by a free-form instruction.

	    Args:
	        text: Text to synthesize. An empty value returns ``None`` immediately.
	        lang: Language forwarded to the Qwen3-TTS endpoint ("Auto" when empty);
	            the MOSS-VoiceGenerator call takes no language argument.
	        instruct: Voice / style description forwarded to the endpoint.
	        model_name: Key into ``client_map``; ``qwen3tts_name`` and ``mosstts_name3``
	            are the models implemented here.

	    Returns:
	        Whatever the remote call returns, or ``None`` when `text` is empty, the model
	        has no voice-design implementation, or the remote call fails.
	    """
	    if not text:
	        return None
	    try:
	        # 2. Call the upstream model's API.
	        if model_name == qwen3tts_name:
	            result = client_map[model_name].predict(
	                text=text,
	                language=lang or "Auto",
	                voice_description=instruct,
	                api_name="/generate_voice_design",
	            )
	        elif model_name == mosstts_name3:
	            result = client_map[model_name].predict(
	                text=text,
	                instruction=instruct,
	                temperature=1.5,
	                top_p=0.6,
	                top_k=50,
	                repetition_penalty=1.1,
	                max_new_tokens=4096,
	                api_name="/run_inference",
	            )
	        else:
	            # Covers the not-yet-implemented Ming model as well as unknown names; fail
	            # with a clear message instead of reading an unassigned `result`.
	            raise NotImplementedError(
	                f"voice design is not implemented for model: {model_name}"
	            )
	        print(result)
	        return result
	    except Exception as e:
	        # Best-effort boundary: log the failure and report it to the caller as None.
	        print(f"Error: {e}")
	        return None

	def generate_voice_clone(s1_wav, s1_txt, text, lang, model_name):
	    """Synthesize `text` in the voice of a reference recording.

	    Args:
	        s1_wav: Path of the reference audio file to clone.
	        s1_txt: Transcript of the reference audio (used by Qwen3-TTS and CosyVoice).
	        text: Text to synthesize. An empty value returns ``None`` immediately.
	        lang: Language forwarded to the Qwen3-TTS endpoint ("Auto" when empty);
	            the other endpoints take no language argument.
	        model_name: Key into ``client_map``; ``qwen3tts_name``, ``mosstts_name1`` and
	            ``cosy_name`` are the models implemented here.

	    Returns:
	        The remote call's result (a bare string result from CosyVoice is wrapped as
	        ``(path, status)``), or ``None`` when `text` is empty, the reference audio is
	        missing, the model is unsupported, or the remote call fails.
	    """
	    if not text:
	        return None
	    try:
	        if not s1_wav:
	            raise ValueError("voice clone requires a reference audio file")
	        # 2. Call the upstream model's API.
	        if model_name == qwen3tts_name:
	            result = client_map[model_name].predict(
	                ref_audio=handle_file(s1_wav),
	                ref_text=s1_txt,
	                target_text=text,
	                language=lang or "Auto",
	                use_xvector_only=False,
	                model_size="1.7B",
	                api_name="/generate_voice_clone",
	            )
	        elif model_name == mosstts_name1:
	            result = client_map[model_name].predict(
	                text=text,
	                reference_audio=handle_file(s1_wav),
	                mode_with_reference="Clone",
	                duration_control_enabled=False,
	                duration_tokens=1,
	                temperature=1.7,
	                top_p=0.8,
	                top_k=25,
	                repetition_penalty=1,
	                max_new_tokens=4096,
	                api_name="/run_inference",
	            )
	        elif model_name == cosy_name:
	            result = client_map[model_name].predict(
	                tts_text=text,
	                mode_value="zero_shot",
	                prompt_text=s1_txt,
	                # The same reference file is supplied for both the upload and record slots.
	                prompt_wav_upload=handle_file(s1_wav),
	                prompt_wav_record=handle_file(s1_wav),
	                # The end-of-prompt marker is sent without stray backslashes.
	                # NOTE(review): presumably the exact token the Space expects - confirm.
	                instruct_text="You are a helpful assistant. 请非常开心地说一句话。<|endofprompt|>",
	                seed=0,
	                stream=False,
	                ui_lang="Zh",
	                api_name="/generate_audio",
	            )
	            # Normalize a bare path into the (audio, status) pair the caller indexes.
	            if isinstance(result, str):
	                result = (result, "voice clone successfully!")
	        else:
	            raise ValueError(f"voice clone is not supported for model: {model_name}")
	        print(result)
	        return result
	    except Exception as e:
	        # Best-effort boundary: log the failure and report it to the caller as None.
	        print(f"Error: {e}")
	        return None

	def generate_podcast(text, s1_wav, s1_txt, s2_wav, s2_txt, lang, seed, model_name):
	    """Synthesize a two-speaker dialogue from a script and two reference voices.

	    Args:
	        text: Dialogue script to synthesize. An empty value returns ``None`` immediately.
	        s1_wav: Path of speaker 1's reference audio.
	        s1_txt: Transcript of speaker 1's reference audio.
	        s2_wav: Path of speaker 2's reference audio.
	        s2_txt: Transcript of speaker 2's reference audio.
	        lang: Accepted for signature parity; neither endpoint call uses it.
	        seed: Random seed forwarded to the SoulX endpoint (1988 when ``None``).
	        model_name: Key into ``client_map``; ``soulx_name`` and ``mosstts_name2`` are
	            the models implemented here.

	    Returns:
	        A tuple whose first item is the remote audio result; a non-tuple remote result
	        is wrapped as ``(result, status_message)``. ``None`` when `text` is empty, the
	        model is unsupported, or the remote call fails.
	    """
	    if not text:
	        return None
	    try:
	        # 2. Call the upstream model's API.
	        if model_name == soulx_name:
	            result = client_map[model_name].predict(
	                target_text=text,
	                spk1_prompt_text=s1_txt,
	                spk1_prompt_audio=handle_file(s1_wav),
	                spk1_dialect_prompt_text="",
	                spk2_prompt_text=s2_txt,
	                spk2_prompt_audio=handle_file(s2_wav),
	                spk2_dialect_prompt_text="",
	                # Forward the caller's seed; 1988 is the previously hard-coded value.
	                seed=1988 if seed is None else int(seed),
	                api_name="/dialogue_synthesis_function",
	            )
	        elif model_name == mosstts_name2:
	            # The endpoint exposes positional-style parameter names. With two speakers,
	            # the remaining audio/text slots are filled with placeholder values.
	            result = client_map[model_name].predict(
	                speaker_count=2,
	                param_1=handle_file(s1_wav),
	                param_2=handle_file(s2_wav),
	                param_3=handle_file(s1_wav),
	                param_4=handle_file(s1_wav),
	                param_5=handle_file(s1_wav),
	                param_6=s1_txt,
	                param_7=s2_txt,
	                param_8="Hello",
	                param_9="Hello!!",
	                param_10="Hello!!",
	                param_11=text,
	                param_12=True,
	                param_13=False,
	                param_14=1.1,
	                param_15=0.9,
	                param_16=50,
	                param_17=1.1,
	                param_18=2000,
	                api_name="/run_inference",
	            )
	        else:
	            raise ValueError(f"podcast is not supported for model: {model_name}")
	        print(result)
	        # Normalize to the (audio, status) pair the caller indexes.
	        if not isinstance(result, tuple):
	            result = (result, "podcast generation completed successfully!")
	        return result
	    except Exception as e:
	        # Best-effort boundary: log the failure and report it to the caller as None.
	        print(f"Error: {e}")
	        return None

	def build_ui():
	    """Build the unified Gradio panel that routes one "generate" button to every backend.

	    The layout has two rows of tabs (function mode and model backend) whose selection
	    is mirrored into two textboxes; those textboxes drive both the validity check on
	    the generate button and the dispatch performed when the button is clicked.

	    Returns:
	        The constructed ``gr.Blocks`` app (not launched).
	    """
	    # Canonical mode keys. Tab ids, tab-select callbacks, the validation table and the
	    # dispatcher all use these same strings so they cannot drift apart.
	    mode_clone = "voice clone"
	    mode_custom = "customvoice"
	    mode_podcast = "podcast"
	    mode_style = "style instruction"
	    mode_tta = "tta"
	    mode_bgm = "speech with bgm"

	    # (tab label, mode key), in display order. The first entry is the initially shown tab.
	    mode_tab_specs = [
	        ("Voice Clone", mode_clone),
	        ("CustomVoice", mode_custom),
	        ("Podcast", mode_podcast),
	        ("Style Instruction", mode_style),
	        ("TTA", mode_tta),
	        ("Speech with BGM", mode_bgm),
	    ]
	    # (tab label, model key), in display order.
	    model_tab_specs = [
	        (qwen3tts_name, qwen3tts_name),
	        (cosy_name, cosy_name),
	        (soulx_name, soulx_name),
	        (mosstts_name1, mosstts_name1),
	        (mosstts_name2, mosstts_name2),
	        (mosstts_name3, mosstts_name3),
	        ("Ming-Omni-TTS", ming_name),
	    ]

	    # Modes each model does NOT support; used to disable the generate button.
	    unsupported_modes = {
	        soulx_name: {mode_custom, mode_clone, mode_style, mode_tta, mode_bgm},
	        qwen3tts_name: {mode_podcast, mode_tta, mode_bgm},
	        cosy_name: {mode_custom, mode_style, mode_podcast, mode_tta, mode_bgm},
	        mosstts_name1: {mode_custom, mode_style, mode_podcast, mode_tta, mode_bgm},
	        mosstts_name2: {mode_custom, mode_clone, mode_style, mode_tta, mode_bgm},
	        mosstts_name3: {mode_custom, mode_clone, mode_podcast, mode_tta, mode_bgm},
	        ming_name: {mode_custom, mode_podcast},
	    }

	    def _constant(value):
	        """Return a zero-argument callback that always yields `value`."""
	        return lambda: value

	    # Custom CSS: bolder, larger tab labels. Disabled rules are kept as CSS comments
	    # (CSS has no "#" comment syntax).
	    custom_css = """
	    /* 1. 基础 Tab 样式：加粗、增大字号 */
	    .mode-tabs button, .model-tabs button {
	        font-weight: 700 !important; /* 强制加粗 */
	        font-size: 16px !important; /* 稍微放大字号 */
	        color: #4b5563 !important; /* 默认灰色 */
	    }
	    /* 3. 背景与面板美化 */
	    /* .mode-tabs { background: #f0f2f5; border-radius: 10px; padding: 10px; margin-bottom: 10px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); } */
	    /* .model-tabs { background: #e6f7ff; border-radius: 10px; padding: 10px; margin-bottom: 20px; box-shadow: 0 2px 4px rgba(0,0,0,0.05); } */
	    /* .column-panel { background: #ffffff; border: 1px solid #e5e7eb; border-radius: 8px; padding: 15px !important; } */
	    """

	    with gr.Blocks(title="All-In-One-TTS", css=custom_css) as demo:
	        gr.Markdown("# 🎛️ TTS All-In-One Unified Panel")

	        # --- State relay textboxes ---
	        # Visible while debugging; can be switched to visible=False once stable.
	        mode_selection = gr.Textbox(value=mode_clone, visible=True, label="Current Mode")
	        model_selection = gr.Textbox(value=qwen3tts_name, visible=True, label="Current Model")

	        # --- First tab row: function mode ---
	        mode_tabs = []
	        with gr.Tabs(elem_classes="mode-tabs"):
	            for tab_label, mode_key in mode_tab_specs:
	                # The tabs are intentionally empty; they act only as selectors.
	                with gr.Tab(tab_label, id=mode_key) as tab:
	                    pass
	                mode_tabs.append((tab, mode_key))

	        # --- Second tab row: model backend ---
	        model_tabs = []
	        with gr.Tabs(elem_classes="model-tabs"):
	            for tab_label, model_key in model_tab_specs:
	                with gr.Tab(tab_label, id=model_key) as tab:
	                    pass
	                model_tabs.append((tab, model_key))

	        gr.Markdown("---")

	        with gr.Row():
	            # --- Column 1: style instruction ---
	            with gr.Column(scale=1, variant="panel", elem_classes="column-panel"):
	                gr.Markdown("### 🎨 Style & Design")
	                design_instruct = gr.Textbox(
	                    label="Style Instruction / Voice Description",
	                    lines=18,
	                    placeholder="Describe voice style or custom voice...",
	                    value="展现出悲苦沙哑的声音质感,语速偏慢,情绪浓烈且带有哭腔,以标准普通话缓慢诉说,情感强烈,语调哀怨高亢,音高起伏大。",
	                )

	            # --- Column 2: speaker 1 reference material ---
	            with gr.Column(scale=1, variant="panel", elem_classes="column-panel"):
	                gr.Markdown("### 👤 SPK1 Clone")
	                spk1_ref_audio = gr.Audio(label="SPK1 Audio", type="filepath")
	                spk1_ref_text = gr.Textbox(label="SPK1 Text", lines=3)

	            # --- Column 3: speaker 2 reference material ---
	            with gr.Column(scale=1, variant="panel", elem_classes="column-panel"):
	                gr.Markdown("### 👥 SPK2 Clone")
	                spk2_ref_audio = gr.Audio(label="SPK2 Audio", type="filepath")
	                spk2_ref_text = gr.Textbox(label="SPK2 Text", lines=3)

	            # --- Column 4: target text and configuration ---
	            with gr.Column(scale=1, variant="panel", elem_classes="column-panel"):
	                gr.Markdown("### 📝 Synthesis")
	                shared_text = gr.Textbox(
	                    label="Target Text / Script",
	                    lines=5,
	                    value="[S1] 哈喽，AI时代的冲浪先锋们！欢迎！ [S2] 欢迎啊！",
	                )
	                with gr.Row():
	                    shared_language = gr.Dropdown(label="Language", choices=LANGUAGES, value="Auto")
	                    tts_speaker = gr.Dropdown(label="Speaker", choices=SPEAKERS, value="Ryan")
	                with gr.Row():
	                    shared_model_size = gr.Dropdown(label="Model Size", choices=MODEL_SIZES, value="1.7B")
	                    seed_input = gr.Number(label="Seed", value=1988, precision=0)

	        # --- A single generate button for every mode ---
	        gr.Markdown("---")
	        with gr.Row():
	            generate_btn = gr.Button("🚀 Start Generating Audio", variant="primary", size="lg")

	        # --- 1. Validation logic ---
	        def validate_combination(mode, model):
	            """Enable the generate button only for a supported mode/model pair.

	            An unknown model is treated as unsupported rather than raising.
	            """
	            blocked = unsupported_modes.get(model)
	            if blocked is None or mode in blocked:
	                # Disabled state plus a warning label.
	                return gr.update(interactive=False, value=f"🚫 {model} 不支持 {mode} 模式")
	            # Normal state with the original label.
	            return gr.update(interactive=True, value="🚀 Start Generating Audio")

	        # --- 2. Linked events: re-validate whenever either relay textbox changes ---
	        for relay in (mode_selection, model_selection):
	            relay.change(
	                fn=validate_combination,
	                inputs=[mode_selection, model_selection],
	                outputs=generate_btn,
	            )

	        # --- Selecting a tab writes its key into the matching relay textbox ---
	        for tab, mode_key in mode_tabs:
	            tab.select(fn=_constant(mode_key), outputs=mode_selection)
	        for tab, model_key in model_tabs:
	            tab.select(fn=_constant(model_key), outputs=model_selection)

	        # --- Global outputs ---
	        with gr.Row():
	            with gr.Column():
	                shared_audio_out = gr.Audio(label="Final Generated Audio", type="numpy")
	                shared_status = gr.Textbox(label="System Status", lines=2, interactive=False)

	        # --- Unified dispatch ---
	        def unified_generation(
	            mode, model_name,
	            text, lang, model_size, speaker, instruct,
	            s1_wav, s1_txt, s2_wav, s2_txt, seed
	        ):
	            """Route the request to the generator for `mode` and shape its result.

	            Returns:
	                ``(audio, status_text)`` for the audio and status output components.

	            Raises:
	                gr.Error: When a required input is missing, the mode has no generator,
	                    or the generator reports failure by returning nothing.
	            """
	            # --- Debug print section ---
	            print("="*30)
	            print(f"【Mode Selected】: {mode}")
	            print(f"【Model Selected】: {model_name}")
	            print(f"【Target Text】: {text}")
	            print(f"【Language】: {lang}")
	            print(f"【Speaker】: {speaker}")
	            print(f"【Style Instruct】: {instruct}")
	            print(f"【SPK1 Audio Path】: {s1_wav}")
	            print(f"【SPK1 Text】: {s1_txt}")
	            print(f"【SPK2 Audio Path】: {s2_wav}")
	            print(f"【SPK2 Text】: {s2_txt}")
	            print(f"【Seed】: {seed}")
	            print("="*30)

	            if mode == mode_podcast:
	                # Podcast needs a reference recording for both speakers.
	                if not (s1_wav and s2_wav):
	                    raise gr.Error("Podcast mode requires both SPK1 and SPK2 audio.")
	                status = "Detected: Podcast Mode (Multi-Speaker)"
	                result = generate_podcast(text, s1_wav, s1_txt, s2_wav, s2_txt, lang, seed, model_name)
	            elif mode == mode_clone:
	                # Voice clone needs speaker 1's reference recording.
	                if not s1_wav:
	                    raise gr.Error("Voice clone mode requires SPK1 audio.")
	                status = "Detected: Voice Clone Mode (Single Speaker)"
	                result = generate_voice_clone(s1_wav, s1_txt, text, lang, model_name)
	            elif mode == mode_style:
	                if not instruct:
	                    raise gr.Error("Style instruction mode requires a style instruction.")
	                status = "Detected: Voice Design Mode"
	                result = generate_voice_design(text, lang, instruct, model_name)
	            elif mode == mode_custom:
	                status = f"Detected: Standard TTS Mode (Speaker: {speaker})"
	                result = generate_tts_custom(text, lang, speaker, model_size, model_name)
	            else:
	                print(f"{mode} not supported yet!")
	                raise gr.Error(f"{mode} not supported yet!")

	            # The generators return None for empty text or a failed remote call.
	            if not result:
	                raise gr.Error(
	                    "Generation failed or produced no audio; check the target text and the server log."
	                )
	            if isinstance(result, (tuple, list)):
	                audio = result[0]
	                detail = result[1] if len(result) > 1 else ""
	            else:
	                audio, detail = result, ""
	            return audio, f"{status}\n{detail}"

	        # --- Button binding ---
	        def clear_output():
	            """Reset the outputs and show a progress message before generation starts."""
	            return None, "⏳ Analysis in progress... deciding mode..."

	        generate_btn.click(
	            fn=clear_output,
	            outputs=[shared_audio_out, shared_status],
	        ).then(
	            fn=unified_generation,
	            inputs=[
	                mode_selection,
	                model_selection,
	                shared_text, shared_language, shared_model_size, tts_speaker, design_instruct,
	                spk1_ref_audio, spk1_ref_text, spk2_ref_audio, spk2_ref_text, seed_input,
	            ],
	            outputs=[shared_audio_out, shared_status],
	        )
	    return demo

	# Start the service: build the UI and launch it when this file is run as a script.
	if __name__ == "__main__":
	demo = build_ui()
	demo.launch()