Spaces:

smartwang
/

magicvoice

Running on Zero

15ec850 19 days ago

15.8 kB

	# coding=utf-8
	import os
	import sys
	import logging
	import spaces
	import gradio as gr
	import numpy as np
	import torch
	from huggingface_hub import snapshot_download, login
	from qwen_tts import Qwen3TTSModel
	from qwen_tts.inference.qwen3_tts_model import VoiceClonePromptItem
	import functools
	import uuid
	import random
	import whisper
	import librosa
	from opencc import OpenCC

	# Configure logging: INFO level, timestamped records, emitted on stdout.
	logging.basicConfig(
	    handlers=[logging.StreamHandler(sys.stdout)],
	    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
	    level=logging.INFO,
	)
	logger = logging.getLogger("Qwen3-TTS-Demo")

	# Initialise the Chinese script converter from OpenCC's 't2s' configuration
	# (used below to normalise Whisper transcripts).
	cc = OpenCC('t2s')

	# Optional Hugging Face token; log in only when one is present in the
	# environment.
	HF_TOKEN = os.getenv('HF_TOKEN')
	if HF_TOKEN:
	    login(token=HF_TOKEN)

	# Model sizes and the language choices offered by the UI dropdowns.
	MODEL_SIZES = ["0.6B", "1.7B"]
	LANGUAGES = [
	    "Auto",
	    "Chinese",
	    "English",
	    "Japanese",
	    "Korean",
	    "French",
	    "German",
	    "Spanish",
	    "Portuguese",
	    "Russian",
	]

	def seed_everything(seed=42):
	    """Seed every random number generator the demo relies on.

	    The same ``seed`` is applied to Python's ``random``, NumPy, and PyTorch
	    (CPU, current CUDA device, and all CUDA devices). cuDNN is then switched
	    to deterministic mode with autotuning (``benchmark``) disabled.

	    Args:
	        seed: Integer seed shared by all generators. Defaults to 42.
	    """
	    seeders = (
	        random.seed,
	        np.random.seed,
	        torch.manual_seed,
	        torch.cuda.manual_seed,
	        torch.cuda.manual_seed_all,
	    )
	    for apply_seed in seeders:
	        apply_seed(seed)

	    cudnn = torch.backends.cudnn
	    cudnn.deterministic = True
	    cudnn.benchmark = False

	def get_model_path(model_type: str, model_size: str) -> str:
	    """Resolve a Qwen3-TTS checkpoint via ``snapshot_download``.

	    Args:
	        model_type: Variant suffix of the repository name (the callers in
	            this file pass ``"Base"`` or ``"VoiceDesign"``).
	        model_size: Size tag of the repository name (e.g. ``"0.6B"``).

	    Returns:
	        Whatever ``snapshot_download`` returns for the repository
	        ``Qwen/Qwen3-TTS-12Hz-{model_size}-{model_type}``.
	    """
	    repo_id = f"Qwen/Qwen3-TTS-12Hz-{model_size}-{model_type}"
	    return snapshot_download(repo_id)

	@functools.lru_cache(maxsize=1)
	def load_model(model_type, model_size):
	    """Load a Qwen3-TTS model onto CUDA in bfloat16.

	    The result is memoised with a single-entry LRU cache, so only the most
	    recently requested ``(model_type, model_size)`` pair stays cached; asking
	    for a different pair loads that model again.

	    Args:
	        model_type: Variant name forwarded to ``get_model_path``.
	        model_size: Size tag forwarded to ``get_model_path``.

	    Returns:
	        The object returned by ``Qwen3TTSModel.from_pretrained``.
	    """
	    checkpoint_dir = get_model_path(model_type, model_size)
	    load_options = {
	        "device_map": "cuda",
	        "dtype": torch.bfloat16,
	        "token": HF_TOKEN,
	        "attn_implementation": "kernels-community/flash-attn3",
	    }
	    return Qwen3TTSModel.from_pretrained(checkpoint_dir, **load_options)

	@functools.lru_cache(maxsize=1)
	def load_whisper_model(model_name="large-v3"):
	    """Load a Whisper model, on CUDA when available and on CPU otherwise.

	    Memoised with a single-entry LRU cache: requesting a different
	    ``model_name`` replaces the cached model.

	    Args:
	        model_name: Whisper checkpoint name passed to ``whisper.load_model``.

	    Returns:
	        The loaded Whisper model.
	    """
	    target_device = "cuda" if torch.cuda.is_available() else "cpu"
	    return whisper.load_model(model_name, device=target_device)

	def _normalize_audio(wav, eps=1e-12, clip=True):
	x = np.asarray(wav)
	if np.issubdtype(x.dtype, np.integer):
	info = np.iinfo(x.dtype)
	y = x.astype(np.float32) / max(abs(info.min), info.max)
	elif np.issubdtype(x.dtype, np.floating):
	y = x.astype(np.float32)
	m = np.max(np.abs(y)) if y.size else 0.0
	if m > 1.0 + 1e-6:
	y = y / (m + eps)
	else:
	raise TypeError(f"Unsupported dtype: {x.dtype}")
	if clip:
	y = np.clip(y, -1.0, 1.0)
	if y.ndim > 1:
	y = np.mean(y, axis=-1).astype(np.float32)
	return y

	def _audio_to_tuple(audio):
	if audio is None:
	return None
	if isinstance(audio, tuple) and len(audio) == 2 and isinstance(audio[0], int):
	sr, wav = audio
	wav = _normalize_audio(wav)
	return wav, int(sr)
	if isinstance(audio, dict) and "sampling_rate" in audio and "data" in audio:
	sr = int(audio["sampling_rate"])
	wav = _normalize_audio(audio["data"])
	return wav, sr
	return None

	@spaces.GPU
	def infer_voice_design(part, language, voice_description):
	    """Synthesise ``part`` with a voice described in natural language.

	    Uses the "VoiceDesign" 1.7B model and reseeds all RNGs with 42 before
	    generating.

	    Args:
	        part: Text to speak.
	        language: Language value forwarded to the model.
	        voice_description: Voice instruction; surrounding whitespace is
	            stripped before it is passed as ``instruct``.

	    Returns:
	        ``(waveform, sample_rate)`` for the first generated waveform.
	    """
	    designer = load_model("VoiceDesign", "1.7B")
	    seed_everything(42)
	    generated, sample_rate = designer.generate_voice_design(
	        text=part,
	        language=language,
	        instruct=voice_description.strip(),
	        non_streaming_mode=True,
	        max_new_tokens=2048,
	    )
	    first_waveform = generated[0]
	    return first_waveform, sample_rate

	@spaces.GPU
	def infer_voice_clone(part, language, audio_tuple, ref_text, use_xvector_only):
	    """Clone the voice in a reference clip and speak ``part`` with it.

	    A voice-clone prompt is built from the reference audio on every call,
	    then used for generation with the "Base" 0.6B model.

	    Args:
	        part: Text to speak.
	        language: Language value forwarded to the model.
	        audio_tuple: Reference audio as produced by ``_audio_to_tuple``.
	        ref_text: Transcript of the reference audio; a falsy value is
	            passed on as ``None``, otherwise it is stripped.
	        use_xvector_only: Forwarded as ``x_vector_only_mode``.

	    Returns:
	        ``(waveform, sample_rate)`` for the first generated waveform.
	    """
	    cloner = load_model("Base", "0.6B")
	    reference_text = ref_text.strip() if ref_text else None
	    clone_prompt = cloner.create_voice_clone_prompt(
	        ref_audio=audio_tuple,
	        ref_text=reference_text,
	        x_vector_only_mode=use_xvector_only,
	    )
	    generated, sample_rate = cloner.generate_voice_clone(
	        text=part,
	        language=language,
	        voice_clone_prompt=clone_prompt,
	        max_new_tokens=2048,
	        seed=42,
	        temperature=0.3,
	        top_p=0.85,
	    )
	    return generated[0], sample_rate

	@spaces.GPU
	def infer_voice_clone_from_prompt(part, language, prompt_file_path):
	    """Generate speech from a previously saved voice-clone prompt file.

	    The file may hold a list of ``VoiceClonePromptItem`` objects or a list
	    of dicts with the item fields; dicts are turned back into items. Any
	    other payload is passed to the model unchanged.

	    Args:
	        part: Text to speak.
	        language: Language value forwarded to the model.
	        prompt_file_path: Path of the ``.pt`` prompt file.

	    Returns:
	        ``(waveform, sample_rate)`` for the first generated waveform.
	    """
	    # SECURITY(review): ``weights_only=False`` unpickles arbitrary objects,
	    # and in this app the path comes from a user-uploaded file. Loading an
	    # untrusted file this way can execute arbitrary code; consider a
	    # restricted load (``weights_only=True`` with an allow-list) once the
	    # supported file formats are confirmed.
	    payload = torch.load(prompt_file_path, map_location='cuda', weights_only=False)

	    clone_prompt = payload
	    if isinstance(payload, list) and len(payload) > 0:
	        head = payload[0]
	        # Rebuild items only for the dict form; ready-made items pass through.
	        if not isinstance(head, VoiceClonePromptItem) and isinstance(head, dict):
	            clone_prompt = [VoiceClonePromptItem(**fields) for fields in payload]

	    if isinstance(clone_prompt, list):
	        for entry in clone_prompt:
	            code = entry.ref_code
	            # Drop the leading axis of 3-D reference codes.
	            if code is not None and code.ndim == 3:
	                entry.ref_code = code.squeeze(0)

	    cloner = load_model("Base", "0.6B")
	    generated, sample_rate = cloner.generate_voice_clone(
	        text=part,
	        language=language,
	        voice_clone_prompt=clone_prompt,
	        max_new_tokens=2048,
	        seed=42,
	        temperature=0.3,
	        top_p=0.85,
	    )
	    return generated[0], sample_rate

	def _transcribe_reference_audio(audio_tuple, whisper_size="base"):
	    """Transcribe a ``(waveform, sample_rate)`` clip with Whisper.

	    The waveform is resampled to 16 kHz when needed before being handed to
	    the Whisper model, and the transcript is passed through the module's
	    OpenCC converter.

	    Returns:
	        The stripped, converted transcript (possibly an empty string).
	    """
	    audio_data, sr = audio_tuple
	    whisper_model = load_whisper_model(whisper_size)
	    if sr != 16000:
	        audio_data = librosa.resample(audio_data, orig_sr=sr, target_sr=16000)
	    result = whisper_model.transcribe(audio_data)

	    text = result.get("text", "")
	    if isinstance(text, list) and len(text) > 0:
	        text = text[0]
	    if not isinstance(text, str):
	        text = str(text)
	    return cc.convert(text.strip())


	@spaces.GPU
	def extract_voice_clone_prompt(ref_audio, ref_text, use_xvector_only):
	    """Extract a reusable voice-clone prompt and save it to a ``.pt`` file.

	    When no reference text is supplied, the reference audio is transcribed
	    with Whisper. A successful transcription switches x-vector-only mode
	    off; a failed one switches it on. If no usable reference text is
	    available at the end, x-vector-only mode is forced on as well, because
	    the direct clone path in this file rejects that combination.

	    Args:
	        ref_audio: Reference audio in a form accepted by ``_audio_to_tuple``.
	        ref_text: Transcript of the reference audio; may be empty or None.
	        use_xvector_only: Requested x-vector-only mode.

	    Returns:
	        Path of the saved prompt file (a list of dicts, one per prompt item).

	    Raises:
	        gr.Error: If the reference audio is missing or unsupported. The UI
	            binds a single file output, so the previous ``(None, message)``
	            tuple could not be rendered; raising shows the message instead.
	    """
	    # Validate the input before paying for a model load.
	    audio_tuple = _audio_to_tuple(ref_audio)
	    if audio_tuple is None:
	        raise gr.Error("错误：需要参考音频。")

	    tts = load_model("Base", "0.6B")
	    seed_everything(42)

	    r_text = ref_text
	    uxo = use_xvector_only
	    transcribed = False
	    if not r_text or (isinstance(r_text, str) and not r_text.strip()):
	        try:
	            r_text = _transcribe_reference_audio(audio_tuple, "base")
	        except Exception as e:
	            # Best effort: without a transcript fall back to x-vector only.
	            logger.error(f"Whisper 识别失败: {str(e)}", exc_info=True)
	            uxo = True
	        else:
	            transcribed = True
	            uxo = False

	    r_text_str = ""
	    if isinstance(r_text, str):
	        r_text_str = r_text.strip()
	    elif isinstance(r_text, list) and len(r_text) > 0 and isinstance(r_text[0], str):
	        r_text_str = r_text[0].strip()

	    # Report success only when Whisper actually ran and succeeded.
	    if transcribed:
	        logger.info(f"语音识别成功：{r_text_str}")

	    # An empty transcript leaves nothing to condition on, so use the same
	    # fallback as a failed transcription.
	    if not r_text_str and not uxo:
	        logger.warning("未获得参考文本，回退到仅使用 x-vector 模式。")
	        uxo = True

	    voice_clone_prompt_items = tts.create_voice_clone_prompt(
	        ref_audio=audio_tuple,
	        ref_text=r_text_str if r_text_str else None,
	        x_vector_only_mode=uxo,
	    )

	    # Store plain dicts rather than the item objects themselves.
	    prompt_data = [
	        {
	            "ref_code": item.ref_code,
	            "ref_spk_embedding": item.ref_spk_embedding,
	            "x_vector_only_mode": item.x_vector_only_mode,
	            "icl_mode": item.icl_mode,
	            "ref_text": item.ref_text,
	        }
	        for item in voice_clone_prompt_items
	    ]

	    file_id = str(uuid.uuid4())[:8]
	    file_path = f"voice_clone_prompt_{file_id}.pt"
	    torch.save(prompt_data, file_path)
	    return file_path

	def generate_voice_design(text, language, voice_description):
	    """UI handler for the Voice Design tab.

	    Validates both text inputs, then delegates to ``infer_voice_design``.

	    Args:
	        text: Target text to speak.
	        language: Language selected in the UI.
	        voice_description: Natural-language description of the voice.

	    Returns:
	        ``((sample_rate, waveform), status_message)`` on success, or
	        ``(None, error_message)`` on invalid input or a generation failure.
	    """
	    text_missing = not text or not text.strip()
	    if text_missing:
	        return None, "错误：文本不能为空。"

	    description_missing = not voice_description or not voice_description.strip()
	    if description_missing:
	        return None, "错误：语音描述不能为空。"

	    try:
	        waveform, sample_rate = infer_voice_design(text.strip(), language, voice_description)
	    except Exception as e:
	        logger.error(f"Voice Design 生成失败: {str(e)}", exc_info=True)
	        return None, f"错误: {e}"
	    else:
	        return (sample_rate, waveform), "语音设计生成成功！"

	def generate_voice_clone(ref_audio, ref_text, target_text, language, use_xvector_only):
	    """UI handler for direct voice cloning from a reference clip.

	    Checks, in order, that the target text is non-empty, that the reference
	    audio is usable, and that a reference text exists unless x-vector-only
	    mode is enabled. It then delegates to ``infer_voice_clone``.

	    Args:
	        ref_audio: Reference audio value from the UI.
	        ref_text: Transcript of the reference audio.
	        target_text: Text to speak with the cloned voice.
	        language: Language selected in the UI.
	        use_xvector_only: Whether only the x-vector should be used.

	    Returns:
	        ``((sample_rate, waveform), status_message)`` on success, or
	        ``(None, error_message)`` otherwise.
	    """
	    cleaned_target = ""
	    if isinstance(target_text, str):
	        cleaned_target = target_text.strip()
	    if not cleaned_target:
	        return None, "错误：目标文本不能为空。"

	    reference = _audio_to_tuple(ref_audio)
	    if reference is None:
	        return None, "错误：需要参考音频。"

	    cleaned_ref = ""
	    if isinstance(ref_text, str):
	        cleaned_ref = ref_text.strip()
	    if not (use_xvector_only or cleaned_ref):
	        return None, "错误：未启用 '仅使用 x-vector' 时需要参考文本。"

	    try:
	        waveform, sample_rate = infer_voice_clone(
	            cleaned_target, language, reference, cleaned_ref, use_xvector_only
	        )
	    except Exception as e:
	        logger.error(f"Voice Clone 生成失败: {str(e)}", exc_info=True)
	        return None, f"错误: {e}"
	    else:
	        return (sample_rate, waveform), "语音克隆生成成功！"

	def generate_voice_clone_from_prompt_file(prompt_file_path, target_text, language):
	    """UI handler for cloning with a saved prompt (feature) file.

	    Args:
	        prompt_file_path: Path of the prompt file chosen in the UI.
	        target_text: Text to speak with the cloned voice.
	        language: Language selected in the UI.

	    Returns:
	        ``((sample_rate, waveform), status_message)`` on success, or
	        ``(None, error_message)`` when the text or file is missing or
	        generation fails.
	    """
	    cleaned_target = ""
	    if isinstance(target_text, str):
	        cleaned_target = target_text.strip()
	    if not cleaned_target:
	        return None, "错误：目标文本不能为空。"

	    if not prompt_file_path:
	        return None, "错误：需要提供音频特征文件。"

	    try:
	        waveform, sample_rate = infer_voice_clone_from_prompt(
	            cleaned_target, language, prompt_file_path
	        )
	    except Exception as e:
	        logger.error(f"Voice Clone 生成失败: {str(e)}", exc_info=True)
	        return None, f"错误: {e}"
	    else:
	        return (sample_rate, waveform), "语音克隆生成成功（使用特征文件）！"

	@spaces.GPU
	def infer_whisper_audio(audio_path, model_size="base"):
	    """Transcribe an audio file with Whisper for the ASR tab.

	    Args:
	        audio_path: Path of the uploaded or recorded audio file.
	        model_size: Whisper checkpoint name to load.

	    Returns:
	        The transcript after stripping and OpenCC conversion, or an error
	        message string when no audio was given or transcription failed.
	    """
	    if not audio_path:
	        return "错误：请上传音频文件或进行录音。"

	    try:
	        recognizer = load_whisper_model(model_size)
	        outcome = recognizer.transcribe(audio_path)

	        transcript = outcome.get("text", "")
	        # Tolerate a list-valued or non-string "text" field.
	        if isinstance(transcript, list) and len(transcript) > 0:
	            transcript = transcript[0]
	        if not isinstance(transcript, str):
	            transcript = str(transcript)

	        return cc.convert(transcript.strip())
	    except Exception as e:
	        logger.error(f"Whisper 识别失败: {str(e)}", exc_info=True)
	        return f"识别出错: {e}"

	def build_ui():
	"""Assemble the Gradio Blocks interface for the demo and return it.

	Three tabs are declared: "ASR (Whisper)", "Voice Design" and
	"Voice Clone (Base)". Each button is wired through ``.click`` to one of
	the module-level handler functions.

	NOTE(review): this listing has lost its original indentation, so the
	exact nesting of the ``with`` layout contexts below cannot be read off
	the code; confirm against the deployed app before restructuring.

	Returns:
	The ``gr.Blocks`` instance bound to ``demo``; the caller launches it.
	"""
	theme = gr.themes.Soft(font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"])
	with gr.Blocks(theme=theme, title="Qwen3-TTS Demo") as demo:
	gr.Markdown("# Qwen3-TTS Demo")
	with gr.Tabs():
	# Tab 1: speech recognition. Audio file path and Whisper size in,
	# transcript text out.
	with gr.Tab("ASR (Whisper)"):
	with gr.Row():
	with gr.Column():
	asr_audio_input = gr.Audio(label="输入音频", type="filepath", sources=["microphone", "upload"])
	asr_model_size = gr.Dropdown(label="Whisper 模型大小", choices=["base", "small", "medium", "large-v3"], value="base")
	asr_btn = gr.Button("开始识别", variant="primary")
	with gr.Column():
	asr_text_output = gr.Textbox(label="识别结果", lines=10, show_copy_button=True)
	asr_btn.click(infer_whisper_audio, inputs=[asr_audio_input, asr_model_size], outputs=[asr_text_output])
	# Tab 2: voice design. Text, language and voice description in,
	# generated audio and a status message out.
	with gr.Tab("Voice Design"):
	with gr.Row():
	with gr.Column():
	design_text = gr.Textbox(label="目标文本", lines=4, value="It's in the top drawer... wait, it's empty?")
	design_language = gr.Dropdown(label="语言", choices=LANGUAGES, value="Auto")
	design_instruct = gr.Textbox(label="语音描述", lines=3, value="Speak in an incredulous tone.")
	design_btn = gr.Button("开始生成", variant="primary")
	with gr.Column():
	design_audio_out = gr.Audio(label="生成音频", type="numpy")
	design_status = gr.Textbox(label="状态", interactive=False)
	design_btn.click(generate_voice_design, inputs=[design_text, design_language, design_instruct], outputs=[design_audio_out, design_status],api_name="generate_voice_design")
	# Tab 3: voice cloning, split into three sections.
	with gr.Tab("Voice Clone (Base)"):
	# Section 1: extract a prompt (feature) file from reference audio.
	# The handler is bound to a single file output.
	gr.Markdown("### 1. 提取音频特征")
	with gr.Row():
	with gr.Column():
	extract_ref_audio = gr.Audio(label="参考音频", type="numpy")
	extract_ref_text = gr.Textbox(label="参考文本", lines=2)
	extract_xvector = gr.Checkbox(label="仅使用 x-vector", value=False)
	extract_btn = gr.Button("提取音频特征", variant="primary")
	with gr.Column():
	extract_file_out = gr.File(label="特征文件 (.pt)")
	extract_btn.click(extract_voice_clone_prompt, inputs=[extract_ref_audio, extract_ref_text, extract_xvector], outputs=[extract_file_out],api_name="extract_voice_clone_prompt")
	# Section 2: generate speech from a previously extracted prompt file.
	gr.Markdown("### 2. 使用特征文件生成")
	with gr.Row():
	with gr.Column():
	prompt_file = gr.File(label="特征文件 (.pt)")
	prompt_target_text = gr.Textbox(label="目标文本", lines=4)
	prompt_language = gr.Dropdown(label="语言", choices=LANGUAGES, value="Auto")
	prompt_btn = gr.Button("使用特征文件生成", variant="primary")
	with gr.Column():
	prompt_audio_out = gr.Audio(label="生成音频", type="numpy")
	prompt_status = gr.Textbox(label="状态", interactive=False)
	prompt_btn.click(generate_voice_clone_from_prompt_file, inputs=[prompt_file, prompt_target_text, prompt_language], outputs=[prompt_audio_out, prompt_status],api_name="generate_voice_clone_from_prompt")
	gr.Markdown("---")

	# Section 3: Traditional Voice Clone (Original)
	gr.Markdown("### 3. 传统音色克隆（直接使用参考音频）")
	gr.Markdown("直接上传参考音频生成语音（每次都需要提取特征）。")
	with gr.Row():
	with gr.Column(scale=2):
	clone_ref_audio = gr.Audio(
	label="参考音频",
	type="numpy",
	)
	clone_ref_text = gr.Textbox(
	label="参考文本",
	lines=2,
	placeholder="输入参考音频中的确切文字...",
	)
	clone_xvector = gr.Checkbox(
	label="仅使用 x-vector",
	value=False,
	)

	with gr.Column(scale=2):
	clone_target_text = gr.Textbox(
	label="目标文本",
	lines=4,
	placeholder="输入要让克隆音色说话的文字...",
	)
	with gr.Row():
	clone_language = gr.Dropdown(
	label="语言",
	choices=LANGUAGES,
	value="Auto",
	interactive=True,
	)
	clone_btn = gr.Button("克隆并生成", variant="primary")

	with gr.Row():
	clone_audio_out = gr.Audio(label="生成的音频", type="numpy")
	clone_status = gr.Textbox(label="状态", lines=2, interactive=False)

	# Note the input order: reference audio, reference text, target text,
	# language, then the x-vector flag, matching generate_voice_clone's
	# parameter order.
	clone_btn.click(
	generate_voice_clone,
	inputs=[clone_ref_audio, clone_ref_text, clone_target_text, clone_language, clone_xvector],
	outputs=[clone_audio_out, clone_status],
	api_name="generate_voice_clone"
	)

	return demo

	if __name__ == "__main__":
	    # Build the Blocks app and start the Gradio server when run as a script.
	    app = build_ui()
	    app.launch()