# CosyVoice3 Hugging Face Space — app.py
# (page-scrape metadata: author "aal-hawa", commit b6daf2c "add")
import gradio as gr
import torch
import tempfile
import torchaudio
import os
import sys
from pathlib import Path
# ============================================================
# CosyVoice3 – Text-to-Speech with Voice Cloning
# ============================================================
# All paths are resolved relative to the process working directory.
WORK_DIR = Path.cwd()
COSYVOICE_DIR = WORK_DIR / "CosyVoice"  # target of the git clone in setup_cosyvoice()
MODEL_DIR = COSYVOICE_DIR / "pretrained_models" / "Fun-CosyVoice3-0.5B"
# Lazily-initialized global model handle; populated by load_cosyvoice().
cosyvoice = None
def setup_cosyvoice():
    """Fetch the CosyVoice repo and model weights, and make them importable.

    Idempotent: the clone and the weight download are skipped when their
    target directories already exist, and sys.path entries are only added
    once even if this function is called repeatedly.

    Raises:
        subprocess.CalledProcessError: if the git clone fails (check=True).
    """
    import subprocess
    from huggingface_hub import snapshot_download

    if not COSYVOICE_DIR.exists():
        print("Cloning CosyVoice repository ...")
        # --recursive pulls the Matcha-TTS submodule added to sys.path below.
        subprocess.run(
            ["git", "clone", "--recursive",
             "https://github.com/FunAudioLLM/CosyVoice.git", str(COSYVOICE_DIR)],
            check=True,
        )
    if not MODEL_DIR.exists():
        print("Downloading CosyVoice3 model weights ...")
        snapshot_download(
            "FunAudioLLM/Fun-CosyVoice3-0.5B-2512",
            local_dir=str(MODEL_DIR),
        )
    # Expose the cloned packages to `import cosyvoice` / Matcha-TTS imports,
    # without duplicating entries on repeated calls.
    for repo_dir in (COSYVOICE_DIR, COSYVOICE_DIR / "third_party" / "Matcha-TTS"):
        entry = str(repo_dir)
        if entry not in sys.path:
            sys.path.insert(0, entry)
def load_cosyvoice():
    """Lazily instantiate the global CosyVoice3 model; no-op if already loaded.

    Ensures the repository and weights are present (setup_cosyvoice) before
    importing the model class, since the package only becomes importable
    after setup places the clone on sys.path.
    """
    global cosyvoice
    if cosyvoice is not None:
        return  # already loaded
    setup_cosyvoice()
    # Import must happen after setup_cosyvoice() has extended sys.path.
    from cosyvoice.cli.cosyvoice import AutoModel
    print("Loading CosyVoice3 model ...")
    cosyvoice = AutoModel(
        model_dir=str(MODEL_DIR),
        load_trt=False,  # NOTE(review): presumably disables TensorRT — confirm against AutoModel API
        fp16=False,      # full-precision inference
    )
    print("CosyVoice3 loaded.")
def tts_speak(text, prompt_audio=None):
    """Synthesize *text* in the voice of *prompt_audio* via zero-shot cloning.

    Args:
        text: Text to synthesize.
        prompt_audio: Gradio numpy audio tuple ``(sample_rate, samples)``,
            where ``samples`` is 1-D mono or 2-D ``(samples, channels)``.

    Returns:
        ``((24000, waveform), status)`` on success, where ``waveform`` is a
        1-D float numpy array, or ``(None, message)`` on invalid input or
        synthesis failure.
    """
    # Validate inputs BEFORE triggering the expensive lazy model load.
    if not text or not text.strip():
        return None, "Please enter text."
    if prompt_audio is None:
        return None, "Please upload a short voice sample (3-10 seconds) for voice cloning."
    load_cosyvoice()

    sr, audio_data = prompt_audio
    raw = torch.from_numpy(audio_data)
    audio_tensor = raw.float()
    if not raw.dtype.is_floating_point:
        # Gradio delivers integer PCM (typically int16); rescale to [-1, 1]
        # so torchaudio.save does not clip.
        audio_tensor = audio_tensor / float(torch.iinfo(raw.dtype).max)
    if audio_tensor.dim() == 2:
        # Downmix (samples, channels) stereo to mono.
        audio_tensor = audio_tensor.mean(dim=1)
    audio_tensor = audio_tensor.unsqueeze(0)  # (1, samples) for torchaudio.save
    if sr != 16000:
        # CosyVoice expects 16 kHz prompt audio.
        audio_tensor = torchaudio.transforms.Resample(sr, 16000)(audio_tensor)

    # Write the prompt to a temp wav. Close the handle first so torchaudio
    # can reopen the path (required on Windows).
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp.close()
    torchaudio.save(tmp.name, audio_tensor, 16000)
    try:
        prompt_text = "You are a helpful assistant.<|endofprompt|>"
        segments = [
            chunk["tts_speech"]
            for chunk in cosyvoice.inference_zero_shot(
                text, prompt_text, tmp.name, stream=False, speed=1.0
            )
        ]
        if not segments:
            return None, "TTS Error: model produced no audio."
        waveform = torch.cat(segments, dim=1)
        # Model output sample rate is 24 kHz.
        return (24000, waveform.numpy().flatten()), "Speech generated successfully!"
    except Exception as e:
        return None, f"TTS Error: {str(e)}"
    finally:
        if os.path.exists(tmp.name):
            os.remove(tmp.name)
# ============================================================
# Gradio Interface
# ============================================================
with gr.Blocks(title="CosyVoice3 TTS") as demo:
    # Heading text is left-flush inside the triple-quoted string so the
    # Markdown renderer does not treat leading spaces as a code block.
    gr.Markdown("""
# 🔊 CosyVoice3 – Text-to-Speech
Upload a short voice sample (3-10 seconds), enter text, and generate speech in that voice.
""")
    with gr.Row():
        with gr.Column():
            tts_text = gr.Textbox(
                label="Text to Speak",
                value="Hello, welcome to the text to speech demo.",
                lines=3,
            )
            prompt_audio = gr.Audio(
                sources=["upload"],
                type="numpy",  # tts_speak expects a (sample_rate, ndarray) tuple
                label="Voice Sample (3-10 sec)",
            )
            generate_btn = gr.Button("Generate Speech", variant="primary")
        with gr.Column():
            tts_audio = gr.Audio(label="Generated Speech")
            tts_status = gr.Textbox(label="Status")

    generate_btn.click(tts_speak, [tts_text, prompt_audio], [tts_audio, tts_status])

if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable inside a container/Space.
    demo.launch(server_name="0.0.0.0")