# CosyVoice3 Hugging Face Space — app.py
# (page-scrape metadata: author "aal-hawa", commit b6daf2c "add")
import gradio as gr
import torch
import tempfile
import torchaudio
import os
import sys
from pathlib import Path
# ============================================================
# CosyVoice3 – Text-to-Speech with Voice Cloning
# ============================================================
# All paths are resolved relative to the process working directory.
WORK_DIR = Path.cwd()
COSYVOICE_DIR = WORK_DIR / "CosyVoice"  # target of the git clone in setup_cosyvoice()
MODEL_DIR = COSYVOICE_DIR / "pretrained_models" / "Fun-CosyVoice3-0.5B"
# Lazily-initialized global model handle; populated by load_cosyvoice().
cosyvoice = None
def setup_cosyvoice():
    """Fetch the CosyVoice repo and model weights, and make them importable.

    Idempotent: the clone and the weight download are skipped when their
    target directories already exist, and sys.path entries are only added
    once even if this function is called repeatedly.

    Raises:
        subprocess.CalledProcessError: if the git clone fails (check=True).
    """
    import subprocess
    from huggingface_hub import snapshot_download

    if not COSYVOICE_DIR.exists():
        print("Cloning CosyVoice repository ...")
        # --recursive pulls the Matcha-TTS submodule added to sys.path below.
        subprocess.run(
            ["git", "clone", "--recursive",
             "https://github.com/FunAudioLLM/CosyVoice.git", str(COSYVOICE_DIR)],
            check=True,
        )
    if not MODEL_DIR.exists():
        print("Downloading CosyVoice3 model weights ...")
        snapshot_download(
            "FunAudioLLM/Fun-CosyVoice3-0.5B-2512",
            local_dir=str(MODEL_DIR),
        )
    # Expose the cloned packages to `import cosyvoice` / Matcha-TTS imports,
    # without duplicating entries on repeated calls.
    for repo_dir in (COSYVOICE_DIR, COSYVOICE_DIR / "third_party" / "Matcha-TTS"):
        entry = str(repo_dir)
        if entry not in sys.path:
            sys.path.insert(0, entry)
def load_cosyvoice():
    """Lazily instantiate the global CosyVoice3 model; no-op if already loaded.

    Ensures the repository and weights are present (setup_cosyvoice) before
    importing the model class, since the package only becomes importable
    after setup places the clone on sys.path.
    """
    global cosyvoice
    if cosyvoice is not None:
        return  # already loaded
    setup_cosyvoice()
    # Import must happen after setup_cosyvoice() has extended sys.path.
    from cosyvoice.cli.cosyvoice import AutoModel
    print("Loading CosyVoice3 model ...")
    cosyvoice = AutoModel(
        model_dir=str(MODEL_DIR),
        load_trt=False,  # NOTE(review): presumably disables TensorRT — confirm against AutoModel API
        fp16=False,      # full-precision inference
    )
    print("CosyVoice3 loaded.")
def tts_speak(text, prompt_audio=None):
    """Synthesize *text* in the voice of *prompt_audio* via zero-shot cloning.

    Args:
        text: Text to synthesize.
        prompt_audio: Gradio numpy audio tuple ``(sample_rate, samples)``,
            where ``samples`` is 1-D mono or 2-D ``(samples, channels)``.

    Returns:
        ``((24000, waveform), status)`` on success, where ``waveform`` is a
        1-D float numpy array, or ``(None, message)`` on invalid input or
        synthesis failure.
    """
    # Validate inputs BEFORE triggering the expensive lazy model load.
    if not text or not text.strip():
        return None, "Please enter text."
    if prompt_audio is None:
        return None, "Please upload a short voice sample (3-10 seconds) for voice cloning."
    load_cosyvoice()

    sr, audio_data = prompt_audio
    raw = torch.from_numpy(audio_data)
    audio_tensor = raw.float()
    if not raw.dtype.is_floating_point:
        # Gradio delivers integer PCM (typically int16); rescale to [-1, 1]
        # so torchaudio.save does not clip.
        audio_tensor = audio_tensor / float(torch.iinfo(raw.dtype).max)
    if audio_tensor.dim() == 2:
        # Downmix (samples, channels) stereo to mono.
        audio_tensor = audio_tensor.mean(dim=1)
    audio_tensor = audio_tensor.unsqueeze(0)  # (1, samples) for torchaudio.save
    if sr != 16000:
        # CosyVoice expects 16 kHz prompt audio.
        audio_tensor = torchaudio.transforms.Resample(sr, 16000)(audio_tensor)

    # Write the prompt to a temp wav. Close the handle first so torchaudio
    # can reopen the path (required on Windows).
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp.close()
    torchaudio.save(tmp.name, audio_tensor, 16000)
    try:
        prompt_text = "You are a helpful assistant.<|endofprompt|>"
        segments = [
            chunk["tts_speech"]
            for chunk in cosyvoice.inference_zero_shot(
                text, prompt_text, tmp.name, stream=False, speed=1.0
            )
        ]
        if not segments:
            return None, "TTS Error: model produced no audio."
        waveform = torch.cat(segments, dim=1)
        # Model output sample rate is 24 kHz.
        return (24000, waveform.numpy().flatten()), "Speech generated successfully!"
    except Exception as e:
        return None, f"TTS Error: {str(e)}"
    finally:
        if os.path.exists(tmp.name):
            os.remove(tmp.name)
# ============================================================
# Gradio Interface
# ============================================================
with gr.Blocks(title="CosyVoice3 TTS") as demo:
    # Heading text is left-flush inside the triple-quoted string so the
    # Markdown renderer does not treat leading spaces as a code block.
    gr.Markdown("""
# 🔊 CosyVoice3 – Text-to-Speech
Upload a short voice sample (3-10 seconds), enter text, and generate speech in that voice.
""")
    with gr.Row():
        with gr.Column():
            tts_text = gr.Textbox(
                label="Text to Speak",
                value="Hello, welcome to the text to speech demo.",
                lines=3,
            )
            prompt_audio = gr.Audio(
                sources=["upload"],
                type="numpy",  # tts_speak expects a (sample_rate, ndarray) tuple
                label="Voice Sample (3-10 sec)",
            )
            generate_btn = gr.Button("Generate Speech", variant="primary")
        with gr.Column():
            tts_audio = gr.Audio(label="Generated Speech")
            tts_status = gr.Textbox(label="Status")

    generate_btn.click(tts_speak, [tts_text, prompt_audio], [tts_audio, tts_status])

if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable inside a container/Space.
    demo.launch(server_name="0.0.0.0")