# Hugging Face Space: recentechstudio/CosyVoice3 (status: Running)
# File size: 3,993 Bytes

import gradio as gr
import torch
import tempfile
import torchaudio
import os
import sys
from pathlib import Path

# ============================================================
# CosyVoice3 – Text-to-Speech with Voice Cloning
# ============================================================
# Filesystem layout, all rooted at the directory the process was started from.
WORK_DIR = Path.cwd()
COSYVOICE_DIR = WORK_DIR.joinpath("CosyVoice")
MODEL_DIR = COSYVOICE_DIR.joinpath("pretrained_models", "Fun-CosyVoice3-0.5B")

# Model handle; remains None until load_cosyvoice() assigns it.
cosyvoice = None

def setup_cosyvoice():
    """Make the CosyVoice sources and model weights available locally.

    Clones the CosyVoice repository into ``COSYVOICE_DIR`` when that directory
    does not exist, downloads the model snapshot into ``MODEL_DIR`` when that
    directory does not exist, and then puts the repository root and its
    bundled ``third_party/Matcha-TTS`` directory on ``sys.path``.

    The function is safe to call repeatedly: directories that already exist
    are left untouched and ``sys.path`` entries are only added once.

    Raises:
        subprocess.CalledProcessError: If ``git clone`` exits with a non-zero
            status (``check=True``).
    """
    import subprocess
    from huggingface_hub import snapshot_download

    if not COSYVOICE_DIR.exists():
        print("Cloning CosyVoice repository ...")
        subprocess.run(
            ["git", "clone", "--recursive",
             "https://github.com/FunAudioLLM/CosyVoice.git", str(COSYVOICE_DIR)],
            check=True
        )
    if not MODEL_DIR.exists():
        print("Downloading CosyVoice3 model weights ...")
        snapshot_download(
            "FunAudioLLM/Fun-CosyVoice3-0.5B-2512",
            local_dir=str(MODEL_DIR),
        )

    # Each entry is pushed to the front in turn, so the Matcha-TTS directory
    # ends up ahead of the repository root. The membership check keeps retries
    # (e.g. after a failed model load) from stacking duplicate entries.
    search_paths = (
        COSYVOICE_DIR,
        COSYVOICE_DIR / "third_party" / "Matcha-TTS",
    )
    for search_path in search_paths:
        entry = str(search_path)
        if entry not in sys.path:
            sys.path.insert(0, entry)

def load_cosyvoice():
    """Create the module-level ``cosyvoice`` model on first use.

    When the global handle is already set this is a no-op. Otherwise the
    environment is prepared via ``setup_cosyvoice()`` and an ``AutoModel`` is
    built from ``MODEL_DIR`` with TensorRT and fp16 both switched off.
    """
    global cosyvoice

    if cosyvoice is None:
        # setup_cosyvoice() extends sys.path before the deferred import below.
        setup_cosyvoice()
        from cosyvoice.cli.cosyvoice import AutoModel

        print("Loading CosyVoice3 model ...")
        model_options = {
            "model_dir": str(MODEL_DIR),
            "load_trt": False,
            "fp16": False,
        }
        cosyvoice = AutoModel(**model_options)
        print("CosyVoice3 loaded.")

def _prompt_to_mono_16k(sr, audio_data):
    """Turn a Gradio numpy clip into a ``(1, num_samples)`` float waveform at 16 kHz."""
    waveform = torch.from_numpy(audio_data)
    if waveform.is_floating_point():
        waveform = waveform.float()
    else:
        # Integer PCM (Gradio's numpy audio is typically int16) has to be
        # scaled into [-1, 1]; a bare float cast would leave samples in the
        # tens of thousands and the saved prompt would be far out of range.
        limits = torch.iinfo(waveform.dtype)
        full_scale = float(max(abs(limits.min), limits.max))
        waveform = waveform.float() / full_scale

    if waveform.dim() == 2:
        # Two-dimensional input is averaged over its second axis
        # (assumed to be the channel axis) to obtain mono.
        waveform = waveform.mean(dim=1)
    if waveform.dim() == 1:
        waveform = waveform.unsqueeze(0)

    if sr != 16000:
        waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)
    return waveform


def tts_speak(text, prompt_audio=None):
    """Synthesize ``text`` in the voice of the uploaded prompt clip.

    Args:
        text: Text to speak. ``None``, empty, or whitespace-only text is
            rejected with a status message.
        prompt_audio: ``(sample_rate, samples)`` tuple as produced by a
            ``gr.Audio(type="numpy")`` component, or ``None`` when nothing
            was uploaded.

    Returns:
        A ``(audio, status)`` pair. ``audio`` is ``(sample_rate, samples)``
        on success and ``None`` otherwise; ``status`` is a human-readable
        message. Failures while preparing the prompt or running inference
        are reported through ``status`` as ``"TTS Error: ..."``.
    """
    # Validate the cheap inputs first so an invalid request never triggers
    # the repository clone / weight download / model load.
    if not text or not text.strip():
        return None, "Please enter text."

    if prompt_audio is None:
        return None, "Please upload a short voice sample (3-10 seconds) for voice cloning."

    load_cosyvoice()

    # Only the path is needed: close the handle right away so no descriptor
    # leaks and the file can be re-opened by name on every platform.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        prompt_path = handle.name

    try:
        sr, audio_data = prompt_audio
        torchaudio.save(prompt_path, _prompt_to_mono_16k(sr, audio_data), 16000)

        prompt_text = "You are a helpful assistant.<|endofprompt|>"
        speech_list = [
            result["tts_speech"]
            for result in cosyvoice.inference_zero_shot(
                text, prompt_text, prompt_path, stream=False, speed=1.0
            )
        ]
        output = torch.concat(speech_list, dim=1)
        # detach()/cpu() make the conversion work wherever the tensor lives.
        output_np = output.detach().cpu().numpy().flatten()
        # Use the model's own sample rate when it exposes one; 24000 is the
        # previously hard-coded value and stays as the fallback.
        output_sr = getattr(cosyvoice, "sample_rate", 24000)
        return (output_sr, output_np), "Speech generated successfully!"
    except Exception as e:
        return None, f"TTS Error: {str(e)}"
    finally:
        # The temp file was created with delete=False, so remove it by hand.
        if os.path.exists(prompt_path):
            os.remove(prompt_path)

# ============================================================
# Gradio Interface
# ============================================================
with gr.Blocks(title="CosyVoice3 TTS") as demo:
    # Page header rendered above the two-column layout.
    gr.Markdown("""
    # 🔊 CosyVoice3 – Text-to-Speech
    Upload a short voice sample (3-10 seconds), enter text, and generate speech in that voice.
    """)

    with gr.Row():
        # First column: user inputs and the trigger button.
        with gr.Column():
            tts_text = gr.Textbox(
                lines=3,
                value="Hello, welcome to the text to speech demo.",
                label="Text to Speak",
            )
            prompt_audio = gr.Audio(
                label="Voice Sample (3-10 sec)",
                type="numpy",
                sources=["upload"],
            )
            generate_btn = gr.Button("Generate Speech", variant="primary")

        # Second column: synthesized audio and the status message.
        with gr.Column():
            tts_audio = gr.Audio(label="Generated Speech")
            tts_status = gr.Textbox(label="Status")

    # Wire the button to the synthesis handler.
    generate_btn.click(
        fn=tts_speak,
        inputs=[tts_text, prompt_audio],
        outputs=[tts_audio, tts_status],
    )

if __name__ == "__main__":
    # Start the server only when this file is run as a script.
    demo.launch(server_name="0.0.0.0")