File size: 3,993 Bytes
eef4d32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b6daf2c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import gradio as gr
import torch
import tempfile
import torchaudio
import os
import sys
from pathlib import Path

# ============================================================
# CosyVoice3 – Text-to-Speech with Voice Cloning
# ============================================================
# All paths are resolved relative to the directory the process was started in.
WORK_DIR = Path.cwd()
# Checkout location of the CosyVoice source tree.
COSYVOICE_DIR = WORK_DIR.joinpath("CosyVoice")
# Model weights live inside the checkout, under pretrained_models/.
MODEL_DIR = COSYVOICE_DIR.joinpath("pretrained_models", "Fun-CosyVoice3-0.5B")

# Shared model instance; stays None until load_cosyvoice() assigns it.
cosyvoice = None

def setup_cosyvoice():
    """Make the CosyVoice sources and model weights available locally.

    Steps, each skipped when its result is already present:

    1. Clone the CosyVoice repository (with submodules) into
       ``COSYVOICE_DIR`` if that directory does not exist.
    2. Download the model snapshot into ``MODEL_DIR`` if that directory does
       not exist.
    3. Put the repository root and its ``third_party/Matcha-TTS`` directory
       at the front of ``sys.path``.

    The function is safe to call repeatedly: path entries that are already
    on ``sys.path`` are not inserted a second time.

    Raises:
        subprocess.CalledProcessError: If ``git clone`` exits with a non-zero
            status.
    """
    import subprocess

    if not COSYVOICE_DIR.exists():
        print("Cloning CosyVoice repository ...")
        subprocess.run(
            ["git", "clone", "--recursive",
             "https://github.com/FunAudioLLM/CosyVoice.git", str(COSYVOICE_DIR)],
            check=True
        )
    if not MODEL_DIR.exists():
        # Imported only when a download is actually needed, so an
        # already-provisioned installation does not require huggingface_hub.
        from huggingface_hub import snapshot_download

        print("Downloading CosyVoice3 model weights ...")
        snapshot_download(
            "FunAudioLLM/Fun-CosyVoice3-0.5B-2512",
            local_dir=str(MODEL_DIR),
        )

    # Inserted in this order so Matcha-TTS ends up ahead of the repo root,
    # matching the original lookup order. The membership check keeps
    # repeated calls (e.g. a retry after a failed model load) from growing
    # sys.path with duplicates.
    for import_root in (COSYVOICE_DIR, COSYVOICE_DIR / "third_party" / "Matcha-TTS"):
        import_root_str = str(import_root)
        if import_root_str not in sys.path:
            sys.path.insert(0, import_root_str)

def load_cosyvoice():
    """Populate the module-level ``cosyvoice`` model on first use.

    Does nothing when ``cosyvoice`` is already set. Otherwise runs
    ``setup_cosyvoice()``, imports ``AutoModel`` from the CosyVoice package
    and stores a model built from ``MODEL_DIR`` (TensorRT and fp16 disabled)
    in the global.
    """
    global cosyvoice
    if cosyvoice is None:
        setup_cosyvoice()
        # Import placed after setup_cosyvoice(), which adds the repository
        # directories to sys.path.
        from cosyvoice.cli.cosyvoice import AutoModel

        print("Loading CosyVoice3 model ...")
        model_options = {
            "model_dir": str(MODEL_DIR),
            "load_trt": False,
            "fp16": False,
        }
        cosyvoice = AutoModel(**model_options)
        print("CosyVoice3 loaded.")

def _prompt_to_mono_16k(sr, audio_data):
    """Turn an uploaded audio array into a ``(1, samples)`` float tensor at 16 kHz."""
    waveform = torch.from_numpy(audio_data)
    if waveform.is_floating_point():
        waveform = waveform.float()
    else:
        # Integer PCM (Gradio's type="numpy" normally delivers int16) must be
        # scaled to [-1.0, 1.0); a bare .float() cast would write a wav that
        # is tens of thousands of times over full scale.
        full_scale = float(torch.iinfo(waveform.dtype).max) + 1.0
        waveform = waveform.float() / full_scale

    # 2-D input is treated as (samples, channels) and down-mixed to mono.
    if waveform.dim() == 2:
        waveform = waveform.mean(dim=1)
    # Add the leading channel axis that torchaudio.save expects.
    if waveform.dim() == 1:
        waveform = waveform.unsqueeze(0)

    if sr != 16000:
        waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)
    return waveform


def tts_speak(text, prompt_audio=None):
    """Synthesize ``text`` in the voice of the uploaded ``prompt_audio``.

    Args:
        text: Text to speak. ``None``, empty or whitespace-only text is
            rejected with a status message.
        prompt_audio: ``(sample_rate, numpy_array)`` tuple as produced by the
            ``gr.Audio(type="numpy")`` input, or ``None`` when nothing was
            uploaded.

    Returns:
        A ``(audio, status)`` pair. On success ``audio`` is
        ``(sample_rate, 1-D numpy array)`` and ``status`` a success message;
        on invalid input or a synthesis failure ``audio`` is ``None`` and
        ``status`` explains why.

    Inputs are validated before the model is loaded, so an invalid request
    never triggers the (expensive) clone/download/initialisation path.
    Exceptions raised while loading the model or converting the prompt array
    propagate to the caller; failures while writing the prompt file or
    running inference are reported through ``status``.
    """
    if text is None or not text.strip():
        return None, "Please enter text."

    missing_sample = (
        None,
        "Please upload a short voice sample (3-10 seconds) for voice cloning.",
    )
    if prompt_audio is None:
        return missing_sample

    sr, audio_data = prompt_audio
    if audio_data is None or audio_data.size == 0:
        return missing_sample

    load_cosyvoice()
    prompt_wave = _prompt_to_mono_16k(sr, audio_data)

    # Reserve a unique path, then close the handle immediately: the file is
    # written by torchaudio via its name, and keeping our own handle open
    # would leak a descriptor (and blocks reopening on Windows).
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
        prompt_path = handle.name

    try:
        torchaudio.save(prompt_path, prompt_wave, 16000)

        # NOTE(review): this is a fixed instruction prefix, not a transcript
        # of the uploaded sample — confirm this is what the model expects.
        prompt_text = "You are a helpful assistant.<|endofprompt|>"
        speech_list = [
            result["tts_speech"]
            for result in cosyvoice.inference_zero_shot(
                text, prompt_text, prompt_path, stream=False, speed=1.0
            )
        ]
        output = torch.concat(speech_list, dim=1)
        # .cpu() is a no-op for CPU tensors and avoids a failure if the
        # model ever returns its audio on an accelerator.
        output_np = output.detach().cpu().numpy().flatten()
        # Prefer the rate the loaded model reports; 24000 is the previously
        # hard-coded value and remains the fallback.
        sample_rate = getattr(cosyvoice, "sample_rate", 24000)
        return (sample_rate, output_np), "Speech generated successfully!"
    except Exception as e:
        # UI boundary: surface the failure in the status box instead of
        # crashing the request.
        return None, f"TTS Error: {e}"
    finally:
        if os.path.exists(prompt_path):
            os.remove(prompt_path)

# ============================================================
# Gradio Interface
# ============================================================
with gr.Blocks(title="CosyVoice3 TTS") as demo:
    # Page heading and a one-line usage hint.
    gr.Markdown("""
    # 🔊 CosyVoice3 – Text-to-Speech
    Upload a short voice sample (3-10 seconds), enter text, and generate speech in that voice.
    """)

    with gr.Row():
        # Left column: everything the user provides.
        with gr.Column():
            tts_text = gr.Textbox(
                lines=3,
                label="Text to Speak",
                value="Hello, welcome to the text to speech demo.",
            )
            prompt_audio = gr.Audio(
                label="Voice Sample (3-10 sec)",
                type="numpy",
                sources=["upload"],
            )
            generate_btn = gr.Button("Generate Speech", variant="primary")
        # Right column: synthesized audio plus a status line.
        with gr.Column():
            tts_audio = gr.Audio(label="Generated Speech")
            tts_status = gr.Textbox(label="Status")

    # The button feeds both inputs to tts_speak and shows its two results.
    generate_btn.click(
        fn=tts_speak,
        inputs=[tts_text, prompt_audio],
        outputs=[tts_audio, tts_status],
    )

if __name__ == "__main__":
    # Listen on every network interface rather than only on localhost.
    demo.launch(server_name="0.0.0.0")