Spaces:

recentechstudio
/

CosyVoice3

Running

App Files Community

aal-hawa committed 15 days ago

Commit

eef4d32

1 Parent(s): a54038e

add

Browse files

Files changed (2) hide show

app.py +120 -0
requirements.txt +11 -0

app.py ADDED Viewed

	@@ -0,0 +1,120 @@

+import gradio as gr
+import torch
+import tempfile
+import torchaudio
+import os
+import sys
+from pathlib import Path
+# ============================================================
+# CosyVoice3 – Text-to-Speech with Voice Cloning
+# ============================================================
+# Base directory for everything fetched at runtime: the process's current
+# working directory at the moment this module is imported.
+WORK_DIR = Path.cwd()
+# Location of the CosyVoice git checkout (created by setup_cosyvoice()).
+COSYVOICE_DIR = WORK_DIR / "CosyVoice"
+# Location of the downloaded model weights inside the checkout.
+MODEL_DIR = COSYVOICE_DIR / "pretrained_models" / "Fun-CosyVoice3-0.5B"
+# Model instance shared by all requests; stays None until load_cosyvoice()
+# assigns it on first use.
+cosyvoice = None
+def setup_cosyvoice():
+    import subprocess
+    from huggingface_hub import snapshot_download
+    if not COSYVOICE_DIR.exists():
+        print("Cloning CosyVoice repository ...")
+        subprocess.run(
+            ["git", "clone", "--recursive",
+             "https://github.com/FunAudioLLM/CosyVoice.git", str(COSYVOICE_DIR)],
+            check=True
+        )
+    if not MODEL_DIR.exists():
+        print("Downloading CosyVoice3 model weights ...")
+        snapshot_download(
+            "FunAudioLLM/Fun-CosyVoice3-0.5B-2512",
+            local_dir=str(MODEL_DIR),
+        )
+    sys.path.insert(0, str(COSYVOICE_DIR))
+    sys.path.insert(0, str(COSYVOICE_DIR / "third_party" / "Matcha-TTS"))
+def load_cosyvoice():
+    global cosyvoice
+    if cosyvoice is not None:
+        return
+    setup_cosyvoice()
+    from cosyvoice.cli.cosyvoice import AutoModel
+    print("Loading CosyVoice3 model ...")
+    cosyvoice = AutoModel(
+        model_dir=str(MODEL_DIR),
+        load_trt=False,
+        fp16=False
+    )
+    print("CosyVoice3 loaded.")
+def tts_speak(text, prompt_audio=None):
+    load_cosyvoice()
+    if not text.strip():
+        return None, "Please enter text."
+    if prompt_audio is None:
+        return None, "Please upload a short voice sample (3-10 seconds) for voice cloning."
+    sr, audio_data = prompt_audio
+    audio_tensor = torch.from_numpy(audio_data).float()
+    if audio_tensor.dim() == 2:
+        audio_tensor = audio_tensor.mean(dim=1)
+    if audio_tensor.dim() == 1:
+        audio_tensor = audio_tensor.unsqueeze(0)
+    if sr != 16000:
+        resampler = torchaudio.transforms.Resample(sr, 16000)
+        audio_tensor = resampler(audio_tensor)
+    prompt_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+    torchaudio.save(prompt_path.name, audio_tensor, 16000)
+    try:
+        prompt_text = "You are a helpful assistant.<|endofprompt|>"
+        speech_list = []
+        for result in cosyvoice.inference_zero_shot(
+            text, prompt_text, prompt_path.name, stream=False, speed=1.0
+        ):
+            speech_list.append(result["tts_speech"])
+        output = torch.concat(speech_list, dim=1)
+        output_np = output.numpy().flatten()
+        return (24000, output_np), "Speech generated successfully!"
+    except Exception as e:
+        return None, f"TTS Error: {str(e)}"
+    finally:
+        if os.path.exists(prompt_path.name):
+            os.remove(prompt_path.name)
+# ============================================================
+# Gradio Interface
+# ============================================================
+with gr.Blocks(title="CosyVoice3 TTS") as demo:
+    gr.Markdown("""
+    # 🔊 CosyVoice3 – Text-to-Speech
+    Upload a short voice sample (3-10 seconds), enter text, and generate speech in that voice.
+    """)
+    with gr.Row():
+        with gr.Column():
+            tts_text = gr.Textbox(
+                label="Text to Speak",
+                value="Hello, welcome to the text to speech demo.",
+                lines=3
+            )
+            prompt_audio = gr.Audio(
+                sources=["upload"],
+                type="numpy",
+                label="Voice Sample (3-10 sec)"
+            )
+            generate_btn = gr.Button("Generate Speech", variant="primary")
+        with gr.Column():
+            tts_audio = gr.Audio(label="Generated Speech")
+            tts_status = gr.Textbox(label="Status")
+    generate_btn.click(tts_speak, [tts_text, prompt_audio], [tts_audio, tts_status])
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0")

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+git+https://github.com/huggingface/transformers.git@82a06db03535c49aa987719ed0746a76093b1ec4
+torch
+torchaudio
+librosa
+numpy
+gradio
+huggingface_hub
+hyperpyyaml
+modelscope
+onnxruntime
+soundfile