# CosyVoice3 text-to-speech demo (Hugging Face Space)
import gradio as gr
import torch
import tempfile
import torchaudio
import os
import sys
from pathlib import Path

# ============================================================
# CosyVoice3 — Text-to-Speech with Voice Cloning
# ============================================================

# All paths are resolved relative to the process working directory.
WORK_DIR = Path.cwd()
COSYVOICE_DIR = WORK_DIR / "CosyVoice"
MODEL_DIR = COSYVOICE_DIR / "pretrained_models" / "Fun-CosyVoice3-0.5B"

# Lazily-initialized model handle; populated by load_cosyvoice().
cosyvoice = None
def setup_cosyvoice():
    """Fetch the CosyVoice repo and model weights, and make them importable.

    Side effects:
        - Clones the CosyVoice git repository into COSYVOICE_DIR if absent.
        - Downloads the Fun-CosyVoice3-0.5B weights into MODEL_DIR if absent.
        - Prepends the repo (and its bundled Matcha-TTS) to sys.path so that
          ``cosyvoice.cli`` can be imported afterwards.
    """
    import subprocess
    from huggingface_hub import snapshot_download

    if not COSYVOICE_DIR.exists():
        print("Cloning CosyVoice repository ...")
        # List-form argv (shell=False) — no shell-injection surface.
        subprocess.run(
            ["git", "clone", "--recursive",
             "https://github.com/FunAudioLLM/CosyVoice.git", str(COSYVOICE_DIR)],
            check=True,
        )

    if not MODEL_DIR.exists():
        print("Downloading CosyVoice3 model weights ...")
        snapshot_download(
            "FunAudioLLM/Fun-CosyVoice3-0.5B-2512",
            local_dir=str(MODEL_DIR),
        )

    # Must come before `from cosyvoice.cli...` imports elsewhere.
    sys.path.insert(0, str(COSYVOICE_DIR))
    sys.path.insert(0, str(COSYVOICE_DIR / "third_party" / "Matcha-TTS"))
def load_cosyvoice():
    """Load the CosyVoice3 model once, caching it in the module-global ``cosyvoice``.

    Safe to call repeatedly: returns immediately after the first successful load.
    """
    global cosyvoice
    if cosyvoice is not None:
        return  # already loaded

    setup_cosyvoice()
    # Import only after setup_cosyvoice() has put the repo on sys.path.
    from cosyvoice.cli.cosyvoice import AutoModel

    print("Loading CosyVoice3 model ...")
    cosyvoice = AutoModel(
        model_dir=str(MODEL_DIR),
        load_trt=False,
        fp16=False,
    )
    print("CosyVoice3 loaded.")
def tts_speak(text, prompt_audio=None):
    """Synthesize *text* in the voice of *prompt_audio* using zero-shot cloning.

    Args:
        text: Text to synthesize.
        prompt_audio: Gradio numpy-audio tuple ``(sample_rate, np.ndarray)``
            or None when no sample was uploaded.

    Returns:
        ``((24000, np.ndarray), status)`` on success, or ``(None, status)``
        on validation failure / synthesis error.
    """
    load_cosyvoice()
    if not text.strip():
        return None, "Please enter text."
    if prompt_audio is None:
        return None, "Please upload a short voice sample (3-10 seconds) for voice cloning."

    sr, audio_data = prompt_audio
    raw = torch.from_numpy(audio_data)
    if raw.is_floating_point():
        audio_tensor = raw.float()
    else:
        # Gradio delivers integer PCM (typically int16); normalize to [-1, 1]
        # so resampling and the model see in-range samples.
        audio_tensor = raw.float() / torch.iinfo(raw.dtype).max
    if audio_tensor.dim() == 2:
        audio_tensor = audio_tensor.mean(dim=1)  # stereo (samples, channels) -> mono
    if audio_tensor.dim() == 1:
        audio_tensor = audio_tensor.unsqueeze(0)  # (1, samples) for torchaudio.save
    if sr != 16000:
        audio_tensor = torchaudio.transforms.Resample(sr, 16000)(audio_tensor)

    # mkstemp + close avoids leaking the open handle that
    # NamedTemporaryFile(delete=False) would keep around.
    fd, prompt_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        torchaudio.save(prompt_path, audio_tensor, 16000)
        prompt_text = "You are a helpful assistant.<|endofprompt|>"
        speech_list = []
        for result in cosyvoice.inference_zero_shot(
            text, prompt_text, prompt_path, stream=False, speed=1.0
        ):
            speech_list.append(result["tts_speech"])
        if not speech_list:
            return None, "TTS Error: model produced no audio."
        output = torch.concat(speech_list, dim=1)
        output_np = output.numpy().flatten()
        # CosyVoice3 emits 24 kHz audio.
        return (24000, output_np), "Speech generated successfully!"
    except Exception as e:
        # Surface the failure in the UI status box rather than crashing the app.
        return None, f"TTS Error: {str(e)}"
    finally:
        if os.path.exists(prompt_path):
            os.remove(prompt_path)
| # ============================================================ | |
| # Gradio Interface | |
| # ============================================================ | |
# ============================================================
# Gradio Interface
# ============================================================
with gr.Blocks(title="CosyVoice3 TTS") as demo:
    gr.Markdown("""
    # π CosyVoice3 β Text-to-Speech
    Upload a short voice sample (3-10 seconds), enter text, and generate speech in that voice.
    """)
    with gr.Row():
        with gr.Column():
            tts_text = gr.Textbox(
                label="Text to Speak",
                value="Hello, welcome to the text to speech demo.",
                lines=3,
            )
            prompt_audio = gr.Audio(
                sources=["upload"],
                type="numpy",
                label="Voice Sample (3-10 sec)",
            )
            generate_btn = gr.Button("Generate Speech", variant="primary")
        with gr.Column():
            tts_audio = gr.Audio(label="Generated Speech")
            tts_status = gr.Textbox(label="Status")
    # Wire the button: (text, sample) -> (audio, status).
    generate_btn.click(tts_speak, [tts_text, prompt_audio], [tts_audio, tts_status])

if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable inside a container/Space.
    demo.launch(server_name="0.0.0.0")