import os
import sys
import tempfile
from pathlib import Path

import gradio as gr
import torch
import torchaudio

# ============================================================
# CosyVoice3 – Text-to-Speech with Voice Cloning
# ============================================================

WORK_DIR = Path.cwd()
COSYVOICE_DIR = WORK_DIR / "CosyVoice"
MODEL_DIR = COSYVOICE_DIR / "pretrained_models" / "Fun-CosyVoice3-0.5B"

# Lazily-initialized model handle; populated by load_cosyvoice() on first use.
cosyvoice = None


def setup_cosyvoice():
    """Fetch the CosyVoice repo and model weights if missing, then make the
    repo (and its vendored Matcha-TTS) importable.

    Idempotent: clone/download steps are skipped when the target directories
    already exist. Raises CalledProcessError if the git clone fails.
    """
    import subprocess

    from huggingface_hub import snapshot_download

    if not COSYVOICE_DIR.exists():
        print("Cloning CosyVoice repository ...")
        # shell=False with an argument list — no shell-injection surface.
        subprocess.run(
            [
                "git", "clone", "--recursive",
                "https://github.com/FunAudioLLM/CosyVoice.git",
                str(COSYVOICE_DIR),
            ],
            check=True,
        )

    if not MODEL_DIR.exists():
        print("Downloading CosyVoice3 model weights ...")
        snapshot_download(
            "FunAudioLLM/Fun-CosyVoice3-0.5B-2512",
            local_dir=str(MODEL_DIR),
        )

    sys.path.insert(0, str(COSYVOICE_DIR))
    sys.path.insert(0, str(COSYVOICE_DIR / "third_party" / "Matcha-TTS"))


def load_cosyvoice():
    """Load the CosyVoice3 model exactly once; subsequent calls are no-ops."""
    global cosyvoice
    if cosyvoice is not None:
        return

    setup_cosyvoice()
    # Import deferred: cosyvoice only becomes importable after setup_cosyvoice()
    # has placed the cloned repo on sys.path.
    from cosyvoice.cli.cosyvoice import AutoModel

    print("Loading CosyVoice3 model ...")
    cosyvoice = AutoModel(
        model_dir=str(MODEL_DIR),
        load_trt=False,
        fp16=False,
    )
    print("CosyVoice3 loaded.")


def tts_speak(text, prompt_audio=None):
    """Synthesize *text* in the voice of the uploaded *prompt_audio* sample.

    Parameters:
        text: the text to speak.
        prompt_audio: ``(sample_rate, np.ndarray)`` tuple as produced by a
            ``gr.Audio(type="numpy")`` component, or ``None``.

    Returns:
        ``((24000, np.ndarray), status_msg)`` on success, or
        ``(None, error_msg)`` on any failure (errors are reported via the
        status string rather than raised, so the UI stays responsive).
    """
    load_cosyvoice()

    if not text.strip():
        return None, "Please enter text."
    if prompt_audio is None:
        return None, "Please upload a short voice sample (3-10 seconds) for voice cloning."

    sr, audio_data = prompt_audio
    audio_tensor = torch.from_numpy(audio_data).float()

    # FIX: gr.Audio(type="numpy") yields integer PCM (typically int16).
    # The original fed raw sample values (±32768) to the model as floats;
    # normalize to [-1, 1] so the prompt is in the expected amplitude range.
    if audio_data.dtype.kind in "iu":
        audio_tensor = audio_tensor / float(2 ** (audio_data.dtype.itemsize * 8 - 1))

    if audio_tensor.dim() == 2:
        # Stereo (samples, channels) -> mono by channel average.
        audio_tensor = audio_tensor.mean(dim=1)
    if audio_tensor.dim() == 1:
        # torchaudio.save expects a (channels, samples) tensor.
        audio_tensor = audio_tensor.unsqueeze(0)

    if sr != 16000:
        # Model expects 16 kHz prompt audio.
        audio_tensor = torchaudio.transforms.Resample(sr, 16000)(audio_tensor)

    # FIX: close the handle before torchaudio writes to the path — required on
    # Windows (the file can't be reopened while held open) and avoids leaking
    # a file descriptor. delete=False keeps the path valid after close.
    prompt_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    prompt_file.close()

    try:
        # FIX: save inside try/finally so a write failure still cleans up
        # the temp file and is reported through the status string.
        torchaudio.save(prompt_file.name, audio_tensor, 16000)

        prompt_text = "You are a helpful assistant.<|endofprompt|>"
        speech_list = [
            result["tts_speech"]
            for result in cosyvoice.inference_zero_shot(
                text, prompt_text, prompt_file.name, stream=False, speed=1.0
            )
        ]
        # FIX: torch.concat on an empty list raises an opaque error; report
        # the condition explicitly instead.
        if not speech_list:
            return None, "TTS Error: model produced no audio."

        output_np = torch.concat(speech_list, dim=1).numpy().flatten()
        # CosyVoice3 synthesizes at 24 kHz output.
        return (24000, output_np), "Speech generated successfully!"
    except Exception as e:
        # Boundary handler: surface the error in the UI status box.
        return None, f"TTS Error: {str(e)}"
    finally:
        if os.path.exists(prompt_file.name):
            os.remove(prompt_file.name)


# ============================================================
# Gradio Interface
# ============================================================
with gr.Blocks(title="CosyVoice3 TTS") as demo:
    gr.Markdown("""
    # 🔊 CosyVoice3 – Text-to-Speech
    Upload a short voice sample (3-10 seconds), enter text, and generate speech in that voice.
    """)

    with gr.Row():
        with gr.Column():
            tts_text = gr.Textbox(
                label="Text to Speak",
                value="Hello, welcome to the text to speech demo.",
                lines=3,
            )
            prompt_audio = gr.Audio(
                sources=["upload"],
                type="numpy",
                label="Voice Sample (3-10 sec)",
            )
            generate_btn = gr.Button("Generate Speech", variant="primary")
        with gr.Column():
            tts_audio = gr.Audio(label="Generated Speech")
            tts_status = gr.Textbox(label="Status")

    generate_btn.click(tts_speak, [tts_text, prompt_audio], [tts_audio, tts_status])


if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable inside containers.
    demo.launch(server_name="0.0.0.0")