"""Gradio demo for Fun-CosyVoice3 zero-shot and instruction-based TTS.

Importing this module has side effects: it clones the CosyVoice repository
and downloads the model weights into the current working directory when they
are missing, extends ``sys.path`` so the cloned package can be imported, and
builds the Gradio ``demo`` object.  The web server is only started when the
file is executed as a script.
"""

import os
import subprocess
import sys
import tempfile
import threading
import traceback
from pathlib import Path

import gradio as gr
import librosa
import numpy as np
import torch
import torchaudio
import torchaudio.transforms as T

print("=" * 60)
print("🎙️ Fun-CosyVoice3 TTS Initialization")
print("=" * 60)

# Step 1: Setup directories
print("\n📁 Step 1: Setting up directories...")
WORK_DIR = Path.cwd()
COSYVOICE_DIR = WORK_DIR / "CosyVoice"
MODEL_DIR = COSYVOICE_DIR / "pretrained_models" / "Fun-CosyVoice3-0.5B"
TTSFRD_DIR = COSYVOICE_DIR / "pretrained_models" / "CosyVoice-ttsfrd"
MATCHA_DIR = COSYVOICE_DIR / "third_party" / "Matcha-TTS"

print(f"Working directory: {WORK_DIR}")
print(f"CosyVoice directory: {COSYVOICE_DIR}")
print(f"Model directory: {MODEL_DIR}")


def _download_snapshot(repo_id: str, target_dir: Path) -> None:
    """Download the Hugging Face repo *repo_id* into *target_dir*."""
    # Imported lazily so the dependency is only needed when a download runs.
    from huggingface_hub import snapshot_download

    snapshot_download(
        repo_id,
        local_dir=str(target_dir),
        local_dir_use_symlinks=False,
    )


# Step 2: Clone CosyVoice if needed
if not COSYVOICE_DIR.exists():
    print("\n📥 Step 2: Cloning CosyVoice repository...")
    try:
        subprocess.run(
            [
                "git",
                "clone",
                "--recursive",
                "https://github.com/FunAudioLLM/CosyVoice.git",
                str(COSYVOICE_DIR),
            ],
            check=True,
        )
        print("✅ Repository cloned successfully")
    except Exception as e:
        # Nothing below can work without the repository: report and abort.
        print(f"❌ Failed to clone repository: {e}")
        raise
else:
    print("\n✅ Step 2: CosyVoice repository already exists")

# Step 3: Download models
if not MODEL_DIR.exists():
    print("\n📥 Step 3: Downloading models (this may take a few minutes)...")
    try:
        print("Downloading Fun-CosyVoice3-0.5B-2512...")
        _download_snapshot("FunAudioLLM/Fun-CosyVoice3-0.5B-2512", MODEL_DIR)
        print("✅ Model downloaded successfully")
    except Exception as e:
        print(f"❌ Failed to download model: {e}")
        raise
else:
    print("\n✅ Step 3: Models already exist")

# Step 4: Download ttsfrd (optional)
if not TTSFRD_DIR.exists():
    print("\n📥 Step 4: Downloading ttsfrd...")
    try:
        _download_snapshot("FunAudioLLM/CosyVoice-ttsfrd", TTSFRD_DIR)
        print("✅ ttsfrd downloaded successfully")
    except Exception as e:
        # Optional component: startup continues without it.
        print(f"⚠️ Failed to download ttsfrd (will use WeText): {e}")
else:
    print("\n✅ Step 4: ttsfrd already exists")

# Step 5: Add to Python path
print("\n🔧 Step 5: Configuring Python path...")
sys.path.insert(0, str(COSYVOICE_DIR))
sys.path.insert(0, str(MATCHA_DIR))
print(f"Added to path: {COSYVOICE_DIR}")
print(f"Added to path: {MATCHA_DIR}")

# Step 6: Import CosyVoice (only importable after the sys.path changes above)
print("\n📦 Step 6: Importing CosyVoice...")
try:
    from cosyvoice.cli.cosyvoice import AutoModel as CosyVoiceAutoModel
    from cosyvoice.utils.file_utils import load_wav
    from cosyvoice.utils.common import set_all_random_seed
    print("✅ CosyVoice imported successfully")
except Exception as e:
    print(f"❌ Failed to import CosyVoice: {e}")
    raise

print("\n" + "=" * 60)
print("✅ Initialization completed successfully!")
print("=" * 60 + "\n")

# Global variables
cosyvoice = None      # lazily created model instance, see load_model()
target_sr = 24000     # sample rate used for processed prompts and returned audio
prompt_sr = 16000     # minimum accepted prompt sample rate
max_val = 0.8         # peak amplitude ceiling applied in postprocess()
top_db = 60           # silence threshold passed to librosa.effects.trim
hop_length = 220
win_length = 440

# Handlers run concurrently (see demo.queue below); guard the one-time load.
_model_lock = threading.Lock()

_PROCESS_FAILED_MSG = "❌ Failed to process audio"


def load_model():
    """Return the shared CosyVoice model, loading it on first use.

    Raises:
        gr.Error: if the model cannot be constructed.
    """
    global cosyvoice
    if cosyvoice is None:
        with _model_lock:
            # Re-check inside the lock: another request may have loaded it
            # while this one was waiting.
            if cosyvoice is None:
                print("🚀 Loading CosyVoice model...")
                try:
                    cosyvoice = CosyVoiceAutoModel(
                        model_dir=str(MODEL_DIR),
                        load_trt=False,
                        fp16=False,
                    )
                    print("✅ Model loaded successfully!")
                except Exception as e:
                    print(f"❌ Error loading model: {e}")
                    traceback.print_exc()
                    raise gr.Error(f"Failed to load model: {e}") from e
    return cosyvoice


def postprocess(wav_path):
    """Trim silence, limit the peak and append 0.2 s of silence, in place.

    The file at *wav_path* is re-read at ``target_sr`` and overwritten.
    This step is best-effort: on any failure a warning is printed and the
    file is left as it was.

    Returns:
        *wav_path*, unchanged.
    """
    try:
        speech = load_wav(wav_path, target_sr=target_sr, min_sr=16000)

        # Trim silence from beginning and end
        speech, _ = librosa.effects.trim(
            speech,
            top_db=top_db,
            frame_length=win_length,
            hop_length=hop_length,
        )

        # Normalize if too loud
        peak = speech.abs().max()
        if peak > max_val:
            speech = speech / peak * max_val

        # Add silence at the end
        tail = torch.zeros(1, int(target_sr * 0.2))
        speech = torch.concat([speech, tail], dim=1)

        # Save back
        torchaudio.save(wav_path, speech, target_sr)
    except Exception as e:
        print(f"⚠️ Postprocess warning: {e}")
    return wav_path


def _remove_quietly(path) -> None:
    """Delete *path* if it is set; a failed cleanup never raises."""
    if not path:
        return
    try:
        os.remove(path)
    except FileNotFoundError:
        pass
    except OSError as e:
        print(f"⚠️ Could not remove temporary file {path}: {e}")


def _to_float32(audio_data: np.ndarray) -> np.ndarray:
    """Convert samples to float32, scaling signed-integer PCM to [-1, 1)."""
    if np.issubdtype(audio_data.dtype, np.signedinteger):
        # 32768 for int16, 2147483648 for int32, and so on.
        full_scale = -float(np.iinfo(audio_data.dtype).min)
        return audio_data.astype(np.float32) / np.float32(full_scale)
    return audio_data.astype(np.float32, copy=False)


def _to_mono(audio_data: np.ndarray) -> np.ndarray:
    """Reduce a (samples, channels) array to 1-D.

    Two channels are averaged; any other channel count keeps channel 0.
    """
    if audio_data.ndim == 2:
        channels = audio_data.shape[1]
        print(f"   Converting stereo ({channels} channels) to mono...")
        if channels == 2:
            audio_data = audio_data.mean(axis=1)
        else:
            audio_data = audio_data[:, 0]
    return audio_data.flatten()


def _prepare_prompt_audio(audio_input):
    """Convert a ``(sample_rate, ndarray)`` pair into a processed WAV file.

    Handles dtype normalisation, stereo -> mono, truncation to 30 s,
    resampling to ``target_sr`` and the postprocess() step.

    Returns:
        ``(path, None)`` on success or ``(None, message)`` on failure.  The
        caller owns the returned file and must delete it.
    """
    if audio_input is None:
        return None, _PROCESS_FAILED_MSG

    temp_path = None
    try:
        sr, audio_data = audio_input
        audio_data = np.asarray(audio_data)
        print(
            f"📊 Input audio - shape: {audio_data.shape}, "
            f"dtype: {audio_data.dtype}, sr: {sr}Hz"
        )

        audio_data = _to_mono(_to_float32(audio_data))

        # Check and adjust duration
        duration = len(audio_data) / sr
        print(f"   Duration: {duration:.2f}s")
        if duration < 1:
            return None, "❌ Audio too short (minimum 1 second)"
        if duration > 30:
            print(f"   ⚠️ Truncating audio from {duration:.2f}s to 30s")
            audio_data = audio_data[:sr * 30]

        # (1, samples) layout expected by torchaudio
        audio_tensor = torch.from_numpy(audio_data).float().unsqueeze(0)
        print(f"   Tensor shape: {audio_tensor.shape}")

        if sr != target_sr:
            print(f"   🔄 Resampling from {sr}Hz to {target_sr}Hz...")
            audio_tensor = T.Resample(sr, target_sr)(audio_tensor)
            sr = target_sr

        # mkstemp creates the file atomically (mktemp only picks a name).
        fd, temp_path = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        torchaudio.save(temp_path, audio_tensor, sr)

        # Post-process (trim silence, normalize)
        temp_path = postprocess(temp_path)

        print(f"   ✅ Audio processed and saved: {os.path.basename(temp_path)}")
        return temp_path, None
    except Exception as e:
        print(f"❌ Error processing audio: {e}")
        traceback.print_exc()
        _remove_quietly(temp_path)
        return None, _PROCESS_FAILED_MSG


def process_audio(audio_input):
    """Convert audio input to the proper format for CosyVoice.

    Handles: stereo->mono, different dtypes, resampling.

    Args:
        audio_input: ``(sample_rate, ndarray)`` pair, or ``None``.

    Returns:
        Path of a temporary WAV file (the caller must delete it), or
        ``None`` when the input is missing, shorter than 1 s, or cannot be
        processed.
    """
    path, _error = _prepare_prompt_audio(audio_input)
    return path


def _synthesize(inference_fn, tts_text, prompt, prompt_audio_path, seed, speed):
    """Run *inference_fn* and return ``(target_sr, samples)`` for Gradio.

    *inference_fn* is one of the model's inference methods; it is called
    with ``(tts_text, prompt, prompt_audio_path, stream=False, speed=speed)``
    and every yielded ``"tts_speech"`` segment is concatenated.
    """
    # An empty seed field falls back to 0; the seeding helper needs an int.
    set_all_random_seed(int(seed) if seed is not None else 0)

    segments = [
        chunk["tts_speech"]
        for chunk in inference_fn(
            tts_text,
            prompt,
            prompt_audio_path,
            stream=False,
            speed=speed,
        )
    ]
    output_speech = torch.concat(segments, dim=1)

    print(f"   ✅ Generated audio shape: {output_speech.shape}")
    print("✅ Speech generated successfully!\n")

    return target_sr, output_speech.cpu().numpy().flatten()


def zero_shot_tts(tts_text, prompt_text, prompt_audio, seed, speed):
    """Zero-shot TTS synthesis - following official code structure.

    Args:
        tts_text: text to synthesize (at most 200 characters).
        prompt_text: transcription of the reference audio.
        prompt_audio: ``(sample_rate, ndarray)`` reference audio.
        seed: random seed.
        speed: speed factor forwarded to the model.

    Returns:
        ``((sample_rate, samples), status)`` on success, ``(None, status)``
        otherwise.  Errors are reported through the status string.
    """
    prompt_audio_path = None
    try:
        # Validation
        if not tts_text or not tts_text.strip():
            return None, "❌ Please provide text to synthesize"
        if len(tts_text) > 200:
            return None, "❌ Text too long, please keep within 200 characters"
        if not prompt_audio:
            return None, "❌ Please upload reference audio"
        if not prompt_text or not prompt_text.strip():
            return None, "❌ Please provide prompt text"

        model = load_model()

        prompt_audio_path, error = _prepare_prompt_audio(prompt_audio)
        if prompt_audio_path is None:
            return None, error

        # Check sample rate
        info = torchaudio.info(prompt_audio_path)
        if info.sample_rate < prompt_sr:
            return None, f"❌ Audio sample rate {info.sample_rate} is below {prompt_sr}Hz"

        # Check duration
        duration = info.num_frames / info.sample_rate
        if duration > 10:
            return None, "❌ Please keep prompt audio within 10 seconds"

        # Clean inputs
        tts_text = tts_text.strip()
        prompt_text = prompt_text.strip()

        # Build prompt following official format
        full_prompt = f"You are a helpful assistant.<|endofprompt|>{prompt_text}"

        print("\n🎵 Generating speech...")
        print(f"   TTS text: '{tts_text[:100]}{'...' if len(tts_text) > 100 else ''}'")
        print(f"   Prompt text: '{prompt_text[:50]}{'...' if len(prompt_text) > 50 else ''}'")
        print(f"   Full prompt: '{full_prompt[:80]}{'...' if len(full_prompt) > 80 else ''}'")
        print(f"   Seed: {seed}, Speed: {speed}")

        audio = _synthesize(
            model.inference_zero_shot,
            tts_text,
            full_prompt,
            prompt_audio_path,
            seed,
            speed,
        )
        return audio, "✅ Success!"
    except Exception as e:
        print(f"❌ Error in zero_shot_tts: {e}")
        traceback.print_exc()
        return None, f"❌ Error: {str(e)}"
    finally:
        # Runs on every exit path, including the early validation returns.
        _remove_quietly(prompt_audio_path)


def instruct_tts(tts_text, instruct_text, prompt_audio, seed, speed):
    """Instruction-based TTS - following official code structure.

    Args:
        tts_text: text to synthesize (at most 200 characters).
        instruct_text: natural-language instruction for the model.
        prompt_audio: ``(sample_rate, ndarray)`` reference audio.
        seed: random seed.
        speed: speed factor forwarded to the model.

    Returns:
        ``((sample_rate, samples), status)`` on success, ``(None, status)``
        otherwise.  Errors are reported through the status string.
    """
    prompt_audio_path = None
    try:
        # Validation
        if not tts_text or not tts_text.strip():
            return None, "❌ Please provide text to synthesize"
        if len(tts_text) > 200:
            return None, "❌ Text too long, please keep within 200 characters"
        if not prompt_audio:
            return None, "❌ Please upload reference audio"
        if not instruct_text or not instruct_text.strip():
            return None, "❌ Please provide instruction text"

        model = load_model()

        prompt_audio_path, error = _prepare_prompt_audio(prompt_audio)
        if prompt_audio_path is None:
            return None, error

        # Clean inputs
        tts_text = tts_text.strip()
        instruct_text = instruct_text.strip()

        print("\n📝 Generating speech with instruction...")
        print(f"   TTS text: '{tts_text[:100]}{'...' if len(tts_text) > 100 else ''}'")
        print(f"   Instruction: '{instruct_text}'")
        print(f"   Seed: {seed}, Speed: {speed}")

        audio = _synthesize(
            model.inference_instruct2,
            tts_text,
            instruct_text,
            prompt_audio_path,
            seed,
            speed,
        )
        return audio, "✅ Success!"
    except Exception as e:
        print(f"❌ Error: {e}")
        traceback.print_exc()
        return None, f"❌ Error: {str(e)}"
    finally:
        _remove_quietly(prompt_audio_path)


# Instruction options (from official code)
instruct_options = [
    "You are a helpful assistant. 请用广东话表达。<|endofprompt|>",
    "You are a helpful assistant. 请用尽可能快地语速说一句话。<|endofprompt|>",
    "You are a helpful assistant. 请用正常的语速说一句话。<|endofprompt|>",
    "You are a helpful assistant. 请用慢一点的语速说一句话。<|endofprompt|>",
    "You are a helpful assistant. Please speak in a professional tone.<|endofprompt|>",
    "You are a helpful assistant. Please speak in a friendly tone.<|endofprompt|>",
]

# Create Gradio interface
with gr.Blocks(title="Fun-CosyVoice3 TTS") as demo:
    gr.Markdown("""
    # 🎙️ Fun-CosyVoice3-0.5B Text-to-Speech

    Advanced multilingual zero-shot TTS system supporting **9 languages** and **18+ Chinese dialects**.

    Based on the official [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) implementation.
    """)

    with gr.Tabs():
        # Tab 1: Zero-Shot TTS
        with gr.Tab("🎯 Zero-Shot Voice Cloning (3s Fast Cloning)"):
            gr.Markdown("""
            ### Clone any voice with 3-10 seconds of reference audio

            **Steps:**
            1. Upload or record reference audio (≤10s, ≥16kHz)
            2. Enter the **prompt text** (transcription of the reference audio)
            3. Enter the **text to synthesize** (what you want the voice to say)
            4. Click Generate
            """)

            with gr.Row():
                with gr.Column():
                    zs_tts_text = gr.Textbox(
                        label="Text to synthesize (what will be spoken)",
                        placeholder="Enter the text you want to synthesize...",
                        lines=2,
                        value="Her handwriting is very neat, which suggests she likes things tidy.",
                    )
                    zs_prompt_audio = gr.Audio(
                        label="Reference audio (upload or record)",
                        type="numpy",
                    )
                    zs_prompt_text = gr.Textbox(
                        label="Prompt text (transcription of reference audio)",
                        placeholder="Enter what is said in the reference audio...",
                        lines=2,
                        value="",
                    )
                    with gr.Row():
                        zs_seed = gr.Number(label="Random seed", value=0, precision=0)
                        zs_speed = gr.Slider(
                            label="Speed",
                            minimum=0.5,
                            maximum=2.0,
                            value=1.0,
                            step=0.1,
                        )
                    zs_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

                with gr.Column():
                    zs_output = gr.Audio(label="Generated speech")
                    zs_status = gr.Textbox(label="Status", interactive=False)

            zs_btn.click(
                fn=zero_shot_tts,
                inputs=[zs_tts_text, zs_prompt_text, zs_prompt_audio, zs_seed, zs_speed],
                outputs=[zs_output, zs_status],
            )

            gr.Markdown("""
            **Important:**
            - **Text to synthesize**: The new text you want to hear in the cloned voice
            - **Prompt text**: Transcription of what is said in your reference audio
            - **Reference audio**: 3-10 seconds of clear speech

            **Example:**
            - Reference audio: Someone saying "Hello, how are you?"
            - Prompt text: "Hello, how are you?"
            - Text to synthesize: "This is a test of voice cloning"
            - Result: "This is a test of voice cloning" in the cloned voice
            """)

        # Tab 2: Instruction-Based TTS
        with gr.Tab("📝 Instruction-Based Control (Natural Language)"):
            gr.Markdown("""
            ### Control voice characteristics with natural language instructions

            **Steps:**
            1. Upload or record reference audio
            2. Select or enter instruction (speed, dialect, emotion)
            3. Enter text to synthesize
            4. Click Generate
            """)

            with gr.Row():
                with gr.Column():
                    inst_tts_text = gr.Textbox(
                        label="Text to synthesize",
                        placeholder="Enter your text...",
                        lines=2,
                        value="Welcome to the natural language control demo.",
                    )
                    inst_prompt_audio = gr.Audio(
                        label="Reference audio",
                        type="numpy",
                    )
                    inst_text = gr.Dropdown(
                        label="Instruction",
                        choices=instruct_options,
                        value=instruct_options[0],
                    )
                    with gr.Row():
                        inst_seed = gr.Number(label="Random seed", value=0, precision=0)
                        inst_speed = gr.Slider(
                            label="Speed",
                            minimum=0.5,
                            maximum=2.0,
                            value=1.0,
                            step=0.1,
                        )
                    inst_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

                with gr.Column():
                    inst_output = gr.Audio(label="Generated speech")
                    inst_status = gr.Textbox(label="Status", interactive=False)

            inst_btn.click(
                fn=instruct_tts,
                inputs=[inst_tts_text, inst_text, inst_prompt_audio, inst_seed, inst_speed],
                outputs=[inst_output, inst_status],
            )

            gr.Markdown("""
            **Example instructions:**
            - "请用广东话表达" (Speak in Cantonese)
            - "请用尽可能快地语速说" (Speak as fast as possible)
            - "Please speak in a professional tone"
            """)

    gr.Markdown("""
    ---
    ### 📋 Supported Languages & Dialects

    **Languages:** Chinese, English, Japanese, Korean, German, Spanish, French, Italian, Russian

    **Chinese Dialects:** Guangdong, Minnan, Sichuan, Dongbei, Shanxi, Shanghai, Tianjin, Shandong, and more

    ### ⚡ Performance
    - Model: Fun-CosyVoice3-0.5B (500M parameters)
    - Sample Rate: 24kHz
    - Latency: ~5-10s on CPU, ~2-3s on GPU

    ### 📚 Resources
    [Paper](https://arxiv.org/abs/2505.17589) • [GitHub](https://github.com/FunAudioLLM/CosyVoice) • [Model](https://huggingface.co/FunAudioLLM/Fun-CosyVoice3-0.5B-2512)
    """)

if __name__ == "__main__":
    print("\n🚀 Launching Gradio interface...")
    demo.queue(max_size=10, default_concurrency_limit=2)
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
    )