"""Gradio web app: Hindi text-to-speech with voice cloning (Coqui XTTS v2).

Running this script loads the XTTS v2 model on CPU and serves a small web UI
that takes Hindi text plus a short reference recording and returns the text
spoken in the reference voice.
"""

import os
import shutil
import subprocess  # noqa: F401 - pre-existing import, kept for code outside this view
import tempfile
import warnings

import gradio as gr
import librosa
import soundfile as sf

# Installed before the torch/TTS imports below so their FutureWarnings are
# silenced as well.
warnings.filterwarnings("ignore", category=FutureWarning)

# Clear the cached XTTS model to force a fresh download.
# NOTE: this runs on every start, so the model is re-downloaded each time the
# app launches. It is only needed once; remove it when the cache is healthy.
model_cache_path = os.path.expanduser(
    "~/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v2"
)
if os.path.exists(model_cache_path):
    shutil.rmtree(model_cache_path)

# ===== Step 1: Allowlist Required Classes for PyTorch >= 2.6 =====
from torch.serialization import add_safe_globals
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import XttsAudioConfig, XttsArgs
from TTS.config.shared_configs import BaseDatasetConfig

add_safe_globals([
    XttsConfig,
    XttsAudioConfig,
    XttsArgs,
    BaseDatasetConfig,
])

# ===== Step 2: Agree to Coqui TTS Terms of Service =====
os.environ["COQUI_TOS_AGREED"] = "1"

# ===== Step 3: Load the Coqui XTTS Model =====
from TTS.api import TTS

tts = TTS(
    model_name="tts_models/multilingual/multi-dataset/xtts_v2",
    progress_bar=True,
    gpu=False,  # Set to True if using CUDA
)

# Sample rate (Hz) the reference recording is resampled to before cloning.
VOICE_SAMPLE_RATE = 22050


# ===== Step 4: Define Voice Cloning Inference Function =====
def _new_temp_wav():
    """Create an empty ``.wav`` temp file and return its path, handle closed."""
    fd, path = tempfile.mkstemp(suffix=".wav")
    # Close our descriptor right away: the audio libraries reopen the file by
    # path, which fails on some platforms while another handle is still open.
    os.close(fd)
    return path


def _discard(path):
    """Delete *path*, tolerating the case where it is already gone."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def text_to_speech_clone(text, voice_sample):
    """Synthesize Hindi speech for *text* in the voice of *voice_sample*.

    Args:
        text: The text to speak.
        voice_sample: Path to the reference recording, or ``None`` when the
            user supplied no audio.

    Returns:
        Path to a temporary ``.wav`` file holding the generated speech. The
        file is not deleted here, so the UI can serve it.

    Raises:
        gr.Error: If the voice sample is missing or the text is blank. The UI
            has a single audio output, so problems are reported as an error
            instead of being returned as a value.
    """
    if voice_sample is None:
        raise gr.Error("Please provide a voice sample audio.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # Decode whatever format was uploaded and resample it.
    sample_wav, sample_rate = librosa.load(voice_sample, sr=VOICE_SAMPLE_RATE)

    # Re-save the sample as a WAV file the model can read from disk.
    voice_sample_path = _new_temp_wav()
    try:
        sf.write(voice_sample_path, sample_wav, sample_rate)

        # Generate cloned Hindi speech and save it to a temp file.
        output_path = _new_temp_wav()
        try:
            tts.tts_to_file(
                text=text,
                file_path=output_path,
                speaker_wav=voice_sample_path,
                language="hi",
            )
        except Exception:
            # Synthesis failed: do not leave a useless output file behind.
            _discard(output_path)
            raise
    finally:
        # The re-encoded sample is only needed during synthesis.
        _discard(voice_sample_path)

    return output_path


# ===== Step 5: Gradio UI Interface =====
iface = gr.Interface(
    fn=text_to_speech_clone,
    inputs=[
        gr.Textbox(lines=5, placeholder="हिंदी में टेक्स्ट दर्ज करें...", label="Text"),
        gr.Audio(type="filepath", label="Voice Sample (Hindi speech)"),
    ],
    outputs=gr.Audio(type="filepath", label="Generated Cloned Speech"),
    title="Hindi Text-to-Speech with Voice Cloning",
    description=(
        "यह ऐप हिंदी टेक्स्ट से वॉयस क्लोनिंग के साथ स्पीच जेनरेट करता है।\n"
        "एक छोटी सी हिंदी आवाज़ की रिकॉर्डिंग (5-10 सेकंड) अपलोड करें, और यह उसी आवाज़ में टेक्स्ट पढ़कर सुनाएगा।"
    ),
)

# ===== Step 6: Launch the Web App =====
iface.launch()