File size: 3,105 Bytes
f2fb137 3a12dbe a960ed3 4940f5b 4808325 4e9d040 40dde11 4808325 d2e3831 a960ed3 cc1d381 a960ed3 3a926a7 ded29d3 a960ed3 ded29d3 a960ed3 f44f660 cc1d381 a960ed3 3a926a7 a960ed3 3a926a7 a960ed3 3a926a7 4940f5b a960ed3 4940f5b c378619 a960ed3 4940f5b a960ed3 4940f5b a960ed3 4940f5b 07f51a1 a960ed3 7a6d0bc 4940f5b 07f51a1 ab2cf02 4940f5b 07f51a1 4940f5b 47ccc28 a960ed3 47ccc28 7a6d0bc a960ed3 3a12dbe |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
import os
import tempfile
import gradio as gr
import librosa
import soundfile as sf
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
# Delete any cached copy of the XTTS v2 model so that the model load further
# down starts from a fresh download.
# NOTE(review): this runs on every start-up, so the model is re-downloaded each
# time the app boots; remove this section once the cache is known to be good.
import shutil
import subprocess

model_cache_path = os.path.expanduser("~/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v2")
if os.path.exists(model_cache_path):
    # Use the standard library instead of shelling out to `rm -rf`, which is
    # only available on POSIX systems.
    if os.path.isdir(model_cache_path) and not os.path.islink(model_cache_path):
        shutil.rmtree(model_cache_path)
    else:
        # A plain file or a symlink sits at the cache location: remove just
        # that entry, as `rm -rf` would have done.
        os.remove(model_cache_path)
# ===== Step 1: Allowlist Required Classes for PyTorch >= 2.6 =====
# The XTTS configuration classes below are registered with torch's
# serialization allowlist before the model is loaded.
from torch.serialization import add_safe_globals
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import XttsAudioConfig, XttsArgs
from TTS.config.shared_configs import BaseDatasetConfig

xtts_allowlisted_classes = [XttsConfig, XttsAudioConfig, XttsArgs, BaseDatasetConfig]
add_safe_globals(xtts_allowlisted_classes)
# ===== Step 2: Agree to Coqui TTS Terms of Service =====
# Set before the TTS API is imported and the model is loaded below.
os.environ["COQUI_TOS_AGREED"] = "1"

# ===== Step 3: Load the Coqui XTTS Model =====
from TTS.api import TTS

xtts_model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
use_gpu = False  # Set to True if using CUDA
tts = TTS(model_name=xtts_model_name, progress_bar=True, gpu=use_gpu)
# ===== Step 4: Define Voice Cloning Inference Function =====
def _remove_quietly(path):
    """Delete *path*, ignoring the error if it is already gone or locked."""
    try:
        os.remove(path)
    except OSError:
        pass


def text_to_speech_clone(text, voice_sample):
    """Synthesize *text* as Hindi speech in the voice of *voice_sample*.

    Args:
        text: The text to speak.
        voice_sample: Path to an audio file containing the reference voice,
            or ``None`` when the user supplied no sample.

    Returns:
        Path to a temporary ``.wav`` file holding the generated speech. The
        file is intentionally left on disk so the caller can read it.

    Raises:
        gr.Error: If no voice sample or no text was provided. The interface
            has a single audio output, so the message is reported through a
            Gradio error rather than returned as a value.
    """
    if voice_sample is None:
        raise gr.Error("Please provide a voice sample audio.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # Load the voice sample audio file at a fixed 22050 Hz sample rate.
    sample_wav, sample_rate = librosa.load(voice_sample, sr=22050)

    # mkstemp + close (rather than writing to a still-open NamedTemporaryFile)
    # so the path can be reopened by other libraries on every platform.
    sample_fd, voice_sample_path = tempfile.mkstemp(suffix=".wav")
    os.close(sample_fd)
    try:
        # Save the sample temporarily as a WAV file for the model to read.
        sf.write(voice_sample_path, sample_wav, sample_rate)

        output_fd, output_path = tempfile.mkstemp(suffix=".wav")
        os.close(output_fd)
        try:
            # Generate cloned Hindi speech and save it to the output file.
            tts.tts_to_file(
                text=text,
                file_path=output_path,
                speaker_wav=voice_sample_path,
                language="hi",
            )
        except BaseException:
            # Synthesis failed: do not leave an empty output file behind.
            _remove_quietly(output_path)
            raise
    finally:
        # The converted sample is only needed during synthesis.
        _remove_quietly(voice_sample_path)

    return output_path
# ===== Step 5: Gradio UI Interface =====
# Input and output components are built first, then wired to the inference function.
text_input = gr.Textbox(lines=5, placeholder="हिंदी में टेक्स्ट दर्ज करें...", label="Text")
voice_input = gr.Audio(type="filepath", label="Voice Sample (Hindi speech)")
speech_output = gr.Audio(type="filepath", label="Generated Cloned Speech")

app_description = (
    "यह ऐप हिंदी टेक्स्ट से वॉयस क्लोनिंग के साथ स्पीच जेनरेट करता है।\n"
    "एक छोटी सी हिंदी आवाज़ की रिकॉर्डिंग (5-10 सेकंड) अपलोड करें, और यह उसी आवाज़ में टेक्स्ट पढ़कर सुनाएगा।"
)

iface = gr.Interface(
    fn=text_to_speech_clone,
    inputs=[text_input, voice_input],
    outputs=speech_output,
    title="Hindi Text-to-Speech with Voice Cloning",
    description=app_description,
)

# ===== Step 6: Launch the Web App =====
iface.launch()
|