File size: 3,105 Bytes
f2fb137
3a12dbe
a960ed3
4940f5b
 
4808325
 
4e9d040
40dde11
 
 
 
 
 
 
4808325
d2e3831
a960ed3
cc1d381
 
a960ed3
3a926a7
ded29d3
a960ed3
 
 
 
 
 
ded29d3
a960ed3
f44f660
cc1d381
a960ed3
3a926a7
a960ed3
3a926a7
 
 
a960ed3
3a926a7
4940f5b
a960ed3
4940f5b
 
 
c378619
a960ed3
4940f5b
 
a960ed3
4940f5b
 
 
 
a960ed3
4940f5b
 
 
 
 
 
 
 
 
 
07f51a1
a960ed3
7a6d0bc
4940f5b
07f51a1
ab2cf02
4940f5b
07f51a1
4940f5b
 
47ccc28
a960ed3
 
47ccc28
7a6d0bc
 
a960ed3
3a12dbe
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import os
import shutil
import subprocess  # retained: kept in case later parts of the file use it
import tempfile
import warnings

import gradio as gr
import librosa
import soundfile as sf

warnings.filterwarnings("ignore", category=FutureWarning)

# Clear the cached XTTS model to force a clean re-download on next load.
# Using shutil.rmtree instead of shelling out to `rm -rf`: portable across
# platforms and avoids spawning a subprocess for a simple directory delete.
model_cache_path = os.path.expanduser(
    "~/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v2"
)
if os.path.exists(model_cache_path):
    shutil.rmtree(model_cache_path)


# ===== Step 1: Allowlist Required Classes for PyTorch >= 2.6 =====
# PyTorch 2.6 switched torch.load to weights_only=True by default, which
# rejects arbitrary pickled classes. The XTTS checkpoint embeds these config
# objects, so they must be explicitly registered as safe to deserialize.
from torch.serialization import add_safe_globals
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import XttsAudioConfig, XttsArgs
from TTS.config.shared_configs import BaseDatasetConfig

add_safe_globals([
    XttsConfig,
    XttsAudioConfig,
    XttsArgs,
    BaseDatasetConfig
])

# ===== Step 2: Agree to Coqui TTS Terms of Service =====
# Setting this env var pre-accepts the CPML license prompt so the model
# download does not block waiting for interactive confirmation.
os.environ["COQUI_TOS_AGREED"] = "1"

# ===== Step 3: Load the Coqui XTTS Model =====
# Downloads the multilingual XTTS v2 checkpoint on first run (cache was
# cleared above), then loads it into memory. This happens once at startup.
from TTS.api import TTS

tts = TTS(
    model_name="tts_models/multilingual/multi-dataset/xtts_v2",
    progress_bar=True,
    gpu=False  # Set to True if using CUDA
)

# ===== Step 4: Define Voice Cloning Inference Function =====
def text_to_speech_clone(text, voice_sample):
    """Synthesize Hindi speech in the voice of the supplied reference sample.

    Parameters
    ----------
    text : str
        Hindi text to synthesize.
    voice_sample : str | None
        Filesystem path to a short reference recording (Gradio's
        ``gr.Audio(type="filepath")`` supplies a path, or None when the
        user provided no audio).

    Returns
    -------
    str | None
        Path to the generated WAV file, or None when no voice sample was
        given. (Bug fix: the original returned a ``(message, None)`` tuple
        here, which does not match the single ``gr.Audio`` output declared
        in the Gradio interface.)
    """
    if voice_sample is None:
        return None

    # Resample the reference audio to 22,050 Hz — the rate the original code
    # used; presumably what the XTTS speaker encoder expects (TODO confirm).
    sample_wav, sample_rate = librosa.load(voice_sample, sr=22050)

    # Persist the normalized sample so the TTS engine can read it from disk.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_sample:
        sf.write(tmp_sample.name, sample_wav, sample_rate)
        voice_sample_path = tmp_sample.name

    try:
        # Generate the cloned Hindi speech into another temporary WAV file.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_output:
            tts.tts_to_file(
                text=text,
                file_path=tmp_output.name,
                speaker_wav=voice_sample_path,
                language="hi"
            )
            output_path = tmp_output.name
    finally:
        # Bug fix: the resampled reference file was never deleted, leaking one
        # temp file per request. The output file is returned to Gradio and is
        # intentionally left in place for it to serve.
        try:
            os.remove(voice_sample_path)
        except OSError:
            pass

    return output_path

# ===== Step 5: Gradio UI Interface =====
# Single text box + single audio upload in, single audio out. Both audio
# components use type="filepath", matching the path-based inference function.
iface = gr.Interface(
    fn=text_to_speech_clone,
    inputs=[
        gr.Textbox(lines=5, placeholder="हिंदी में टेक्स्ट दर्ज करें...", label="Text"),
        gr.Audio(type="filepath", label="Voice Sample (Hindi speech)")
    ],
    outputs=gr.Audio(type="filepath", label="Generated Cloned Speech"),
    title="Hindi Text-to-Speech with Voice Cloning",
    description=(
        "यह ऐप हिंदी टेक्स्ट से वॉयस क्लोनिंग के साथ स्पीच जेनरेट करता है।\n"
        "एक छोटी सी हिंदी आवाज़ की रिकॉर्डिंग (5-10 सेकंड) अपलोड करें, और यह उसी आवाज़ में टेक्स्ट पढ़कर सुनाएगा।"
    )
)

# ===== Step 6: Launch the Web App =====
# Blocks here serving the app; runs until interrupted.
iface.launch()