File size: 3,307 Bytes
ca90f9d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
# FINAL, ROBUST TTS Space: app.py
# This version uses a stable model loading method to avoid all previous errors.

import gradio as gr
import torch
import torchaudio
import os

# --- FIX for MeCab/unidic START ---
# Coqui TTS's Japanese tokenizer needs the unidic dictionary; without it,
# importing the TTS modules below can fail with a MeCab error.
# NOTE(review): os.system swallows failures — if the download fails
# (offline build), the import below may still crash. TODO confirm acceptable.
print("Fix: Triggering unidic download...")
os.system('python -m unidic download')
print("Fix: Unidic download command executed.")
# --- FIX for MeCab/unidic END ---

# --- Import necessary classes from the TTS library ---
# Imported AFTER the unidic fix above, deliberately out of PEP 8 order.
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.manage import ModelManager
from TTS.utils.generic_utils import get_user_data_dir

# --- Configuration & Model Loading (Happens ONCE at startup) ---
# Fallback reference voice used when the caller supplies no speaker WAV.
DEFAULT_SPEAKER_WAV = "tutor_voice.wav"
# CPU inference is forced; this Space is assumed to have no GPU.
device = "cpu"
print(f"TTS Service: Using device: {device}")

# Download the multilingual XTTS v2 checkpoint via the TTS ModelManager,
# then resolve its on-disk cache path manually ("/" becomes "--" in the
# cache directory name, mirroring ModelManager's layout).
print("TTS Service: Downloading model if not present...")
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
ModelManager().download_model(model_name)
model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
print(f"TTS Service: Model downloaded to: {model_path}")

print("TTS Service: Loading model config...")
config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))

print("TTS Service: Initializing model...")
model = Xtts.init_from_config(config)

# Load weights in eval mode; deepspeed must stay off for CPU inference.
print("TTS Service: Loading model checkpoint...")
model.load_checkpoint(
    config,
    checkpoint_path=os.path.join(model_path, "model.pth"),
    vocab_path=os.path.join(model_path, "vocab.json"),
    eval=True,
    use_deepspeed=False # Important for CPU inference
)
model.to(device)
print("TTS Service: Model loaded successfully.")


# --- The Core API Function ---
def synthesize(text_to_speak, speaker_wav_path):
    """Synthesize `text_to_speak` in the voice of `speaker_wav_path`.

    Args:
        text_to_speak: English text to render as speech.
        speaker_wav_path: Path to a reference WAV for voice cloning. Gradio's
            `gr.File` may pass ``None`` (nothing uploaded) or a tempfile
            wrapper object exposing ``.name`` instead of a plain string, so
            all three forms are normalized here.

    Returns:
        Path to the generated WAV file ("output.wav", 24 kHz mono).

    Raises:
        gr.Error: If the default voice file is missing or synthesis fails.
    """
    # BUG FIX: the original called os.path.exists() directly on the input,
    # which raises TypeError when gr.File delivers None (no upload) or a
    # file wrapper object. Normalize to a string path (or None) first.
    if speaker_wav_path is not None and not isinstance(speaker_wav_path, str):
        speaker_wav_path = getattr(speaker_wav_path, "name", None)
    if not speaker_wav_path or not os.path.exists(speaker_wav_path):
        speaker_wav_path = DEFAULT_SPEAKER_WAV
    if not os.path.exists(speaker_wav_path):
        raise gr.Error("Default 'tutor_voice.wav' is missing!")

    output_wav_path = "output.wav"
    try:
        print(f"TTS Service: Synthesizing text: '{text_to_speak[:40]}...'")
        # Extract the voice-cloning conditioning from the reference audio.
        gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=speaker_wav_path)

        print("TTS Service: Performing inference...")
        out = model.inference(
            text_to_speak, "en", gpt_cond_latent, speaker_embedding, temperature=0.7,
        )
        # XTTS outputs a flat float waveform at 24 kHz; add a channel dim for torchaudio.
        torchaudio.save(output_wav_path, torch.tensor(out["wav"]).unsqueeze(0), 24000)

        print(f"TTS Service: Audio saved to '{output_wav_path}'")
        return output_wav_path

    except Exception as e:
        # Surface the failure to the Gradio UI rather than returning None.
        print(f"An error occurred during synthesis: {e}")
        raise gr.Error(f"Failed to synthesize audio. Error: {e}")


# --- Build the Gradio API Interface ---
# Wrap the Interface in Blocks so a Markdown banner can sit above it.
with gr.Blocks() as app:
    gr.Markdown("# EveryPrep XII - Custom TTS Voice Service")
    gr.Interface(
        fn=synthesize,
        inputs=[
            gr.Textbox(label="Text to Synthesize", value="This is a test of the stable TTS service."),
            # Optional: falls back to DEFAULT_SPEAKER_WAV inside synthesize().
            gr.File(label="Speaker WAV (Optional)", value=DEFAULT_SPEAKER_WAV)
        ],
        outputs=gr.Audio(label="Synthesized Audio"),
        title="TTS API Test Interface",
        # Stable endpoint name so external clients can POST to /synthesize.
        api_name="synthesize"
    )

# --- Launch the App ---
# Blocks here until the server stops; must stay the last statement.
app.launch()