# FINAL, ROBUST TTS Space: app.py
# This version uses a stable model loading method to avoid all previous errors.
import os
import subprocess
import sys
import tempfile

import gradio as gr
import torch
import torchaudio

# --- FIX for MeCab/unidic START ---
# This command downloads the necessary Japanese dictionary for the TTS library.
# BUG FIX: use sys.executable instead of a bare 'python' so the download runs
# in the same interpreter/venv as this process ('python' on PATH may point at
# a different environment, or not exist at all on some hosts).
print("Fix: Triggering unidic download...")
subprocess.run([sys.executable, "-m", "unidic", "download"], check=False)
print("Fix: Unidic download command executed.")
# --- FIX for MeCab/unidic END ---

# --- Import necessary classes from the TTS library ---
# NOTE: imported after the unidic fix on purpose — TTS's Japanese tokenizer
# needs the dictionary present at import/use time.
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts
from TTS.utils.manage import ModelManager
from TTS.utils.generic_utils import get_user_data_dir

# --- Configuration & Model Loading (Happens ONCE at startup) ---
DEFAULT_SPEAKER_WAV = "tutor_voice.wav"  # fallback reference voice shipped with the Space
device = "cpu"
print(f"TTS Service: Using device: {device}")

print("TTS Service: Downloading model if not present...")
model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
ModelManager().download_model(model_name)
# ModelManager stores models under the user data dir with '/' mapped to '--'.
model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
print(f"TTS Service: Model downloaded to: {model_path}")

print("TTS Service: Loading model config...")
config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))

print("TTS Service: Initializing model...")
model = Xtts.init_from_config(config)

print("TTS Service: Loading model checkpoint...")
model.load_checkpoint(
    config,
    checkpoint_path=os.path.join(model_path, "model.pth"),
    vocab_path=os.path.join(model_path, "vocab.json"),
    eval=True,
    use_deepspeed=False  # Important for CPU inference
)
model.to(device)
print("TTS Service: Model loaded successfully.")


# --- The Core API Function ---
def synthesize(text_to_speak, speaker_wav_path):
    """Clone the voice in ``speaker_wav_path`` and speak ``text_to_speak``.

    Parameters
    ----------
    text_to_speak : str
        Text to synthesize (language is fixed to English for this service).
    speaker_wav_path : str or None
        Path to a reference speaker WAV. Falls back to ``DEFAULT_SPEAKER_WAV``
        when the path is missing or ``None`` (``gr.File`` yields ``None`` when
        the user clears the input).

    Returns
    -------
    str
        Path to the generated 24 kHz WAV file.

    Raises
    ------
    gr.Error
        If no usable speaker WAV exists, or synthesis fails for any reason.
    """
    # BUG FIX: the original called os.path.exists(speaker_wav_path) directly,
    # which raises TypeError when Gradio passes None (cleared file input).
    if not speaker_wav_path or not os.path.exists(speaker_wav_path):
        speaker_wav_path = DEFAULT_SPEAKER_WAV
    if not os.path.exists(speaker_wav_path):
        raise gr.Error("Default 'tutor_voice.wav' is missing!")

    # BUG FIX: a fixed "output.wav" gets clobbered when two requests overlap;
    # give every request its own temporary output file instead.
    fd, output_wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # torchaudio reopens the path itself; keep only the name

    try:
        print(f"TTS Service: Synthesizing text: '{text_to_speak[:40]}...'")
        gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
            audio_path=speaker_wav_path
        )
        print("TTS Service: Performing inference...")
        out = model.inference(
            text_to_speak,
            "en",
            gpt_cond_latent,
            speaker_embedding,
            temperature=0.7,
        )
        # XTTS returns a raw 1-D float waveform at 24 kHz; torchaudio expects
        # a (channels, samples) tensor, hence the unsqueeze.
        torchaudio.save(output_wav_path, torch.tensor(out["wav"]).unsqueeze(0), 24000)
        print(f"TTS Service: Audio saved to '{output_wav_path}'")
        return output_wav_path
    except Exception as e:
        # Surface the failure to the Gradio client rather than a bare 500.
        print(f"An error occurred during synthesis: {e}")
        raise gr.Error(f"Failed to synthesize audio. Error: {e}")


# --- Build the Gradio API Interface ---
with gr.Blocks() as app:
    gr.Markdown("# EveryPrep XII - Custom TTS Voice Service")
    gr.Interface(
        fn=synthesize,
        inputs=[
            gr.Textbox(label="Text to Synthesize", value="This is a test of the stable TTS service."),
            gr.File(label="Speaker WAV (Optional)", value=DEFAULT_SPEAKER_WAV)
        ],
        outputs=gr.Audio(label="Synthesized Audio"),
        title="TTS API Test Interface",
        api_name="synthesize"
    )

# --- Launch the App ---
app.launch()