# Source: Hugging Face Space "Space2", file app.py
# Last commit: "Update app.py" by amasha03 (3a7b95c, verified)
import os
import tempfile

import gradio as gr
import torch
from huggingface_hub import hf_hub_download
from TTS.api import TTS
def load_eng_model(
    repo_id="E-motionAssistant/text-to-speech-VITS-english",
    *,
    oversized_rows=137,
    target_rows=131,
):
    """Download the English TTS checkpoint, trim its embedding if needed, and load it.

    The checkpoint's ``text_encoder.emb.weight`` tensor is inspected; when it
    has exactly ``oversized_rows`` rows it is cut down to its first
    ``target_rows`` rows and the patched checkpoint is written to
    ``fixed_model.pth`` in the current working directory. Otherwise the
    downloaded checkpoint is used untouched and nothing is written.

    Args:
        repo_id: Hugging Face Hub repository holding ``best_model.pth`` and
            ``config.json``.
        oversized_rows: Embedding row count that triggers the trim.
        target_rows: Number of leading embedding rows kept when trimming.

    Returns:
        A ``TTS`` instance loaded on CPU from the (possibly patched) checkpoint.

    Raises:
        KeyError: If the checkpoint lacks ``['model']['text_encoder.emb.weight']``.
        Exception: Anything raised by the download, ``torch.load``/``torch.save``
            or the ``TTS`` constructor is propagated to the caller.
    """
    print("--- Starting Weights Surgery ---")
    model_path = hf_hub_download(repo_id=repo_id, filename="best_model.pth")
    config_path = hf_hub_download(repo_id=repo_id, filename="config.json")

    # NOTE(review): torch.load unpickles the downloaded file, which can execute
    # arbitrary code. Acceptable only while repo_id points at a trusted repo.
    # Newer PyTorch releases may also require an explicit weights_only choice
    # here -- confirm against the pinned torch version.
    checkpoint = torch.load(model_path, map_location="cpu")

    emb_key = "text_encoder.emb.weight"
    raw_weights = checkpoint["model"][emb_key]
    print(f"Original weight shape: {raw_weights.shape}")

    if raw_weights.shape[0] != oversized_rows:
        # Nothing to patch: skip the redundant re-save of an unchanged
        # checkpoint and load the downloaded file directly.
        return TTS(model_path=model_path, config_path=config_path, gpu=False)

    print(f"Trimming {oversized_rows} -> {target_rows}...")
    # clone() detaches the kept rows from the original storage; saving a bare
    # slice would serialize the full-size tensor behind the view.
    checkpoint["model"][emb_key] = raw_weights[:target_rows, :].clone()

    fixed_model_path = os.path.join(os.getcwd(), "fixed_model.pth")
    torch.save(checkpoint, fixed_model_path)
    print("Surgery complete. Fixed model saved.")

    # Load through the standard TTS API now that the embedding has target_rows rows.
    return TTS(model_path=fixed_model_path, config_path=config_path, gpu=False)
# --- Initialization ---
# Load the model once at import time. Any failure is reported on stdout and
# leaves `eng_tts` as None so the UI can still start.
eng_tts = None
try:
    eng_tts = load_eng_model()
    print("--- SUCCESS: SURGERY WORKED, SYSTEM ONLINE ---")
except Exception as load_error:
    print(f"CRITICAL ERROR: {load_error}")
    eng_tts = None
def generate_voice(text):
    """Synthesize ``text`` to a WAV file and return the file's path.

    Args:
        text: The text to speak. Converted with ``str()`` before synthesis.

    Returns:
        Path of the generated ``.wav`` file, or ``None`` when the text is
        missing/blank, the model failed to load, or synthesis raised.
    """
    # Reject missing or blank input up front: str(None) would otherwise be
    # synthesized as the literal word "None".
    if text is None or not str(text).strip():
        return None
    if not eng_tts:
        return None

    # One unique file per request, so concurrent requests cannot overwrite
    # each other's audio the way a shared "output.wav" would.
    fd, output_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        eng_tts.tts_to_file(text=str(text), file_path=output_path)
        return output_path
    except Exception as e:
        print(f"Synthesis Error: {e}")
        # Best-effort cleanup of the unused temp file; a failure to delete it
        # must not mask the synthesis error already reported above.
        try:
            os.remove(output_path)
        except OSError:
            pass
        return None
# Web UI: a single text box feeds generate_voice, whose returned file path
# is shown in an audio component.
demo = gr.Interface(
    title="English TTS",
    fn=generate_voice,
    inputs=gr.Textbox(label="English Text"),
    outputs=gr.Audio(label="Result", type="filepath"),
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()