"""Gradio front-end for an English VITS text-to-speech model.

At import time the script downloads a checkpoint from the Hugging Face Hub,
trims its text-embedding table when it has the known oversized row count,
writes the patched checkpoint to disk and loads it through the TTS library.
"""

import os

import gradio as gr
import torch
from huggingface_hub import hf_hub_download
from TTS.api import TTS

# Hub repository that hosts the checkpoint and its config.
REPO_ID = "E-motionAssistant/text-to-speech-VITS-english"

# Key of the text-embedding table inside ``checkpoint['model']``.
EMBEDDING_KEY = 'text_encoder.emb.weight'

# The published checkpoint carries 137 embedding rows; the table is cut down
# to 131 rows so that loading no longer fails with a size mismatch.
# NOTE(review): 131 is presumably the vocabulary size implied by config.json
# -- confirm against the config before changing either number.
CHECKPOINT_EMBEDDING_ROWS = 137
EXPECTED_EMBEDDING_ROWS = 131


def load_eng_model():
    """Download, patch and load the English TTS model.

    Returns:
        A ``TTS`` instance running on CPU, loaded from the patched checkpoint
        that is written to ``fixed_model.pth`` in the current working
        directory.

    Raises:
        Any exception raised by the Hub download, ``torch.load``/``torch.save``
        or the ``TTS`` constructor is propagated unchanged, as is ``KeyError``
        if the checkpoint lacks the expected ``'model'`` / embedding keys.
    """
    print("--- Starting Weights Surgery ---")
    model_path = hf_hub_download(repo_id=REPO_ID, filename="best_model.pth")
    config_path = hf_hub_download(repo_id=REPO_ID, filename="config.json")

    # 1. Load the checkpoint directly into PyTorch.
    # NOTE(security): torch.load unpickles the downloaded file, which can
    # execute arbitrary code. This is only acceptable while REPO_ID is a
    # trusted repository; consider ``weights_only=True`` if the checkpoint
    # format allows it.
    checkpoint = torch.load(model_path, map_location="cpu")

    # 2. Shrink the embedding table from 137 rows down to 131.
    raw_weights = checkpoint['model'][EMBEDDING_KEY]
    print(f"Original weight shape: {raw_weights.shape}")
    if raw_weights.shape[0] == CHECKPOINT_EMBEDDING_ROWS:
        print("Trimming 137 -> 131...")
        # A slice is only a view onto the original storage, and torch.save
        # serializes the whole storage behind a view. clone() detaches the
        # trimmed rows so the saved file really contains 131 rows only.
        checkpoint['model'][EMBEDDING_KEY] = (
            raw_weights[:EXPECTED_EMBEDDING_ROWS, :].clone()
        )

    # 3. Save the patched checkpoint to a new file.
    fixed_model_path = os.path.join(os.getcwd(), "fixed_model.pth")
    torch.save(checkpoint, fixed_model_path)
    print("Surgery complete. Fixed model saved.")

    # 4. Load through the standard TTS library using the patched weights.
    return TTS(model_path=fixed_model_path, config_path=config_path, gpu=False)


# --- Initialization ---
# Loading happens once at import time. A failure is reported and leaves
# ``eng_tts`` as None so the UI still starts (and returns no audio).
try:
    eng_tts = load_eng_model()
    print("--- SUCCESS: SURGERY WORKED, SYSTEM ONLINE ---")
except Exception as e:
    print(f"CRITICAL ERROR: {e}")
    eng_tts = None


def generate_voice(text):
    """Synthesize ``text`` to a WAV file and return its path.

    Args:
        text: The text to speak. It is converted with ``str()`` before being
            passed to the model.

    Returns:
        ``"output.wav"`` on success. ``None`` when the model failed to load,
        when ``text`` is ``None`` or blank, or when synthesis raises.
    """
    if eng_tts is None:
        return None

    # Without this guard ``str(None)`` would be synthesized as the literal
    # word "None", and blank input would be sent to the model for nothing.
    if text is None:
        return None
    sentence = str(text)
    if not sentence.strip():
        return None

    output_path = "output.wav"
    try:
        eng_tts.tts_to_file(text=sentence, file_path=output_path)
    except Exception as e:
        # Best-effort UI handler: report the failure and return no audio
        # instead of surfacing a stack trace to the user.
        print(f"Synthesis Error: {e}")
        return None
    return output_path


demo = gr.Interface(
    fn=generate_voice,
    inputs=gr.Textbox(label="English Text"),
    outputs=gr.Audio(label="Result", type="filepath"),
    title="English TTS",
)

if __name__ == "__main__":
    demo.launch()