import gradio as gr
import torch
from TTS.api import TTS
from huggingface_hub import hf_hub_download
import os

def load_eng_model():
    """Download the English VITS checkpoint, trim its text embedding, and load it.

    The checkpoint (``best_model.pth``) and ``config.json`` are fetched from the
    Hugging Face Hub. If the ``text_encoder.emb.weight`` matrix has 137 rows it
    is cut down to its first 131 rows. The (possibly modified) checkpoint is
    then written to ``fixed_model.pth`` in the current working directory and
    that file is loaded through ``TTS`` on CPU.

    Returns:
        The ``TTS`` instance built from the re-saved checkpoint.

    Raises:
        Any exception raised by the download, ``torch.load``, ``torch.save`` or
        ``TTS`` calls, and a lookup error (``KeyError`` for a plain dict) if the
        checkpoint lacks ``['model']['text_encoder.emb.weight']``.
    """
    repo_id = "E-motionAssistant/text-to-speech-VITS-english"
    embedding_key = "text_encoder.emb.weight"
    # Row counts of the embedding before and after trimming.
    # NOTE(review): 131 is presumably the vocabulary size config.json expects;
    # confirm against the config before changing either number.
    oversized_rows = 137
    target_rows = 131
    print("--- Starting Weights Surgery ---")

    model_path = hf_hub_download(repo_id=repo_id, filename="best_model.pth")
    config_path = hf_hub_download(repo_id=repo_id, filename="config.json")

    # 1. Load the checkpoint directly into PyTorch, on CPU.
    # NOTE(review): torch.load unpickles the downloaded file; only point this
    # at a repository you trust.
    checkpoint = torch.load(model_path, map_location="cpu")

    # 2. Shrink the embedding so its row count no longer mismatches the model.
    raw_weights = checkpoint["model"][embedding_key]
    print(f"Original weight shape: {raw_weights.shape}")

    if raw_weights.shape[0] == oversized_rows:
        print(f"Trimming {oversized_rows} -> {target_rows}...")
        # A bare slice is a view sharing the original storage, and torch.save
        # serializes the whole storage; clone so only the kept rows are saved.
        checkpoint["model"][embedding_key] = raw_weights[:target_rows, :].clone()

    # 3. Persist the checkpoint under a new name in the working directory.
    fixed_model_path = os.path.join(os.getcwd(), "fixed_model.pth")
    torch.save(checkpoint, fixed_model_path)
    print("Surgery complete. Fixed model saved.")

    # 4. Load the re-saved checkpoint with the standard TTS API (CPU only).
    tts = TTS(model_path=fixed_model_path, config_path=config_path, gpu=False)

    return tts

# --- Initialization ---
def _boot_english_tts():
    """Try to build the English TTS engine once, at import time.

    Returns whatever ``load_eng_model`` produces on success, or ``None`` when
    anything in the startup path raises; the error is printed, not re-raised.
    """
    try:
        engine = load_eng_model()
        print("--- SUCCESS: SURGERY WORKED, SYSTEM ONLINE ---")
        return engine
    except Exception as e:
        print(f"CRITICAL ERROR: {e}")
        return None


# Module-level handle read by generate_voice; None means the model is unavailable.
eng_tts = _boot_english_tts()

def generate_voice(text):
    """Synthesize ``text`` to a WAV file and return the file's path.

    Args:
        text: The text to speak. It is converted with ``str()`` before being
            handed to the TTS engine.

    Returns:
        The path of a newly created ``.wav`` file, or ``None`` when ``text`` is
        ``None`` or blank, when the module-level ``eng_tts`` engine is not
        available, or when synthesis raises (the error is printed).
    """
    # Without this guard str(None) would be synthesized as the word "None".
    if text is None:
        return None
    spoken = str(text)
    if not spoken.strip():
        return None
    if eng_tts is None:
        return None

    import tempfile

    output_path = None
    try:
        # Give every call its own file so overlapping or successive requests
        # cannot overwrite each other's audio. delete=False keeps the file on
        # disk after the handle closes so it can be written and returned.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as handle:
            output_path = handle.name
        eng_tts.tts_to_file(text=spoken, file_path=output_path)
        return output_path
    except Exception as e:
        print(f"Synthesis Error: {e}")
        # Do not leave an empty or partial file behind after a failed synthesis.
        if output_path is not None:
            try:
                os.remove(output_path)
            except OSError:
                pass
        return None

# UI wiring: one text box in, one audio player (fed by a file path) out.
text_input = gr.Textbox(label="English Text")
audio_output = gr.Audio(label="Result", type="filepath")

demo = gr.Interface(
    title="English TTS",
    fn=generate_voice,
    inputs=text_input,
    outputs=audio_output,
)

if __name__ == "__main__":
    # Launch the interface only when this file is executed as a script.
    demo.launch()