import tempfile

import gradio as gr
import soundfile as sf
import torch
from transformers import AutoProcessor, VitsModel

try:
    from transformers import AutoModelForTextToSpeech
except ImportError:
    # `AutoModelForTextToSpeech` is not a class transformers exports, so a
    # plain import raises ImportError at startup. Keep the name bound for any
    # code that still refers to it, falling back to the VITS class that
    # MMS-TTS checkpoints are loaded with.
    AutoModelForTextToSpeech = VitsModel

# Load the model and processor for Tibetan (bod).
model_id = "facebook/mms-tts-bod"
processor = AutoProcessor.from_pretrained(model_id)
# MMS-TTS checkpoints are VITS models; transformers has no
# `AutoModelForTextToSpeech` class, so load the weights with `VitsModel`.
model = VitsModel.from_pretrained(model_id)

def tts_bod(text):
    """Synthesize speech for ``text`` and return the path of a WAV file.

    Args:
        text: Tibetan text to speak.

    Returns:
        Path of a newly created temporary ``.wav`` file holding the generated
        waveform, or ``None`` when ``text`` is ``None``, empty, or only
        whitespace (there is nothing to synthesize).
    """
    # Guard clause: skip tokenization and inference when there is no input.
    if not text or not text.strip():
        return None

    inputs = processor(text=text, return_tensors="pt")
    with torch.no_grad():  # inference only, no gradients needed
        speech = model(**inputs).waveform

    # Drop size-1 dimensions before handing the samples to soundfile.
    samples = speech.squeeze().numpy()
    # Take the rate from the loaded checkpoint instead of hard-coding 16000,
    # so the WAV header always matches what the model produced.
    sample_rate = model.config.sampling_rate

    # Reserve a unique temp path and close the handle before soundfile opens
    # the same path for writing. delete=False keeps the file on disk after the
    # handle is closed; nothing in this function removes it afterwards.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as f:
        wav_path = f.name
    sf.write(wav_path, samples, sample_rate)
    return wav_path

# Gradio interface: a text box as input and an audio component as output,
# both wired to `tts_bod`.
_text_input = gr.Textbox(label="Nhập câu tiếng Tây Tạng (Tibetan)")
_audio_output = gr.Audio(label="Phát âm tiếng Tây Tạng")

iface = gr.Interface(
    fn=tts_bod,
    inputs=_text_input,
    outputs=_audio_output,
    title="TTS Tiếng Tây Tạng - facebook/mms-tts-bod",
    description="Nhập câu bằng tiếng Tây Tạng để nghe phát âm. Model: facebook/mms-tts-bod",
)

# Launch the interface as soon as the module is executed.
iface.launch()