import os
import tempfile
import uuid

import gradio as gr
import torch
from datasets import load_dataset
from scipy.io.wavfile import write
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from transformers import AutoProcessor, BarkModel
from transformers import AutoTokenizer, VitsModel

# MMS-TTS checkpoints are VITS models and are published under ISO 639-3
# language codes, so Tibetan is "bod" rather than the two-letter "bo".
# NOTE(review): confirm the exact checkpoint id on the Hugging Face Hub.
model_id = "facebook/mms-tts-bod"

# VITS checkpoints ship a tokenizer (no feature extractor). The module-level
# name `processor` is kept so existing references to it keep working.
processor = AutoTokenizer.from_pretrained(model_id)
model = VitsModel.from_pretrained(model_id)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


def tts_tibetan(text):
    """Synthesize speech for *text* and return the path of a WAV file.

    Args:
        text: Text to read aloud (Tibetan, Unicode).

    Returns:
        Path to a newly written WAV file in the system temp directory,
        sampled at ``model.config.sampling_rate``.

    Raises:
        gr.Error: If *text* is empty or whitespace-only. Raising instead of
            returning the message matters because the output component is an
            audio file path: a returned string would be treated as a path.
    """
    if not text or not text.strip():
        raise gr.Error("Vui lòng nhập văn bản.")

    # NOTE(review): some MMS languages need uroman romanisation before
    # tokenising (see `processor.is_uroman`) -- confirm for this checkpoint.
    inputs = processor(text=text, return_tensors="pt").to(device)

    # VITS is not an autoregressive seq2seq model: a plain forward pass
    # returns the waveform directly, there is no generate()/decode() step.
    with torch.no_grad():
        waveform = model(**inputs).waveform

    # Drop the batch dimension and move to host memory for scipy.
    audio = waveform[0].cpu().float().numpy()

    output_path = os.path.join(tempfile.gettempdir(), f"{uuid.uuid4().hex}.wav")
    write(output_path, model.config.sampling_rate, audio)

    return output_path

# Web UI: one text box in, one audio player out, wired to `tts_tibetan`.
# The audio component is configured with type="filepath".
_text_input = gr.Textbox(label="Nhập văn bản tiếng Tây Tạng (Unicode)")
_audio_output = gr.Audio(type="filepath", label="Giọng đọc Tây Tạng (MMS)")

iface = gr.Interface(
    fn=tts_tibetan,
    inputs=_text_input,
    outputs=_audio_output,
    title="TTS tiếng Tây Tạng bằng Facebook MMS",
    description="Sử dụng mô hình MMS để tạo giọng đọc tiếng Tây Tạng",
)

# Start the Gradio server as soon as the script is executed.
iface.launch()