import os
import uuid

import gradio as gr
from transformers import AutoModel

# --- Load the model once, when the Space starts ---
# Every request handled by this process reuses this single `model` instance.
MODEL_ID = "bharatgenai/sooktam2"

print("Loading model...")
# NOTE: trust_remote_code=True allows transformers to execute the custom model
# code published in the MODEL_ID repository.
model = AutoModel.from_pretrained(MODEL_ID, trust_remote_code=True)
print("Model loaded successfully!")

def synthesize_speech(ref_audio, ref_text, gen_text, cls_language):
    """Synthesize ``gen_text`` with the module-level model and save it as WAV.

    Args:
        ref_audio: Reference audio. The UI's ``gr.Audio(type="filepath")``
            supplies a filesystem path string, or ``None`` when nothing was
            uploaded. Objects exposing the path as ``.name`` are accepted too.
        ref_text: Transcript of the reference audio; passed to the model as-is.
        gen_text: Text to synthesize. Must contain non-whitespace characters.
        cls_language: Language name passed to ``model.infer`` as
            ``cls_language``.

    Returns:
        A ``(status_message, wav_path)`` tuple. ``wav_path`` is the path of
        the generated file on success and ``None`` on any failure, in which
        case ``status_message`` describes the problem.
    """
    # With type="filepath" Gradio passes a plain path string, which has no
    # ``.name`` attribute; only fall back to ``.name`` for file-like objects.
    if isinstance(ref_audio, (str, os.PathLike)):
        ref_audio_path = os.fspath(ref_audio)
    else:
        ref_audio_path = getattr(ref_audio, "name", None)
    if not ref_audio_path:
        return "कृपया एक रेफरेंस ऑडियो फ़ाइल अपलोड करें।", None

    # Reject empty text up front instead of sending it to the model.
    if not gen_text or not gen_text.strip():
        return "कृपया जनरेट करने के लिए टेक्स्ट दर्ज करें।", None

    out_dir = "outputs"
    # A unique file name per call keeps overlapping requests from overwriting
    # each other's result. Files accumulate in ``out_dir`` until cleaned up.
    out_wav = os.path.join(out_dir, f"output_{uuid.uuid4().hex}.wav")

    try:
        os.makedirs(out_dir, exist_ok=True)
        # The model writes the audio to ``file_wave``; its return value is
        # not needed here because the UI is given the file path.
        model.infer(
            ref_file=ref_audio_path,
            ref_text=ref_text,
            gen_text=gen_text,
            tokenizer="cls",
            cls_language=cls_language,
            file_wave=out_wav,
        )
    except Exception as e:
        # UI boundary: report the failure in the status box rather than crash.
        return f"एरर: {e}", None
    return "सफलतापूर्वक ऑडियो जनरेट किया गया!", out_wav

# --- Gradio Interface ---
# Languages offered in the dropdown; the selected value reaches
# synthesize_speech as its `cls_language` argument.
_LANGUAGE_CHOICES = [
    "hindi",
    "marathi",
    "gujarati",
    "tamil",
    "telugu",
    "kannada",
    "bengali",
    "malayalam",
    "odia",
    "urdu",
    "punjabi",
    "indian-english",
]

# Input widgets, in the same order as synthesize_speech's parameters.
inputs = [
    gr.Audio(label="रेफरेंस ऑडियो (3-10 सेकंड)", type="filepath"),
    gr.Textbox(lines=2, label="रेफरेंस टेक्स्ट (ऑडियो की ट्रांसक्रिप्ट)"),
    gr.Textbox(lines=2, label="जनरेट करने के लिए टेक्स्ट"),
    gr.Dropdown(
        label="भाषा चुनें",
        choices=_LANGUAGE_CHOICES,
        value="hindi",
    ),
]

# Output widgets, matching the (status, audio path) pair the handler returns.
outputs = [
    gr.Textbox(label="स्टेटस"),
    gr.Audio(label="जनरेटेड ऑडियो", type="filepath"),
]

demo = gr.Interface(
    fn=synthesize_speech,
    inputs=inputs,
    outputs=outputs,
    title="Sooktam-2 Text-to-Speech",
    description="भारतGenAI का मल्टीलिंगुअल TTS मॉडल। रेफरेंस ऑडियो और टेक्स्ट के आधार पर आवाज़ क्लोन करें।",
)
demo.launch()