"""Yaaba AI – French ↔ Mooré voice translator (Gradio app).

Pipeline: Whisper ASR → NLLB translation → Coqui XTTS speech synthesis.
"""

import os
import tempfile

# Must be set BEFORE importing TTS so the Coqui license prompt is skipped
# in non-interactive environments (e.g. Hugging Face Spaces).
os.environ["COQUI_TOS_AGREED"] = "1"

import gradio as gr
import torch
import whisper
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from TTS.api import TTS

# ---------------------------
# Load all models once (Hugging Face caching will persist)
# ---------------------------
whisper_model = whisper.load_model("small")

tokenizer = AutoTokenizer.from_pretrained(
    "mafromedia/yaaba-fr-mo-nllb600M",
    use_fast=True,
    trust_remote_code=True,
)
translator_model = AutoModelForSeq2SeqLM.from_pretrained(
    "mafromedia/yaaba-fr-mo-nllb600M"
)

tts = TTS(
    model_name="tts_models/multilingual/multi-dataset/xtts_v2",
    progress_bar=False,
    gpu=False,
)


# ---------------------------
# Translate text
# ---------------------------
def translate_text(input_text, direction):
    """Translate text between French and Mooré with the fine-tuned NLLB model.

    Args:
        input_text: Source-language text to translate.
        direction: "French → Mooré" or "Mooré → French" (from the UI radio).

    Returns:
        The translated string, or an error message for empty input.
    """
    if not input_text.strip():
        return "Error: Empty input text."

    # BUG FIX: both radio labels contain the substring "French", so the old
    # check `"French" in direction` was always True and the Mooré → French
    # direction was silently translated the wrong way. Match the label
    # prefix instead.
    if direction.startswith("French"):
        src_lang, tgt_lang = "fra_Latn", "mos_Latn"
    else:
        src_lang, tgt_lang = "mos_Latn", "fra_Latn"

    # BUG FIX: NLLB tokenizers prepend a source-language token; without
    # setting src_lang the tokenizer assumes its default source language
    # and the model mistranslates. The old code computed src_lang but
    # never used it.
    tokenizer.src_lang = src_lang

    inputs = tokenizer(input_text, return_tensors="pt")
    translated_tokens = translator_model.generate(
        **inputs,
        # Force the decoder to start with the target-language token.
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
    )
    return tokenizer.decode(translated_tokens[0], skip_special_tokens=True)


# ---------------------------
# Speech → Speech Translation
# ---------------------------
def voice_translate(audio, direction):
    """Speech-to-speech translation: transcribe, translate, synthesize.

    Args:
        audio: Filepath of recorded/uploaded audio (Gradio ``type="filepath"``).
        direction: "French → Mooré" or "Mooré → French".

    Returns:
        Tuple of (translated text, transcribed text, path to synthesized WAV),
        or an error message with empty fields when no audio was provided.
    """
    if audio is None:
        return "Error: No audio input detected.", "", None

    # Step 1: Transcribe audio → text
    result = whisper_model.transcribe(audio)
    detected_text = result["text"]

    # Step 2: Translate
    translated_text = translate_text(detected_text, direction)

    # Step 3: TTS synthesis. Close the handle before TTS reopens the path
    # for writing (required on Windows); delete=False keeps the file on
    # disk so Gradio can serve it.
    # NOTE(review): xtts_v2 is multilingual and normally requires a
    # `language=` argument and one of its named speakers — confirm that
    # speaker="female-en" actually works with this model.
    tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp_wav.close()
    tts.tts_to_file(text=translated_text, file_path=tmp_wav.name, speaker="female-en")
    return translated_text, detected_text, tmp_wav.name
# ---------------------------
# Text-only translation tab
# ---------------------------
def text_translate(input_text, direction):
    """Translate typed text and synthesize the translation as speech.

    Args:
        input_text: Text entered by the user.
        direction: "French → Mooré" or "Mooré → French".

    Returns:
        Tuple of (translated text, path to synthesized WAV file).
    """
    translated_text = translate_text(input_text, direction)
    # Close the handle before TTS reopens the path for writing (required
    # on Windows); delete=False keeps the file on disk for Gradio to serve.
    tmp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp_wav.close()
    tts.tts_to_file(text=translated_text, file_path=tmp_wav.name, speaker="female-en")
    return translated_text, tmp_wav.name


# ---------------------------
# Build Gradio Interface
# ---------------------------
description = (
    "Yaaba AI bridges African and global communities through real-time French ↔ Mooré translation. "
    "Speak or type in one language to instantly hear and read the translation in the other."
)

with gr.Blocks(title="🎙 Yaaba AI – French ↔ Mooré Voice Translator") as demo:
    gr.Markdown("# 🎙 Yaaba AI – French ↔ Mooré Voice Translator")
    gr.Markdown(description)

    with gr.Tabs():
        # Tab 1: speech in, speech + text out
        with gr.Tab("Voice Translator"):
            direction_select = gr.Radio(
                ["French → Mooré", "Mooré → French"],
                label="Translation Direction",
                value="French → Mooré",
            )
            audio_input = gr.Audio(
                sources=["microphone"],
                type="filepath",
                label="Record or Upload Speech",
            )
            translate_btn = gr.Button("Translate Speech")
            translated_box = gr.Textbox(label="Translated Text")
            detected_box = gr.Textbox(label="Detected Speech")
            audio_output = gr.Audio(label="Translated Speech")
            translate_btn.click(
                fn=voice_translate,
                inputs=[audio_input, direction_select],
                outputs=[translated_box, detected_box, audio_output],
            )

        # Tab 2: text in, text + speech out
        with gr.Tab("Text-only"):
            direction_text = gr.Radio(
                ["French → Mooré", "Mooré → French"],
                label="Translation Direction",
                value="French → Mooré",
            )
            text_box = gr.Textbox(label="Enter Text", placeholder="Type your text here...")
            text_btn = gr.Button("Translate Text")
            text_out = gr.Textbox(label="Translated Text")
            audio_out = gr.Audio(label="Spoken Translation")
            text_btn.click(
                fn=text_translate,
                inputs=[text_box, direction_text],
                outputs=[text_out, audio_out],
            )

    gr.Markdown("---")
    gr.Markdown("Built by GO AI Corp – Yaaba AI Initiative")
# ---------------------------
# Launch app (important for Spaces)
# ---------------------------
if __name__ == "__main__":
    demo.launch()