import gc
import logging
import time

import gradio as gr
import torch
from transformers import (
    pipeline,
    AutoProcessor,
    AutoModelForSpeechSeq2Seq,
    AutoModelForCTC,
    WhisperForConditionalGeneration,
    WhisperProcessor,
)

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class MultiASRApp:
    """Manage a single active ASR pipeline, swapping models on demand.

    Supports two model families: Whisper-style seq2seq models and
    XLS-R (CTC) models. Only one pipeline is kept in memory at a time;
    switching models frees the previous one first.
    """

    def __init__(self):
        self.pipe = None           # active transformers ASR pipeline, or None
        self.current_model = None  # hub id of the currently loaded model
        self.current_kind = None   # "whisper" | "ctc"
        self.available_models = [
            "openai/whisper-tiny",
            "openai/whisper-base",
            "openai/whisper-small",
            "openai/whisper-medium",
            "openai/whisper-large-v2",
            "openai/whisper-large-v3",
            "ilsp/whisper_greek_dialect_of_lesbos",
            "ilsp/xls-r-greek-cretan",
        ]

    # ------------------------
    # Model detection
    # ------------------------
    def detect_model_kind(self, model_name):
        """Classify a hub id as "ctc" (XLS-R family) or "whisper"."""
        lowered = model_name.lower()
        if "xls-r" in lowered or "xlsr" in lowered:
            return "ctc"
        return "whisper"

    def is_fine_tuned_whisper(self, model_name):
        """True for the ilsp/ dialect fine-tunes of Whisper."""
        lowered = model_name.lower()
        return "ilsp/" in lowered and "whisper" in lowered

    # ------------------------
    # Device & dtype
    # ------------------------
    def pick_device(self, conservative=True):
        """Return (device, dtype).

        On CUDA, *conservative* keeps float32 (safer for fine-tuned
        checkpoints); otherwise float16 is used to save memory.
        CPU always uses float32.
        """
        if torch.cuda.is_available():
            return "cuda:0", torch.float32 if conservative else torch.float16
        return "cpu", torch.float32

    # ------------------------
    # Pipeline creation
    # ------------------------
    def create_whisper_pipe(self, model_name):
        """Build an ASR pipeline for a Whisper-style model.

        Tries the Whisper-specific classes first and falls back to the
        generic Auto* classes for checkpoints that don't load as Whisper.
        """
        conservative = self.is_fine_tuned_whisper(model_name)
        device, dtype = self.pick_device(conservative)
        try:
            model = WhisperForConditionalGeneration.from_pretrained(
                model_name,
                torch_dtype=dtype,
                low_cpu_mem_usage=True,
            )
            processor = WhisperProcessor.from_pretrained(model_name)
        except Exception:
            # Fallback: some seq2seq checkpoints aren't Whisper-class.
            model = AutoModelForSpeechSeq2Seq.from_pretrained(
                model_name,
                torch_dtype=dtype,
                low_cpu_mem_usage=True,
            )
            processor = AutoProcessor.from_pretrained(model_name)
        model.to(device)
        return pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            device=device,
            torch_dtype=dtype,
            chunk_length_s=30,
        )

    def create_ctc_pipe(self, model_name):
        """Build an ASR pipeline for an XLS-R (CTC) model."""
        device, dtype = self.pick_device(conservative=True)
        processor = AutoProcessor.from_pretrained(model_name)
        model = AutoModelForCTC.from_pretrained(
            model_name,
            torch_dtype=dtype,
            low_cpu_mem_usage=True,
        )
        model.to(device)
        return pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=getattr(processor, "tokenizer", None),
            feature_extractor=getattr(processor, "feature_extractor", None),
            device=device,
            torch_dtype=dtype,
            chunk_length_s=20,
            stride_length_s=(4, 2),
        )

    def load_model(self, model_name):
        """Load *model_name*, replacing any previously loaded model.

        Returns True on success, False on failure (the failed state is
        cleaned up so a later attempt can retry).
        """
        if self.current_model == model_name and self.pipe is not None:
            return True  # already loaded — nothing to do

        self.clear_model()
        kind = self.detect_model_kind(model_name)
        try:
            if kind == "ctc":
                self.pipe = self.create_ctc_pipe(model_name)
            else:
                self.pipe = self.create_whisper_pipe(model_name)
            self.current_model = model_name
            self.current_kind = kind
            return True
        except Exception as e:
            # FIX: the original passed the exception object as the log
            # message (logger.error(e, ...)); use a proper lazy %-format.
            logger.error("Failed to load model %s: %s", model_name, e,
                         exc_info=True)
            self.clear_model()
            return False

    def clear_model(self):
        """Release the current pipeline and reclaim GPU/host memory."""
        if self.pipe is not None:
            del self.pipe
        self.pipe = None
        self.current_model = None
        self.current_kind = None
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        gc.collect()

    # ------------------------
    # Transcription
    # ------------------------
    def transcribe(self, audio, model_name):
        """Transcribe *audio* with *model_name*.

        Returns (text, info) strings; on error the first element is a
        user-facing (Greek) error message and the second is empty.
        """
        if audio is None:
            return "Ανέβασε ένα ηχητικό αρχείο.", ""

        start = time.time()
        if not self.load_model(model_name):
            return "Σφάλμα φόρτωσης μοντέλου.", ""

        # 🔒 FORCE GREEK FOR ALL WHISPER MODELS
        if self.current_kind == "whisper":
            result = self.pipe(
                audio,
                generate_kwargs={
                    "language": "greek",
                    "task": "transcribe",
                },
            )
        else:
            # XLS-R (CTC) — no generation arguments apply.
            result = self.pipe(audio)

        text = result.get("text", "")
        info = (
            f"Μοντέλο: {model_name}\n"
            f"Χρόνος επεξεργασίας: {time.time() - start:.2f} δευτ."
        )
        return text.strip(), info

    def status(self):
        """Human-readable description of the currently loaded model."""
        if not self.current_model:
            return "Δεν έχει φορτωθεί μοντέλο"
        return f"✔ {self.current_model}"


# ------------------------
# Gradio App
# ------------------------
app = MultiASRApp()


def run(audio, model):
    return app.transcribe(audio, model)


def status():
    return app.status()


with gr.Blocks(title="Ίντα λαλείς;", theme=gr.themes.Soft()) as demo:
    gr.Markdown(
        """ # Ίντα λαλείς; ## Η Τεχνητή Νοημοσύνη μαθαίνει ελληνικές διαλέκτους 🎧 Ανέβασε ένα ηχητικό αρχείο και δες πώς η Τεχνητή Νοημοσύνη αναγνωρίζει την ελληνική γλώσσα και τις διαλέκτους της. 📍 Athens Science Festival 2025 🏛 Ωδείο Αθηνών | 18–21 Δεκεμβρίου 2025 """
    )

    model_status = gr.Textbox(
        label="Κατάσταση μοντέλου",
        value=status(),
        interactive=False,
    )

    with gr.Row():
        with gr.Column():
            audio = gr.Audio(
                label="🎵 Ανέβασε ηχητικό αρχείο",
                type="filepath",
            )
            model = gr.Dropdown(
                choices=app.available_models,
                value="openai/whisper-small",
                label="Μοντέλο αναγνώρισης ομιλίας",
            )
            btn = gr.Button(
                "🗣️ Μετατροπή ομιλίας σε κείμενο",
                variant="primary",
            )
        with gr.Column():
            text_out = gr.Textbox(
                label="📄 Κείμενο",
                lines=8,
                show_copy_button=True,
            )
            info_out = gr.Textbox(
                label="Πληροφορίες",
                lines=4,
            )

    btn.click(
        run,
        inputs=[audio, model],
        outputs=[text_out, info_out],
    ).then(
        # Refresh the status box once transcription has (possibly)
        # loaded a new model — previously it only updated at app start.
        status,
        outputs=model_status,
    )

    # BUG FIX: the original registered `lambda _: status()` with no
    # `inputs=`, so Gradio called the lambda with zero arguments and
    # raised TypeError on every dropdown change. Wire the dropdown
    # value through explicitly (it is ignored; status() reads app state).
    model.change(
        lambda _selected: status(),
        inputs=model,
        outputs=model_status,
    )

    gr.Markdown(
        """ 🔬 Έρευνα & τεχνολογία για τη γλωσσική ποικιλία 🎙️ Η φωνή ως πολιτιστική κληρονομιά """
    )


if __name__ == "__main__":
    demo.launch()