import gradio as gr
import torch
from PIL import Image
from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
import numpy as np
import cv2
import re
import threading
import traceback
import time

# ======================================================
# 1. SETUP (GOOGLE PALIGEMMA)
# ======================================================
print("⏳ Avvio MUSIC4D - PaliGemma (Smart Mode)")

# 224px variant: much lighter on CPU than the 448px checkpoint.
MODEL_ID = "google/paligemma-3b-mix-224"

# Upper-cased keywords we accept in the model's free-text answer.
# Order matters: the first keyword found in the answer wins.
VALID_EMOTIONS = [
    "JOY", "HAPPY", "SMILING", "LAUGHING",
    "ANGER", "ANGRY", "FROWNING", "FURIOUS",
    "FEAR", "SCARED", "TERRIFIED",
    "DISGUST", "DISGUSTED",
    "SURPRISE", "SHOCKED", "AMAZED",
    "SADNESS", "SAD", "CRYING", "DEPRESSED",
    "BOREDOM", "BORED", "TIRED",
    "NEUTRAL", "SERIOUS", "CALM",
]

# Synonym -> canonical emotion label. Keywords absent from this map fall
# back to themselves (same effect as the old `else: detected = keyword`).
CANONICAL_EMOTION = {
    "HAPPY": "JOY", "SMILING": "JOY", "LAUGHING": "JOY", "JOY": "JOY",
    "CONTENTMENT": "JOY",
    "ANGRY": "ANGER", "FROWNING": "ANGER", "FURIOUS": "ANGER",
    "SCARED": "FEAR", "TERRIFIED": "FEAR", "FEAR": "FEAR",
    "SAD": "SADNESS", "CRYING": "SADNESS", "DEPRESSED": "SADNESS",
    "SHOCKED": "SURPRISE", "AMAZED": "SURPRISE", "SURPRISE": "SURPRISE",
    "BORED": "BOREDOM", "TIRED": "BOREDOM",
    "SERIOUS": "NEUTRAL", "CALM": "NEUTRAL", "NEUTRAL": "NEUTRAL",
}

model = None
processor = None
# Shared with the Gradio render loop; a Python str assignment is atomic
# under the GIL, so no lock is needed for this single write/read.
current_emotion = "CARICAMENTO..."
analysis_thread = None


# ======================================================
# 2. MODEL LOADING
# ======================================================
def load_paligemma():
    """Load PaliGemma + its processor onto CPU, storing them in module globals.

    On failure the error is printed and both globals stay ``None``; the rest
    of the app keeps running in a degraded (no-inference) mode.
    """
    global model, processor
    try:
        print("📦 Caricamento PaliGemma (3B Params)...")
        # PaliGemma requires its dedicated model class.
        model = PaliGemmaForConditionalGeneration.from_pretrained(
            MODEL_ID,
            device_map="cpu",
            torch_dtype=torch.float32,   # float32 is safest on old CPUs
            revision="bfloat16",         # download the light weights, convert locally
        ).eval()
        processor = AutoProcessor.from_pretrained(MODEL_ID)
        print("✅ PaliGemma Pronto! (Sarà lento ma intelligente)")
    except Exception as e:
        print(f"❌ Errore caricamento: {e}")


load_paligemma()


# ======================================================
# 3. AI LOGIC (PaliGemma)
# ======================================================
def analyze_emotion_task(pil_image):
    """Run one emotion inference on *pil_image* and publish the result.

    Executed in a background thread. Updates the ``current_emotion`` global
    with a canonical label (or the raw model answer as a fallback), and
    with "ERRORE" on an unexpected exception.
    """
    global current_emotion
    start = time.time()

    # Model (or processor) may be None if load_paligemma() failed.
    if model is None or processor is None:
        return

    try:
        prompt = """Which is the general emotion in this picture? Answer the question using a single word for each emotion you can find. 
Follow this example: ’emotions: [em1, em2, em3, ...]’. Choose from: [’joy’, ’anger’, ’fear’, ’disgust’, ’surprise’, ’sadness’, ’boredom’, ’neutral’, ’contentment’]. \n"""

        # 1. Preprocessing — the processor expects RGB input.
        if pil_image.mode != "RGB":
            pil_image = pil_image.convert("RGB")

        inputs = processor(text=prompt, images=pil_image, return_tensors="pt")
        inputs = {k: v.to("cpu") for k, v in inputs.items()}

        # 2. Generation
        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=20,   # short answer is enough
                do_sample=False,     # deterministic
            )

        # 3. Decoding
        generated_text = processor.batch_decode(
            generated_ids, skip_special_tokens=True
        )[0]

        # --- "ALWAYS JOY" BUG FIX ---
        # The decoded text still contains the prompt (which itself lists
        # 'joy', 'anger', ...), so keyword matching on the full string always
        # hit 'JOY'. Cut everything up to the end of the question and keep
        # only the actual answer.
        description = generated_text.upper()
        marker = "CONTENTMENT’]."  # last characters of the prompt
        if marker in description:
            description = description.split(marker)[-1].strip()
        else:
            # Safety fallback: the answer is short, the prompt is long,
            # so the tail of the string is what we want.
            description = description[-50:].strip()
        description = description.replace("\n", "").strip()

        print(f"🧠 PaliGemma dice: {description}")

        # 4. Keyword matching.
        # Word-boundary search (not plain substring) so that e.g. "ENJOY"
        # does not match "JOY" and "UNHAPPY" does not match "HAPPY".
        detected = "UNKNOWN"
        for keyword in VALID_EMOTIONS:
            if re.search(rf"\b{re.escape(keyword)}\b", description):
                detected = CANONICAL_EMOTION.get(keyword, keyword)
                break

        # Smart fallback: surface the raw answer rather than "UNKNOWN".
        if detected == "UNKNOWN" and len(description) > 2:
            detected = description

        current_emotion = detected

    except Exception:
        print("⚠️ ERRORE CRITICO NEL THREAD:")
        traceback.print_exc()
        current_emotion = "ERRORE"

    end = time.time() - start
    print(f"⏱️ Tempo inferenza: {end:.2f}s")


# ======================================================
# 4. FRAME PROCESSING
# ======================================================
def process_frame(image_array):
    """Annotate one webcam frame with the latest emotion and (re)start inference.

    Parameters
    ----------
    image_array : np.ndarray | None
        RGB frame from the Gradio webcam stream.

    Returns
    -------
    np.ndarray | None
        The frame with a status label drawn in the bottom-left corner.
    """
    global analysis_thread, current_emotion

    if image_array is None:
        return image_array

    annotated = image_array.copy()
    # shape[:2] is robust to frames that are not exactly 3-channel.
    img_h, img_w = annotated.shape[:2]

    # Start a new background inference only when the previous one is done:
    # at most one analysis thread runs at any time.
    if analysis_thread is None or not analysis_thread.is_alive():
        if model is not None:
            try:
                # Downscale to the model's native resolution.
                ai_frame = cv2.resize(image_array, (224, 224))
                pil_image = Image.fromarray(ai_frame)
                analysis_thread = threading.Thread(
                    target=analyze_emotion_task, args=(pil_image,)
                )
                analysis_thread.start()
            except Exception as e:
                print(f"Errore prep immagine: {e}")

    # Overlay: emotion text, orange while inference is running, green when idle,
    # red on error; black background box for readability.
    is_working = analysis_thread is not None and analysis_thread.is_alive()
    status_symbol = " (*)" if is_working else ""
    text_display = f"{current_emotion}{status_symbol}"

    color_bg = (0, 0, 0)
    color_txt = (0, 165, 255) if is_working else (0, 255, 0)
    if "ERRORE" in current_emotion:
        color_txt = (255, 0, 0)

    font = cv2.FONT_HERSHEY_SIMPLEX
    scale = 0.8
    thick = 2
    (tw, th), base = cv2.getTextSize(text_display, font, scale, thick)
    x = 20
    y = img_h - 20
    cv2.rectangle(
        annotated, (x - 10, y - th - 10), (x + tw + 10, y + base + 5), color_bg, -1
    )
    cv2.putText(
        annotated, text_display, (x, y), font, scale, color_txt, thick, cv2.LINE_AA
    )
    return annotated


# ======================================================
# 5. UI
# ======================================================
with gr.Blocks(title="MUSIC4D - PaliGemma Final") as demo:
    gr.Markdown("## 🎵 MUSIC4D – PaliGemma CPU")
    with gr.Row():
        inp = gr.Image(sources=["webcam"], type="numpy", label="Webcam")
        out = gr.Image(type="numpy", label="Output Live")
    # Long stream interval: CPU inference takes several seconds per frame.
    inp.stream(
        process_frame,
        inputs=inp,
        outputs=out,
        stream_every=11.5,
        time_limit=350,
    )

if __name__ == "__main__":
    demo.launch(ssr_mode=False)