import gradio as gr
import torch
import sys
import traceback
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from threading import Thread

# Model configuration: Hugging Face Hub id of the checkpoint served by this app.
MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"

# CPU/RAM tuning for the free tier: cap the OpenMP/MKL thread pools and
# PyTorch's intra-op parallelism at 4 threads.
# NOTE(review): these variables are set after `import torch` has already run;
# confirm the OpenMP/MKL runtimes still honor them at this point. The explicit
# torch.set_num_threads call below does not depend on them.
os.environ.update(OMP_NUM_THREADS="4", MKL_NUM_THREADS="4")
torch.set_num_threads(4)

# Use the GPU when one is visible to PyTorch, otherwise run on CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"🚀 Iniciando arranque de Lumin Flash ({MODEL_ID}) en {device}...")

# Placeholders so both names exist at module level before loading is attempted.
model = tokenizer = None

# Load the tokenizer and the model once at import time. Any failure is logged
# with its traceback and terminates the process with exit status 1.
try:
    print("⏳ Descargando y cargando Tokenizer...")
    # NOTE(review): trust_remote_code=True lets code shipped in the model
    # repository run locally; confirm it is actually required for MODEL_ID.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

    print("⏳ Descargando y cargando Modelo en RAM...")
    # Half precision on GPU, full precision on CPU.
    _load_options = {
        "dtype": torch.float16 if device == "cuda" else torch.float32,
        "device_map": "auto",
        "trust_remote_code": True,
        "low_cpu_mem_usage": True,
    }
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **_load_options)
    print("✅ ¡Modelo cargado correctamente en memoria!")

except Exception as e:
    _banner = "❌" * 20
    print(_banner)
    print(f"ERROR CRÍTICO FATAL AL CARGAR EL MODELO:\n{e}")
    print(traceback.format_exc())
    print(_banner)
    # Force the container to die when there is no model, so that HF reports
    # the failure instead of staying in a zombie "Running" state that later
    # surfaces as a "NameError".
    sys.exit(1)


def _content_to_text(content):
    """Return the plain text of one history entry, or None if it has none."""
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # NOTE(review): handles list-of-blocks content such as
        # [{"type": "text", "text": "..."}]; confirm against the Gradio
        # version deployed with this app.
        pieces = [
            item["text"]
            for item in content
            if isinstance(item, dict) and isinstance(item.get("text"), str)
        ]
        return "".join(pieces) if pieces else None
    # None (no reply yet), file tuples, components, etc. carry no prompt text.
    return None


def _history_to_messages(history):
    """Convert Gradio chat history into a list of role/content dicts.

    Accepts both the pair format ``[(user_msg, bot_msg), ...]`` and the
    messages format ``[{"role": ..., "content": ...}, ...]``. Entries without
    usable text are skipped instead of being passed to the chat template.
    """
    converted = []
    for turn in history or []:
        if isinstance(turn, dict):
            role = turn.get("role")
            text = _content_to_text(turn.get("content"))
            if role in ("user", "assistant") and text is not None:
                converted.append({"role": role, "content": text})
            continue

        user_msg, bot_msg = turn
        for role, content in (("user", user_msg), ("assistant", bot_msg)):
            text = _content_to_text(content)
            if text is not None:
                converted.append({"role": role, "content": text})
    return converted


def chat(message, history):
    """Stream the assistant's reply to ``message`` given the prior ``history``.

    Args:
        message: The new user message (text).
        history: Previous turns, either as ``(user, assistant)`` pairs or as
            role/content dicts.

    Yields:
        The reply accumulated so far, growing with each generated chunk. If
        the model is not loaded, or generation fails, a user-facing error
        message is yielded instead of (or appended to) the partial reply.
    """
    # Runtime safety check: refuse to run without a loaded model/tokenizer.
    if model is None or tokenizer is None:
        yield "⚠️ Error del servidor: El modelo de IA no está cargado correctamente. Contacta al administrador."
        return

    # System prompt, then prior conversation, then the new user message.
    messages = [{
        "role": "system",
        "content": "You are Lumin Flash, an advanced AI assistant created by Lumin Web. You are helpful, precise, and professional. Answer questions clearly and concisely. Do not cut off sentences."
    }]
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": message})

    # Render the prompt with the tokenizer's chat template; on failure fall
    # back to a hand-written ChatML prompt containing only the new message.
    try:
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
    except Exception as e:
        print(f"Aviso tokenizer: Falló el apply_chat_template, usando fallback manual. {e}")
        text = f"<|im_start|>system\nYou are Lumin Flash.<|im_end|>\n<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"

    # Tokenize and move the tensors to the inference device.
    inputs = tokenizer([text], return_tensors="pt").to(device)

    # The streamer hands decoded text from the generation thread to this one.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        inputs,
        streamer=streamer,
        max_new_tokens=1024,
        temperature=0.7,
        do_sample=True,
        top_k=50,
        top_p=0.9,
        repetition_penalty=1.1
    )

    generation_errors = []

    def _run_generation():
        # generate() only signals end-of-stream when it finishes normally.
        # If it raises, end the stream here; otherwise the consumer loop
        # below would wait on the streamer forever.
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:
            generation_errors.append(exc)
            print(f"Error durante la generación:\n{traceback.format_exc()}")
            streamer.end()

    # Run generation in the background so chunks can be yielded as they arrive.
    thread = Thread(target=_run_generation)
    thread.start()

    # Yield the cumulative reply after every new chunk.
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text

    thread.join()
    if generation_errors:
        notice = "⚠️ Error del servidor: no se pudo completar la respuesta. Inténtalo de nuevo."
        yield f"{partial_text}\n\n{notice}" if partial_text else notice


# Gradio user interface: a chat window backed by the `chat` generator.
chat_history_panel = gr.Chatbot(height=500)
prompt_box = gr.Textbox(
    placeholder="Pregúntale a Lumin Flash...",
    container=False,
    scale=7,
)
demo = gr.ChatInterface(
    fn=chat,
    chatbot=chat_history_panel,
    textbox=prompt_box,
    title="⚡ Lumin Flash (High Performance)",
    description="Backend oficial de inferencia rápida para Lumin Web.",
)

if __name__ == "__main__":
    # Enable the request queue, then serve on all interfaces at port 7860.
    queued_app = demo.queue()
    queued_app.launch(server_name="0.0.0.0", server_port=7860)