import os

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# -------------------------
# Config
# -------------------------
MODEL_NAME = "google/gemma-3-1b-it"
HF_TOKEN = os.environ.get("HF_TOKEN")  # <-- set the token in the Space's Secrets (name: HF_TOKEN)

device_id = 0 if torch.cuda.is_available() else -1

# Try to load with device_map to reduce peak RAM where possible
use_device_map = torch.cuda.is_available() or "CUDA_VISIBLE_DEVICES" in os.environ

# -------------------------
# Load tokenizer + model (authenticated)
# -------------------------
# Use 'token=' (newer transformers) to authenticate gated models
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    token=HF_TOKEN,
    device_map="auto" if use_device_map else None,
)

# Create a text-generation pipeline that accepts the structured "messages" format.
# Note: transformers raises an error if `device=` is passed for a model that was
# already placed via `device_map`, so only pass it in the plain CPU/GPU case.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=None if use_device_map else device_id,
)


# -------------------------
# Helper: build messages and call pipeline
# -------------------------
def build_messages(system_message: str, history, user_message: str):
    """
    Build the messages list expected by Gemma-style structured input:
      [
        {"role": "system", "content": [{"type": "text", "text": ...}]},
        {"role": "user", "content": [{"type": "text", "text": ...}]},
        ...
      ]
    The pipeline expects a list of conversations: [[msg1, msg2, ...]] when called.
    """
    msgs = []
    if system_message:
        msgs.append({"role": "system", "content": [{"type": "text", "text": system_message}]})

    # 'history' from Gradio's ChatInterface (type="messages") is a list of dicts
    # like {"role": "...", "content": "..."}; normalize possible key names.
    for item in history or []:
        role = item.get("role", "user")
        text = item.get("content") or item.get("text") or item.get("message") or ""
        msgs.append({"role": role, "content": [{"type": "text", "text": text}]})

    msgs.append({"role": "user", "content": [{"type": "text", "text": user_message}]})
    return [msgs]  # pipeline expects a list of conversations


# -------------------------
# The Gradio respond function
# -------------------------
def respond(message, history, system_message, max_tokens, temperature, top_p):
    conv = build_messages(system_message or "", history or [], message)

    temp = float(temperature) if temperature is not None else 1.0
    gen_kwargs = {"max_new_tokens": int(max_tokens or 128)}
    if temp > 0:
        # temperature/top_p only take effect when do_sample=True
        # (otherwise generation is greedy and these values are ignored)
        gen_kwargs.update(do_sample=True, temperature=temp, top_p=float(top_p or 1.0))

    try:
        outputs = pipe(conv, **gen_kwargs)
    except Exception as e:
        return f"⚠️ Error calling the model: {e}"

    try:
        # The pipeline returns one list of results per conversation;
        # take the first result of the first (and only) conversation.
        first = outputs[0][0]
        gen_text = first.get("generated_text", [])

        # Look for the last "assistant" block
        for block in reversed(gen_text):
            if block.get("role") == "assistant":
                content = block.get("content", "")
                if isinstance(content, list) and len(content) > 0:
                    return content[0].get("text", "").strip()
                elif isinstance(content, str):
                    return content.strip()
        return "⚠️ No 'assistant' block found."
    except Exception as e:
        return f"⚠️ Error processing output: {e}\n\n{outputs}"


# -------------------------
# Gradio UI
# -------------------------
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown(
            "## Gemma 3 1B-IT — Chat (demo)\n"
            "Provide a system message (optional) and ask something."
        )
    with gr.Row():
        system_input = gr.Textbox(value="You are a helpful assistant.", label="System message")
    with gr.Row():
        max_tokens = gr.Slider(minimum=1, maximum=1024, value=128, step=1, label="Max new tokens")
        temperature = gr.Slider(minimum=0.0, maximum=2.0, value=0.7, step=0.05, label="Temperature")
        top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.95, step=0.01, label="Top-p (nucleus sampling)")

    chat = gr.ChatInterface(
        respond,
        type="messages",
        additional_inputs=[system_input, max_tokens, temperature, top_p],
    )

if __name__ == "__main__":
    demo.launch()
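
# -------------------------
# Optional: quick sanity check of build_messages (a minimal sketch, not part of
# the original app; the sample history below is hypothetical). Uncomment to print
# the exact structure handed to the pipeline when the app runs.
# -------------------------
# import json
#
# sample_history = [
#     {"role": "user", "content": "Hi!"},
#     {"role": "assistant", "content": "Hello! How can I help?"},
# ]
# conv = build_messages("You are a helpful assistant.", sample_history, "What is 2 + 2?")
# # Expected shape: [[{"role": "system", ...}, {"role": "user", ...}, ...]]
# print(json.dumps(conv, indent=2, ensure_ascii=False))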