# app.py — corrected and more robust version
import os

import gradio as gr
from huggingface_hub import InferenceClient

MODEL_ID = "mradermacher/sk2decompile-struct-6.7b-GGUF"  # adjust if needed


def make_client(hf_token):
    """Return an InferenceClient, trying the following in order:

    1) the token supplied by gr.LoginButton (hf_token.token)
    2) the HF_TOKEN environment variable (useful as a Secret on a Space)
    3) no token (anonymous) — may fail depending on the model
    """
    # hf_token may be None or an object exposing a .token attribute.
    token = getattr(hf_token, "token", None) if hf_token else None
    if not token:
        token = os.environ.get("HF_TOKEN")
    if token:
        return InferenceClient(token=token, model=MODEL_ID)
    # No token: InferenceClient accepts being constructed anonymously.
    return InferenceClient(model=MODEL_ID)


def extract_token_from_chunk(chunk):
    """Extract the generated fragment from an InferenceClient stream chunk.

    Supports several possible formats:
      - object with .choices[].delta.content
      - dict with choices -> delta -> content
      - top-level 'generated_text' or 'text'

    Returns a string ("" when nothing could be extracted).
    """
    try:
        # Chunks may arrive as objects with attributes.
        if hasattr(chunk, "choices"):
            choices = chunk.choices
            if choices and len(choices) > 0:
                delta = choices[0].delta if hasattr(choices[0], "delta") else None
                if delta:
                    return getattr(delta, "content", "") or ""
        # Or as dict-like payloads.
        if isinstance(chunk, dict):
            # Top-level generated_text / text.
            if chunk.get("generated_text"):
                return chunk["generated_text"]
            if chunk.get("text"):
                return chunk["text"]
            choices = chunk.get("choices") or []
            if len(choices) > 0:
                first = choices[0]
                # delta as a dict.
                delta = first.get("delta") if isinstance(first, dict) else None
                if delta:
                    return delta.get("content", "") or ""
                # Older style: choices[0].message.content.
                msg = first.get("message") if isinstance(first, dict) else None
                if msg and isinstance(msg, dict):
                    return msg.get("content", "") or ""
        # Empty fallback.
        return ""
    except Exception:
        # Best-effort extraction: any unexpected shape yields "".
        return ""


def respond(
    message,
    history: list[dict[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    hf_token: gr.OAuthToken,
):
    """Handler for gr.ChatInterface.

    Returns a generator that yields the cumulative text (streaming) —
    compatible with Gradio.
    """
    try:
        client = make_client(hf_token)
    except Exception as e:
        yield f"Erro ao criar InferenceClient: {e}"
        return

    # Build the message list in the expected role/content format.
    messages = [{"role": "system", "content": system_message}] if system_message else []
    # Depending on the Gradio version, history may already be in "messages"
    # format; we handle a list of role/content dicts or (user, assistant) pairs.
    if history:
        # If history comes as pairs [("user", "..."), ("assistant", "..."), ...], convert.
        if isinstance(history, list) and len(history) and isinstance(history[0], (list, tuple)):
            for user_text, assistant_text in history:
                messages.append({"role": "user", "content": user_text})
                messages.append({"role": "assistant", "content": assistant_text})
        else:
            # Assume history is already role/content dicts (or similar).
            for item in history:
                # Skip anything that is not a well-formed role/content dict.
                if isinstance(item, dict) and "role" in item and "content" in item:
                    messages.append(item)
    messages.append({"role": "user", "content": message})

    response = ""
    try:
        stream = client.chat_completion(
            messages=messages,
            max_tokens=int(max_tokens),
            stream=True,
            temperature=float(temperature),
            top_p=float(top_p),
        )
    except Exception as e:
        yield f"Erro ao chamar chat_completion: {e}"
        return

    # Iterate over the stream, accumulating the extracted text.
    try:
        for chunk in stream:
            fragment = extract_token_from_chunk(chunk)
            if fragment:
                response += fragment
                yield response
        # When the stream ends with no fragments received, fall back to a
        # non-streamed call so the final content is still delivered.
        if response == "":
            try:
                final = client.chat_completion(
                    messages=messages,
                    max_tokens=int(max_tokens),
                    stream=False,
                    temperature=float(temperature),
                    top_p=float(top_p),
                )
                # `final` may be an object or a dict.
                if hasattr(final, "choices"):
                    try:
                        content = final.choices[0].message.content
                    except Exception:
                        content = ""
                elif isinstance(final, dict):
                    # Look for generated_text or choices[0].message.content.
                    content = final.get("generated_text", "") or ""
                    if not content:
                        ch = final.get("choices", [])
                        if len(ch) and isinstance(ch[0], dict):
                            msg = ch[0].get("message", {})
                            if isinstance(msg, dict):
                                content = msg.get("content", "") or ""
                else:
                    content = ""
                if content:
                    response += content
                    yield response
            except Exception:
                # Nothing more we can do — best effort only.
                pass
    except Exception as e:
        yield f"Erro durante streaming: {e}"
        return


# ChatInterface / UI configuration
chatbot = gr.ChatInterface(
    respond,
    type="messages",
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
        # LoginButton supplies an OAuth token when the user logs in to Hugging Face.
        gr.LoginButton(),
    ],
)

with gr.Blocks() as demo:
    with gr.Sidebar():
        gr.Markdown("Login com Hugging Face para usar o Inference API (recomendado).")
    chatbot.render()

if __name__ == "__main__":
    demo.launch()