# app.py — corrected and more robust version
import os

import gradio as gr
from huggingface_hub import InferenceClient

MODEL_ID = "mradermacher/sk2decompile-struct-6.7b-GGUF"  # adjust if needed


def make_client(hf_token):
    """Return an InferenceClient, trying the following in order:

    1) the token supplied by gr.LoginButton (hf_token.token)
    2) the HF_TOKEN environment variable (useful as a Secret on a Space)
    3) no token (anonymous) — may fail depending on the model
    """
    # hf_token may be None or an object exposing a .token attribute.
    token = getattr(hf_token, "token", None) if hf_token else None
    if not token:
        token = os.environ.get("HF_TOKEN")
    if token:
        return InferenceClient(token=token, model=MODEL_ID)
    # No token: InferenceClient accepts being constructed anonymously.
    return InferenceClient(model=MODEL_ID)


def extract_token_from_chunk(chunk):
    """Extract the generated fragment from an InferenceClient stream chunk.

    Supports several possible formats:
      - object with .choices[].delta.content
      - dict with choices -> delta -> content
      - top-level 'generated_text' or 'text'

    Returns a string ("" when nothing could be extracted).
    """
    try:
        # Chunks may arrive as objects with attributes.
        if hasattr(chunk, "choices"):
            choices = chunk.choices
            if choices and len(choices) > 0:
                delta = choices[0].delta if hasattr(choices[0], "delta") else None
                if delta:
                    return getattr(delta, "content", "") or ""
        # Or as dict-like payloads.
        if isinstance(chunk, dict):
            # Top-level generated_text / text.
            if chunk.get("generated_text"):
                return chunk["generated_text"]
            if chunk.get("text"):
                return chunk["text"]
            choices = chunk.get("choices") or []
            if len(choices) > 0:
                first = choices[0]
                # delta as a dict.
                delta = first.get("delta") if isinstance(first, dict) else None
                if delta:
                    return delta.get("content", "") or ""
                # Older style: choices[0].message.content.
                msg = first.get("message") if isinstance(first, dict) else None
                if msg and isinstance(msg, dict):
                    return msg.get("content", "") or ""
        # Empty fallback.
        return ""
    except Exception:
        # Best-effort extraction: any unexpected shape yields "".
        return ""


def respond(
    message,
    history: list[dict[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    hf_token: gr.OAuthToken,
):
    """Handler for gr.ChatInterface.

    Returns a generator that yields the cumulative text (streaming) —
    compatible with Gradio.
    """
    try:
        client = make_client(hf_token)
    except Exception as e:
        yield f"Erro ao criar InferenceClient: {e}"
        return

    # Build the message list in the expected role/content format.
    messages = [{"role": "system", "content": system_message}] if system_message else []
    # Depending on the Gradio version, history may already be in "messages"
    # format; we handle a list of role/content dicts or (user, assistant) pairs.
    if history:
        # If history comes as pairs [("user", "..."), ("assistant", "..."), ...], convert.
        if isinstance(history, list) and len(history) and isinstance(history[0], (list, tuple)):
            for user_text, assistant_text in history:
                messages.append({"role": "user", "content": user_text})
                messages.append({"role": "assistant", "content": assistant_text})
        else:
            # Assume history is already role/content dicts (or similar).
            for item in history:
                # Skip anything that is not a well-formed role/content dict.
                if isinstance(item, dict) and "role" in item and "content" in item:
                    messages.append(item)
    messages.append({"role": "user", "content": message})

    response = ""
    try:
        stream = client.chat_completion(
            messages=messages,
            max_tokens=int(max_tokens),
            stream=True,
            temperature=float(temperature),
            top_p=float(top_p),
        )
    except Exception as e:
        yield f"Erro ao chamar chat_completion: {e}"
        return

    # Iterate over the stream, accumulating the extracted text.
    try:
        for chunk in stream:
            fragment = extract_token_from_chunk(chunk)
            if fragment:
                response += fragment
                yield response
        # When the stream ends with no fragments received, fall back to a
        # non-streamed call so the final content is still delivered.
        if response == "":
            try:
                final = client.chat_completion(
                    messages=messages,
                    max_tokens=int(max_tokens),
                    stream=False,
                    temperature=float(temperature),
                    top_p=float(top_p),
                )
                # `final` may be an object or a dict.
                if hasattr(final, "choices"):
                    try:
                        content = final.choices[0].message.content
                    except Exception:
                        content = ""
                elif isinstance(final, dict):
                    # Look for generated_text or choices[0].message.content.
                    content = final.get("generated_text", "") or ""
                    if not content:
                        ch = final.get("choices", [])
                        if len(ch) and isinstance(ch[0], dict):
                            msg = ch[0].get("message", {})
                            if isinstance(msg, dict):
                                content = msg.get("content", "") or ""
                else:
                    content = ""
                if content:
                    response += content
                    yield response
            except Exception:
                # Nothing more we can do — best effort only.
                pass
    except Exception as e:
        yield f"Erro durante streaming: {e}"
        return


# ChatInterface / UI configuration
chatbot = gr.ChatInterface(
    respond,
    type="messages",
    additional_inputs=[
        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
        # LoginButton supplies an OAuth token when the user logs in to Hugging Face.
        gr.LoginButton(),
    ],
)

with gr.Blocks() as demo:
    with gr.Sidebar():
        gr.Markdown("Login com Hugging Face para usar o Inference API (recomendado).")
    chatbot.render()

if __name__ == "__main__":
    demo.launch()