# app.py — versão corrigida e mais robusta
import os
import gradio as gr
from huggingface_hub import InferenceClient
MODEL_ID = "mradermacher/sk2decompile-struct-6.7b-GGUF" # ajuste se necessário
def make_client(hf_token):
    """Build an InferenceClient for MODEL_ID.

    Token resolution order:
      1) token supplied by gr.LoginButton (``hf_token.token``)
      2) ``HF_TOKEN`` environment variable (useful as a Space Secret)
      3) no token (anonymous) — may fail depending on the model

    Parameters:
        hf_token: the OAuth token object from gr.LoginButton, or None.

    Returns:
        A configured huggingface_hub.InferenceClient.
    """
    # hf_token may be None or lack a .token attribute; getattr with a
    # default replaces the original try/except around attribute access.
    token = getattr(hf_token, "token", None) if hf_token else None
    if not token:
        token = os.environ.get("HF_TOKEN")
    if token:
        return InferenceClient(token=token, model=MODEL_ID)
    # No token available: InferenceClient also works without one (anonymous),
    # though access may be rejected depending on the model.
    return InferenceClient(model=MODEL_ID)
def extract_token_from_chunk(chunk):
    """Pull the newly generated text fragment out of one streaming chunk.

    Supports the shapes the InferenceClient stream may emit:
      - an object with ``.choices[0].delta.content``
      - a dict with ``choices -> delta -> content``
      - a dict with top-level ``generated_text`` or ``text``
      - older dict style: ``choices -> message -> content``

    Returns the fragment string, or "" when nothing can be extracted.
    """
    try:
        # Attribute-style chunk (dataclass-like objects from the client).
        attr_choices = getattr(chunk, "choices", None)
        if attr_choices:
            delta = getattr(attr_choices[0], "delta", None)
            if delta:
                return getattr(delta, "content", "") or ""
        # Dict-style chunk.
        if isinstance(chunk, dict):
            # Top-level payloads first.
            for key in ("generated_text", "text"):
                if chunk.get(key):
                    return chunk[key]
            dict_choices = chunk.get("choices") or []
            if dict_choices:
                first = dict_choices[0]
                if isinstance(first, dict):
                    delta = first.get("delta")
                    if delta:
                        return delta.get("content", "") or ""
                    # Older style: message/content.
                    message = first.get("message")
                    if isinstance(message, dict):
                        return message.get("content", "") or ""
        # Unknown shape: nothing to extract.
        return ""
    except Exception:
        # Never let a malformed chunk break the streaming loop.
        return ""
def respond(
    message,
    history: list[dict[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    hf_token: gr.OAuthToken,
):
    """Handler for gr.ChatInterface.

    A generator that yields the cumulative reply text as it streams in,
    which is the shape Gradio expects for streaming chat handlers.
    Errors are reported to the UI as yielded strings rather than raised.
    """
    try:
        client = make_client(hf_token)
    except Exception as e:
        yield f"Erro ao criar InferenceClient: {e}"
        return

    # Assemble the role/content message list expected by chat_completion.
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})
    # Depending on the Gradio version, history may already be role/content
    # dicts, or legacy (user, assistant) pairs — handle both.
    if history:
        legacy_pairs = (
            isinstance(history, list)
            and len(history)
            and isinstance(history[0], (list, tuple))
        )
        if legacy_pairs:
            for user_text, assistant_text in history:
                messages.append({"role": "user", "content": user_text})
                messages.append({"role": "assistant", "content": assistant_text})
        else:
            # Keep only well-formed role/content dicts; ignore anything else.
            for entry in history:
                if isinstance(entry, dict) and "role" in entry and "content" in entry:
                    messages.append(entry)
    messages.append({"role": "user", "content": message})

    response = ""
    try:
        stream = client.chat_completion(
            messages=messages,
            max_tokens=int(max_tokens),
            stream=True,
            temperature=float(temperature),
            top_p=float(top_p),
        )
    except Exception as e:
        yield f"Erro ao chamar chat_completion: {e}"
        return

    # Consume the stream, accumulating and re-yielding the growing text.
    try:
        for chunk in stream:
            fragment = extract_token_from_chunk(chunk)
            if fragment:
                response += fragment
                yield response
        if response == "":
            # Stream yielded no fragments: retry once without streaming.
            try:
                final = client.chat_completion(
                    messages=messages,
                    max_tokens=int(max_tokens),
                    stream=False,
                    temperature=float(temperature),
                    top_p=float(top_p),
                )
                # final may be an object or a dict.
                if hasattr(final, "choices"):
                    try:
                        content = final.choices[0].message.content
                    except Exception:
                        content = ""
                elif isinstance(final, dict):
                    # Look for generated_text, then choices[0].message.content.
                    content = final.get("generated_text", "") or ""
                    if not content:
                        dict_choices = final.get("choices", [])
                        if len(dict_choices) and isinstance(dict_choices[0], dict):
                            msg = dict_choices[0].get("message", {})
                            if isinstance(msg, dict):
                                content = msg.get("content", "") or ""
                else:
                    content = ""
                if content:
                    response += content
                    yield response
            except Exception:
                # Best-effort fallback only; nothing more we can do.
                pass
    except Exception as e:
        yield f"Erro durante streaming: {e}"
        return
# --- ChatInterface / UI wiring ---
extra_inputs = [
    gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
    gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
    gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
    gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    # The LoginButton supplies an OAuth token once the user signs in to Hugging Face.
    gr.LoginButton(),
]

chatbot = gr.ChatInterface(
    respond,
    type="messages",
    additional_inputs=extra_inputs,
)

with gr.Blocks() as demo:
    with gr.Sidebar():
        gr.Markdown("Login com Hugging Face para usar o Inference API (recomendado).")
    chatbot.render()

if __name__ == "__main__":
    demo.launch()