import os
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
# -------------------------
# Config
# -------------------------
MODEL_NAME = "google/gemma-3-1b-it"
HF_TOKEN = os.environ.get("HF_TOKEN")  # <-- set the token in the Space Secrets (name: HF_TOKEN)
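# pipeline() device convention: 0 = the first CUDA GPU, -1 = CPU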
device_id = 0 if torch.cuda.is_available() else -1
# -------------------------
# Load tokenizer + model (authenticated)
# -------------------------
# Use 'token=' (newer transformers) to authenticate gated models
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)
# Try to load with device_map to reduce peak RAM where possible
use_device_map = torch.cuda.is_available() or "CUDA_VISIBLE_DEVICES" in os.environ
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    token=HF_TOKEN,
    device_map="auto" if use_device_map else None,
)
# Create a text-generation pipeline that accepts the structured "messages" format
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    # An accelerate-dispatched model (device_map) must not also get an explicit device
    device=None if use_device_map else device_id,
)
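# A minimal sketch of a direct call with one structured conversation (kept as a
# comment so it does not run at Space startup; the prompt text and the
# `sample_out` name are illustrative only):
# sample_out = pipe(
#     [[{"role": "user", "content": [{"type": "text", "text": "Hello"}]}]],
#     max_new_tokens=16,
# )
# sample_out[0][0]["generated_text"] then holds the conversation with the
# model's reply appended as the last "assistant" message.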
# -------------------------
# Helper: build messages and call pipeline
# -------------------------
def build_messages(system_message: str, history, user_message: str):
    """
    Build the messages list expected by Gemma-style structured input:
    [ {role: "system", content: [{"type":"text","text":...}]}, {role:"user", content:[...]}, ... ]
    The pipeline expects a list-of-lists: [[msg1, msg2, ...]] when called.
    """
    msgs = []
    if system_message:
        msgs.append({"role": "system", "content": [{"type": "text", "text": system_message}]})
    # 'history' from Gradio ChatInterface (type="messages") is a list of dicts
    # shaped like {"role": "...", "content": "..."}
    for item in history or []:
        # normalize possible key names
        role = item.get("role", "user")
        text = item.get("content") or item.get("text") or item.get("message") or ""
        msgs.append({"role": role, "content": [{"type": "text", "text": text}]})
    msgs.append({"role": "user", "content": [{"type": "text", "text": user_message}]})
    return [msgs]  # pipeline expects a list of conversations
# -------------------------
# The Gradio respond function
# -------------------------
def respond(message, history, system_message, max_tokens, temperature, top_p):
    conv = build_messages(system_message or "", history or [], message)
    try:
        outputs = pipe(
            conv,
            max_new_tokens=int(max_tokens or 128),
            temperature=float(temperature or 1.0),
            top_p=float(top_p or 1.0),
        )
    except Exception as e:
        return f"⚠️ Error while calling the model: {e}"
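    # Assumed output shape for a chat-style text-generation pipeline:
    # outputs[conversation][sequence]["generated_text"] is the full message list,
    # with the model's reply appended as a final {"role": "assistant", ...} entry.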
    try:
        # Access the first element of the inner list (first conversation, first sequence)
        first = outputs[0][0]
        gen_text = first.get("generated_text", [])
        # Look for the last "assistant" block
        for block in reversed(gen_text):
            if block.get("role") == "assistant":
                content = block.get("content", "")
                if isinstance(content, list) and len(content) > 0:
                    return content[0].get("text", "").strip()
                elif isinstance(content, str):
                    return content.strip()
        return "⚠️ No 'assistant' block found."
    except Exception as e:
        return f"⚠️ Error while processing the output: {e}\n\n{outputs}"
# -------------------------
# Gradio UI
# -------------------------
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("## Gemma 3 1B-IT — Chat (demo)\nProvide a system message (optional) and ask something.")
    with gr.Row():
        system_input = gr.Textbox(value="You are a helpful assistant.", label="System message")
    with gr.Row():
        max_tokens = gr.Slider(minimum=1, maximum=1024, value=128, step=1, label="Max new tokens")
        temperature = gr.Slider(minimum=0.0, maximum=2.0, value=0.7, step=0.05, label="Temperature")
        top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.95, step=0.01, label="Top-p (nucleus sampling)")
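    # The additional_inputs below are forwarded to respond() after (message, history),
    # in this order: system_message, max_tokens, temperature, top_p.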
    chat = gr.ChatInterface(respond, type="messages", additional_inputs=[system_input, max_tokens, temperature, top_p])
if __name__ == "__main__":
    demo.launch()