import os

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# -------------------------
# Config
# -------------------------
MODEL_NAME = "google/gemma-3-1b-it"
HF_TOKEN = os.environ.get("HF_TOKEN")  # <-- set the token in the Space's Secrets (name: HF_TOKEN)

device_id = 0 if torch.cuda.is_available() else -1

# Try to load with device_map to reduce peak RAM where possible
use_device_map = torch.cuda.is_available() or "CUDA_VISIBLE_DEVICES" in os.environ

# -------------------------
# Load tokenizer + model (authenticated)
# -------------------------
# Use 'token=' (newer transformers) to authenticate gated models
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_TOKEN)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    token=HF_TOKEN,
    device_map="auto" if use_device_map else None,
)

# Create a text-generation pipeline that accepts the structured "messages" format.
# Note: transformers raises an error if `device=` is passed for a model that was
# already placed via `device_map`, so only pass it in the plain CPU/GPU case.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=None if use_device_map else device_id,
)


# -------------------------
# Helper: build messages and call pipeline
# -------------------------
def build_messages(system_message: str, history, user_message: str):
    """
    Build the messages list expected by Gemma-style structured input:
      [
        {"role": "system", "content": [{"type": "text", "text": ...}]},
        {"role": "user", "content": [{"type": "text", "text": ...}]},
        ...
      ]
    The pipeline expects a list of conversations: [[msg1, msg2, ...]] when called.
    """
    msgs = []
    if system_message:
        msgs.append({"role": "system", "content": [{"type": "text", "text": system_message}]})

    # 'history' from Gradio's ChatInterface (type="messages") is a list of dicts
    # like {"role": "...", "content": "..."}; normalize possible key names.
    for item in history or []:
        role = item.get("role", "user")
        text = item.get("content") or item.get("text") or item.get("message") or ""
        msgs.append({"role": role, "content": [{"type": "text", "text": text}]})

    msgs.append({"role": "user", "content": [{"type": "text", "text": user_message}]})
    return [msgs]  # pipeline expects a list of conversations


# -------------------------
# The Gradio respond function
# -------------------------
def respond(message, history, system_message, max_tokens, temperature, top_p):
    conv = build_messages(system_message or "", history or [], message)

    temp = float(temperature) if temperature is not None else 1.0
    gen_kwargs = {"max_new_tokens": int(max_tokens or 128)}
    if temp > 0:
        # temperature/top_p only take effect when do_sample=True
        # (otherwise generation is greedy and these values are ignored)
        gen_kwargs.update(do_sample=True, temperature=temp, top_p=float(top_p or 1.0))

    try:
        outputs = pipe(conv, **gen_kwargs)
    except Exception as e:
        return f"⚠️ Error calling the model: {e}"

    try:
        # The pipeline returns one list of results per conversation;
        # take the first result of the first (and only) conversation.
        first = outputs[0][0]
        gen_text = first.get("generated_text", [])

        # Look for the last "assistant" block
        for block in reversed(gen_text):
            if block.get("role") == "assistant":
                content = block.get("content", "")
                if isinstance(content, list) and len(content) > 0:
                    return content[0].get("text", "").strip()
                elif isinstance(content, str):
                    return content.strip()
        return "⚠️ No 'assistant' block found."
    except Exception as e:
        return f"⚠️ Error processing output: {e}\n\n{outputs}"


# -------------------------
# Gradio UI
# -------------------------
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown(
            "## Gemma 3 1B-IT — Chat (demo)\n"
            "Provide a system message (optional) and ask something."
        )
    with gr.Row():
        system_input = gr.Textbox(value="You are a helpful assistant.", label="System message")
    with gr.Row():
        max_tokens = gr.Slider(minimum=1, maximum=1024, value=128, step=1, label="Max new tokens")
        temperature = gr.Slider(minimum=0.0, maximum=2.0, value=0.7, step=0.05, label="Temperature")
        top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.95, step=0.01, label="Top-p (nucleus sampling)")

    chat = gr.ChatInterface(
        respond,
        type="messages",
        additional_inputs=[system_input, max_tokens, temperature, top_p],
    )

if __name__ == "__main__":
    demo.launch()
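
# -------------------------
# Optional: quick sanity check of build_messages (a minimal sketch, not part of
# the original app; the sample history below is hypothetical). Uncomment to print
# the exact structure handed to the pipeline when the app runs.
# -------------------------
# import json
#
# sample_history = [
#     {"role": "user", "content": "Hi!"},
#     {"role": "assistant", "content": "Hello! How can I help?"},
# ]
# conv = build_messages("You are a helpful assistant.", sample_history, "What is 2 + 2?")
# # Expected shape: [[{"role": "system", ...}, {"role": "user", ...}, ...]]
# print(json.dumps(conv, indent=2, ensure_ascii=False))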