# Tucano2Cool / app.py
# Uploaded by nicholasKluge ("Update app.py", commit 4be1b0e, verified)
import spaces
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
import torch
from threading import Thread
# Model identifiers served by this demo; weights and tokenizers are fetched
# from the Hugging Face Hub at startup.
MODEL_CHOICES = [
    "Polygl0t/Tucano2-qwen-0.5B-Instruct",
    "Polygl0t/Tucano2-qwen-0.5B-Think",
    "Polygl0t/Tucano2-qwen-1.5B-Instruct",
    "Polygl0t/Tucano2-qwen-1.5B-Think",
    "Polygl0t/Tucano2-qwen-3.7B-Instruct",
    "Polygl0t/Tucano2-qwen-3.7B-Think",
]

# Eagerly load every tokenizer and model once, keyed by model id, so no
# request ever pays the download/initialization cost.
tokenizers = {name: AutoTokenizer.from_pretrained(name) for name in MODEL_CHOICES}
models = {
    name: AutoModelForCausalLM.from_pretrained(
        name,
        dtype=torch.float16,
        device_map="auto",
    )
    for name in MODEL_CHOICES
}

# Default system prompt used when none is provided
DEFAULT_SYSTEM_PROMPT = "Você é Tucano, um assistente de IA útil e amigável. Responda às perguntas de forma clara e concisa, fornecendo informações relevantes e precisas. Seja educado e respeitoso em suas respostas."
# Text generation function
@spaces.GPU
def predict(message, history, system_prompt=None, model_name=None):
    """Stream a chat completion for ``message`` given the conversation ``history``.

    Args:
        message: Latest user message (plain text).
        history: Prior turns as ``{"role": ..., "content": ...}`` dicts —
            i.e. the Gradio "messages" history format.
        system_prompt: Optional system prompt; falls back to
            ``DEFAULT_SYSTEM_PROMPT`` when empty or ``None``.
        model_name: Key into the preloaded ``models``/``tokenizers`` dicts;
            defaults to the first entry of ``MODEL_CHOICES``.

    Yields:
        The accumulated partial assistant response after each streamed token.
    """
    model_name = model_name or MODEL_CHOICES[0]
    model = models[model_name]
    tokenizer = tokenizers[model_name]

    class StopOnTokens(StoppingCriteria):
        # Stop generation as soon as the most recent token is EOS.
        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
            # bool() so we return a plain Python bool, not a 0-dim tensor.
            return bool(input_ids[0][-1] == tokenizer.eos_token_id)

    stop = StopOnTokens()

    # The system prompt is always present and never removed during truncation
    system_message = {"role": "system", "content": system_prompt or DEFAULT_SYSTEM_PROMPT}

    # Build the message list from conversation history
    messages = [{"role": msg["role"], "content": msg["content"]} for msg in history]
    messages.append({"role": "user", "content": message})

    # Pop oldest conversation messages (keeping the system prompt and the
    # latest user message) until the prompt fits within model_max_length.
    # NOTE(review): some tokenizers report a huge sentinel model_max_length,
    # in which case this loop exits on the first pass.
    max_length = tokenizer.model_max_length
    while True:
        input_text = tokenizer.apply_chat_template(
            [system_message] + messages, tokenize=False, add_generation_prompt=True
        )
        if len(tokenizer.encode(input_text)) <= max_length or len(messages) <= 1:
            break
        messages.pop(0)

    model_inputs = tokenizer([input_text], return_tensors="pt").to(model.device)
    # skip_prompt avoids echoing the prompt; timeout guards against a hung
    # generation thread leaving the consumer loop blocked forever.
    streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        repetition_penalty=1.2,
        max_new_tokens=1024,
        do_sample=True,
        top_p=.9,
        top_k=50,
        temperature=0.2,
        num_beams=1,
        stopping_criteria=StoppingCriteriaList([stop]),
    )

    # Run generation on a worker thread; the streamer feeds decoded text back
    # into this generator, which Gradio consumes for live updates.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    partial_message = ""
    for new_token in streamer:
        partial_message += new_token
        if tokenizer.eos_token and tokenizer.eos_token in partial_message:
            # Fix: yield the text accumulated *before* the EOS marker instead
            # of breaking first and silently dropping the final chunk.
            yield partial_message.split(tokenizer.eos_token)[0]
            break
        yield partial_message
# BibTeX citation block, rendered as Markdown beneath the chat interface.
CITE_AS = """
## Cite as
```bibtex
@misc{correa2026tucano2cool,
title={{Tucano 2 Cool: Better Open Source LLMs for Portuguese}},
author={Nicholas Kluge Corr{\\^e}a and Aniket Sen and Shiza Fatimah and Sophia Falk and Lennard Landgraf and Julia Kastner and Lucie Flek},
year={2026},
eprint={2603.03543},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2603.03543},
}
```
"""
# Assemble the Gradio UI: a streaming chat interface plus the citation note.
with gr.Blocks() as demo:
    gr.ChatInterface(
        fn=predict,
        # Fix: predict() indexes history entries as dicts (msg["role"]), which
        # requires the "messages" history format; the legacy tuple format
        # would raise a TypeError inside predict.
        type="messages",
        additional_inputs=[
            gr.Textbox(value=DEFAULT_SYSTEM_PROMPT, label="System Prompt", lines=3, render=False),
            gr.Dropdown(choices=MODEL_CHOICES, value=MODEL_CHOICES[0], label="Model", render=False),
        ],
        title="Tucano 2 Cool 🦜",
        description="Meet Tucano 2 — a family of open-source language models (0.5B to 3.7B parameters) built from the ground up for Portuguese. As of their release, Tucano 2 outperforms most prior Portuguese models of similar size.",
        examples=[
            ["Como eu posso resolver o seguinte problema: 2x + 3 = 11?"],
            ["Qual é a capital do Brasil?"],
            ["Explique a teoria da relatividade de forma simples."],
        ],
        cache_examples=False,
    )
    gr.Markdown(CITE_AS)

if __name__ == "__main__":
    demo.launch()