Spaces:
Sleeping
Sleeping
| import spaces | |
| import gradio as gr | |
| from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer | |
| import torch | |
| from threading import Thread | |
# Hugging Face model repos served by this demo: instruction-tuned and
# reasoning ("Think") variants of Tucano 2 at three parameter scales.
MODEL_CHOICES = [
    "Polygl0t/Tucano2-qwen-0.5B-Instruct",
    "Polygl0t/Tucano2-qwen-0.5B-Think",
    "Polygl0t/Tucano2-qwen-1.5B-Instruct",
    "Polygl0t/Tucano2-qwen-1.5B-Think",
    "Polygl0t/Tucano2-qwen-3.7B-Instruct",
    "Polygl0t/Tucano2-qwen-3.7B-Think",
]

# Eagerly download and load every model/tokenizer pair at startup so that
# switching models in the UI never blocks on a cold load.
models, tokenizers = {}, {}
for repo_id in MODEL_CHOICES:
    tokenizers[repo_id] = AutoTokenizer.from_pretrained(repo_id)
    # NOTE(review): `dtype=` is the newer kwarg name; older transformers
    # releases spell it `torch_dtype=` — confirm against the pinned version.
    models[repo_id] = AutoModelForCausalLM.from_pretrained(
        repo_id,
        dtype=torch.float16,
        device_map="auto",
    )

# Default system prompt used when none is provided (Portuguese assistant persona).
DEFAULT_SYSTEM_PROMPT = "Você é Tucano, um assistente de IA útil e amigável. Responda às perguntas de forma clara e concisa, fornecendo informações relevantes e precisas. Seja educado e respeitoso em suas respostas."
| # Text generation function | |
def predict(message, history, system_prompt=None, model_name=None):
    """Stream a chat completion for *message* given the running conversation.

    Args:
        message: The latest user message (plain text).
        history: Prior turns as a list of ``{"role", "content"}`` dicts
            (Gradio "messages" format — confirmed by the dict indexing below).
        system_prompt: Optional system prompt; falls back to
            ``DEFAULT_SYSTEM_PROMPT`` when empty or ``None``.
        model_name: One of ``MODEL_CHOICES``; defaults to the first entry.

    Yields:
        The cumulative assistant reply so far, one streamed token at a time.
    """
    # NOTE(review): the file imports `spaces` but never uses it — on a
    # ZeroGPU Space this function likely needs the `@spaces.GPU` decorator;
    # verify against the Space's hardware configuration.
    model_name = model_name or MODEL_CHOICES[0]
    model = models[model_name]
    tokenizer = tokenizers[model_name]

    class StopOnTokens(StoppingCriteria):
        # Halt generation as soon as the last emitted token is EOS.
        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
            return bool(input_ids[0][-1] == tokenizer.eos_token_id)

    stop = StopOnTokens()

    # The system prompt is always present and never removed during truncation.
    system_message = {"role": "system", "content": system_prompt or DEFAULT_SYSTEM_PROMPT}

    # Build the message list from the conversation history plus the new turn.
    messages = [{"role": m["role"], "content": m["content"]} for m in history]
    messages.append({"role": "user", "content": message})

    # Pop the oldest conversation messages (keeping the system prompt and the
    # latest user message) until the prompt fits within model_max_length.
    # NOTE(review): some tokenizers report a huge sentinel for
    # model_max_length, in which case this loop exits on the first pass.
    max_length = tokenizer.model_max_length
    while True:
        input_text = tokenizer.apply_chat_template(
            [system_message] + messages, tokenize=False, add_generation_prompt=True
        )
        if len(tokenizer.encode(input_text)) <= max_length or len(messages) <= 1:
            break
        messages.pop(0)

    model_inputs = tokenizer([input_text], return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        repetition_penalty=1.2,
        max_new_tokens=1024,
        do_sample=True,
        top_p=.9,
        top_k=50,
        temperature=0.2,
        num_beams=1,
        stopping_criteria=StoppingCriteriaList([stop]),
    )
    # Run generation in a background thread so this generator can consume
    # tokens from the streamer as they are produced.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    partial_message = ""
    for new_token in streamer:
        partial_message += new_token
        # Defensive: the EOS text should not appear (skip_special_tokens=True),
        # but if it leaks through, strip it and emit the final chunk. The
        # original broke BEFORE yielding here, silently dropping any text that
        # arrived in the same chunk as the EOS marker.
        if tokenizer.eos_token and tokenizer.eos_token in partial_message:
            yield partial_message.replace(tokenizer.eos_token, "")
            break
        yield partial_message
# BibTeX citation rendered as markdown below the chat UI. The double
# backslash in the author field yields a literal `\^e` (LaTeX ê) in BibTeX.
CITE_AS = """
## Cite as

```bibtex
@misc{correa2026tucano2cool,
      title={{Tucano 2 Cool: Better Open Source LLMs for Portuguese}},
      author={Nicholas Kluge Corr{\\^e}a and Aniket Sen and Shiza Fatimah and Sophia Falk and Lennard Landgraf and Julia Kastner and Lucie Flek},
      year={2026},
      eprint={2603.03543},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2603.03543},
}
```
"""
# Assemble the Gradio UI: a streaming chat interface plus the citation block.
with gr.Blocks() as demo:
    gr.ChatInterface(
        fn=predict,
        # FIX: predict() indexes history entries as dicts ({"role", "content"}),
        # so the chat history must use the OpenAI-style "messages" format —
        # without this, Gradio's legacy tuple-format history would make
        # predict() raise TypeError on every turn after the first.
        type="messages",
        additional_inputs=[
            gr.Textbox(value=DEFAULT_SYSTEM_PROMPT, label="System Prompt", lines=3, render=False),
            gr.Dropdown(choices=MODEL_CHOICES, value=MODEL_CHOICES[0], label="Model", render=False),
        ],
        title="Tucano 2 Cool 🦜",
        description="Meet Tucano 2 — a family of open-source language models (0.5B to 3.7B parameters) built from the ground up for Portuguese. As of their release, Tucano 2 outperforms most prior Portuguese models of similar size.",
        examples=[
            ["Como eu posso resolver o seguinte problema: 2x + 3 = 11?"],
            ["Qual é a capital do Brasil?"],
            ["Explique a teoria da relatividade de forma simples."],
        ],
        cache_examples=False,
    )
    gr.Markdown(CITE_AS)

if __name__ == "__main__":
    demo.launch()