# Tucano2Cool / app.py
# Uploaded by nicholasKluge ("Update app.py", commit 4be1b0e, verified)
import spaces
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer
import torch
from threading import Thread
# Model identifiers served by this demo; weights and tokenizers are fetched
# from the Hugging Face Hub at startup.
MODEL_CHOICES = [
    "Polygl0t/Tucano2-qwen-0.5B-Instruct",
    "Polygl0t/Tucano2-qwen-0.5B-Think",
    "Polygl0t/Tucano2-qwen-1.5B-Instruct",
    "Polygl0t/Tucano2-qwen-1.5B-Think",
    "Polygl0t/Tucano2-qwen-3.7B-Instruct",
    "Polygl0t/Tucano2-qwen-3.7B-Think",
]

# Eagerly load every tokenizer and model once, keyed by model id, so no
# request ever pays the download/initialization cost.
tokenizers = {name: AutoTokenizer.from_pretrained(name) for name in MODEL_CHOICES}
models = {
    name: AutoModelForCausalLM.from_pretrained(
        name,
        dtype=torch.float16,
        device_map="auto",
    )
    for name in MODEL_CHOICES
}

# Default system prompt used when none is provided
DEFAULT_SYSTEM_PROMPT = "Você é Tucano, um assistente de IA útil e amigável. Responda às perguntas de forma clara e concisa, fornecendo informações relevantes e precisas. Seja educado e respeitoso em suas respostas."
# Text generation function
@spaces.GPU
def predict(message, history, system_prompt=None, model_name=None):
    """Stream a chat completion for ``message`` given the conversation ``history``.

    Args:
        message: Latest user message (plain text).
        history: Prior turns as ``{"role": ..., "content": ...}`` dicts —
            i.e. the Gradio "messages" history format.
        system_prompt: Optional system prompt; falls back to
            ``DEFAULT_SYSTEM_PROMPT`` when empty or ``None``.
        model_name: Key into the preloaded ``models``/``tokenizers`` dicts;
            defaults to the first entry of ``MODEL_CHOICES``.

    Yields:
        The accumulated partial assistant response after each streamed token.
    """
    model_name = model_name or MODEL_CHOICES[0]
    model = models[model_name]
    tokenizer = tokenizers[model_name]

    class StopOnTokens(StoppingCriteria):
        # Stop generation as soon as the most recent token is EOS.
        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
            # bool() so we return a plain Python bool, not a 0-dim tensor.
            return bool(input_ids[0][-1] == tokenizer.eos_token_id)

    stop = StopOnTokens()

    # The system prompt is always present and never removed during truncation
    system_message = {"role": "system", "content": system_prompt or DEFAULT_SYSTEM_PROMPT}

    # Build the message list from conversation history
    messages = [{"role": msg["role"], "content": msg["content"]} for msg in history]
    messages.append({"role": "user", "content": message})

    # Pop oldest conversation messages (keeping the system prompt and the
    # latest user message) until the prompt fits within model_max_length.
    # NOTE(review): some tokenizers report a huge sentinel model_max_length,
    # in which case this loop exits on the first pass.
    max_length = tokenizer.model_max_length
    while True:
        input_text = tokenizer.apply_chat_template(
            [system_message] + messages, tokenize=False, add_generation_prompt=True
        )
        if len(tokenizer.encode(input_text)) <= max_length or len(messages) <= 1:
            break
        messages.pop(0)

    model_inputs = tokenizer([input_text], return_tensors="pt").to(model.device)
    # skip_prompt avoids echoing the prompt; timeout guards against a hung
    # generation thread leaving the consumer loop blocked forever.
    streamer = TextIteratorStreamer(tokenizer, timeout=10., skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        repetition_penalty=1.2,
        max_new_tokens=1024,
        do_sample=True,
        top_p=.9,
        top_k=50,
        temperature=0.2,
        num_beams=1,
        stopping_criteria=StoppingCriteriaList([stop]),
    )

    # Run generation on a worker thread; the streamer feeds decoded text back
    # into this generator, which Gradio consumes for live updates.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    partial_message = ""
    for new_token in streamer:
        partial_message += new_token
        if tokenizer.eos_token and tokenizer.eos_token in partial_message:
            # Fix: yield the text accumulated *before* the EOS marker instead
            # of breaking first and silently dropping the final chunk.
            yield partial_message.split(tokenizer.eos_token)[0]
            break
        yield partial_message
# BibTeX citation block, rendered as Markdown beneath the chat interface.
CITE_AS = """
## Cite as
```bibtex
@misc{correa2026tucano2cool,
title={{Tucano 2 Cool: Better Open Source LLMs for Portuguese}},
author={Nicholas Kluge Corr{\\^e}a and Aniket Sen and Shiza Fatimah and Sophia Falk and Lennard Landgraf and Julia Kastner and Lucie Flek},
year={2026},
eprint={2603.03543},
archivePrefix={arXiv},
primaryClass={cs.CL},
url={https://arxiv.org/abs/2603.03543},
}
```
"""
# Assemble the Gradio UI: a streaming chat interface plus the citation note.
with gr.Blocks() as demo:
    gr.ChatInterface(
        fn=predict,
        # Fix: predict() indexes history entries as dicts (msg["role"]), which
        # requires the "messages" history format; the legacy tuple format
        # would raise a TypeError inside predict.
        type="messages",
        additional_inputs=[
            gr.Textbox(value=DEFAULT_SYSTEM_PROMPT, label="System Prompt", lines=3, render=False),
            gr.Dropdown(choices=MODEL_CHOICES, value=MODEL_CHOICES[0], label="Model", render=False),
        ],
        title="Tucano 2 Cool 🦜",
        description="Meet Tucano 2 — a family of open-source language models (0.5B to 3.7B parameters) built from the ground up for Portuguese. As of their release, Tucano 2 outperforms most prior Portuguese models of similar size.",
        examples=[
            ["Como eu posso resolver o seguinte problema: 2x + 3 = 11?"],
            ["Qual é a capital do Brasil?"],
            ["Explique a teoria da relatividade de forma simples."],
        ],
        cache_examples=False,
    )
    gr.Markdown(CITE_AS)

if __name__ == "__main__":
    demo.launch()