import gradio as gr
import torch
from threading import Thread
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

MODEL_ID = "TildeAI/TildeOpen-30b"

# Tokenizer MUST be the slow version per the model card
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)

# Load the model on GPU in BF16
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# TildeOpen-30B is a base model (not instruction-tuned), so this system line is
# just a plain-text priming prefix, not a real system-role message.
SYSTEM = (
    "You are a helpful multilingual assistant. "
    "Follow the user's request precisely."
)


def format_history(history, user_msg):
    # The base model has no chat template, so we build a plain-text prompt with
    # ad-hoc <|user|>/<|assistant|> turn markers. `history` is assumed to arrive
    # as Gradio's tuple-style list of (user, assistant) pairs.
    prompt = SYSTEM + "\n\n"
    for u, a in history:
        prompt += f"<|user|>\n{u}\n<|assistant|>\n{a}\n"
    prompt += f"<|user|>\n{user_msg}\n<|assistant|>\n"
    return prompt


def chat_fn(message, history):
    prompt = format_history(history, message)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Stream tokens back to the UI as they are generated
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs = dict(
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        streamer=streamer,
    )

    # Run generation in a background thread so we can iterate the streamer here
    thread = Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()

    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial


demo = gr.ChatInterface(
    fn=chat_fn,
    title="TildeOpen-30B (Transformers, BF16)",
    description="Base model (not instruction-tuned). Multilingual; context length 8192.",
)
demo.queue().launch()
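
# Note: because the model is a base LM, generation will not reliably stop at the
# end of the assistant turn and may run on to invent a next "<|user|>" turn.
# Below is a minimal sketch of one way to cut it off, assuming the turn markers
# used in format_history above. StopOnMarker is a hypothetical helper, not part
# of the original script; StoppingCriteria/StoppingCriteriaList are the real
# transformers classes.
from transformers import StoppingCriteria, StoppingCriteriaList


class StopOnMarker(StoppingCriteria):
    """Stop generation once the decoded continuation contains `marker`."""

    def __init__(self, tokenizer, marker, prompt_len):
        self.tokenizer = tokenizer
        self.marker = marker
        self.prompt_len = prompt_len  # number of prompt tokens to skip when decoding

    def __call__(self, input_ids, scores, **kwargs):
        # Decode only the newly generated continuation, not the prompt.
        text = self.tokenizer.decode(
            input_ids[0, self.prompt_len:], skip_special_tokens=True
        )
        return self.marker in text


# Wiring it into chat_fn (sketch): after tokenizing the prompt, add
#   stop = StoppingCriteriaList(
#       [StopOnMarker(tokenizer, "<|user|>", inputs["input_ids"].shape[1])]
#   )
#   gen_kwargs["stopping_criteria"] = stop
# The streamed text can still include the marker itself, so the yield loop
# should also trim `partial` at the first occurrence of "<|user|>".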