import os

# must be set before transformers/huggingface_hub is imported, otherwise it has no effect
os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")  # faster downloads on Spaces

import torch
import gradio as gr
from threading import Thread
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

MODEL_ID = "TildeAI/TildeOpen-30b"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)

# load in BF16 and let HF map devices automatically
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# slight speedup on A100
torch.backends.cuda.matmul.allow_tf32 = True

SYS = (
    "You are a helpful multilingual assistant. "
    "This is a *base* model (not instruction tuned); follow the user's request precisely."
)


def build_prompt(history, user_msg):
    # simple conversation transcript; base models don't need a special chat template
    parts = [SYS, ""]
    for u, a in history:
        parts += [f"User: {u}", f"Assistant: {a}"]
    parts += [f"User: {user_msg}", "Assistant:"]
    return "\n".join(parts)


def chat_fn(message, history):
    prompt = build_prompt(history, message)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs = dict(
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        streamer=streamer,
    )
    # run generation in a background thread so tokens can be streamed as they arrive
    t = Thread(target=model.generate, kwargs=gen_kwargs)
    t.start()

    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial


demo = gr.ChatInterface(
    fn=chat_fn,
    title="TildeOpen-30B (Transformers, BF16)",
    description="Base model; multilingual. If the build fails with OOM, switch to Option B (GGUF).",
)

demo.queue().launch()
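
# Illustrative note (not part of the app above): build_prompt assumes Gradio's default
# tuple-style history, i.e. a list of (user, assistant) pairs, and renders a plain
# transcript because this base model ships no chat template. For example,
# build_prompt([("Hi", "Hello!")], "How are you?") returns the SYS text, a blank line,
# and then:
#
#   User: Hi
#   Assistant: Hello!
#   User: How are you?
#   Assistant: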