import gradio as gr
import torch
from threading import Thread
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer

MODEL_ID = "TildeAI/TildeOpen-30b"

# Tokenizer MUST be the slow version per the model card
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False)

# Load the model on GPU in BF16
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# TildeOpen-30B is a base model (not instruction-tuned), so this system line is
# just a plain-text priming prefix, not a real system-role message.
SYSTEM = (
    "You are a helpful multilingual assistant. "
    "Follow the user's request precisely."
)


def format_history(history, user_msg):
    # The base model has no chat template, so we build a plain-text prompt with
    # ad-hoc <|user|>/<|assistant|> turn markers. `history` is assumed to arrive
    # as Gradio's tuple-style list of (user, assistant) pairs.
    prompt = SYSTEM + "\n\n"
    for u, a in history:
        prompt += f"<|user|>\n{u}\n<|assistant|>\n{a}\n"
    prompt += f"<|user|>\n{user_msg}\n<|assistant|>\n"
    return prompt


def chat_fn(message, history):
    prompt = format_history(history, message)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Stream tokens back to the UI as they are generated
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    gen_kwargs = dict(
        **inputs,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        streamer=streamer,
    )

    # Run generation in a background thread so we can iterate the streamer here
    thread = Thread(target=model.generate, kwargs=gen_kwargs)
    thread.start()

    partial = ""
    for new_text in streamer:
        partial += new_text
        yield partial


demo = gr.ChatInterface(
    fn=chat_fn,
    title="TildeOpen-30B (Transformers, BF16)",
    description="Base model (not instruction-tuned). Multilingual; context length 8192.",
)
demo.queue().launch()
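
# Note: because the model is a base LM, generation will not reliably stop at the
# end of the assistant turn and may run on to invent a next "<|user|>" turn.
# Below is a minimal sketch of one way to cut it off, assuming the turn markers
# used in format_history above. StopOnMarker is a hypothetical helper, not part
# of the original script; StoppingCriteria/StoppingCriteriaList are the real
# transformers classes.
from transformers import StoppingCriteria, StoppingCriteriaList


class StopOnMarker(StoppingCriteria):
    """Stop generation once the decoded continuation contains `marker`."""

    def __init__(self, tokenizer, marker, prompt_len):
        self.tokenizer = tokenizer
        self.marker = marker
        self.prompt_len = prompt_len  # number of prompt tokens to skip when decoding

    def __call__(self, input_ids, scores, **kwargs):
        # Decode only the newly generated continuation, not the prompt.
        text = self.tokenizer.decode(
            input_ids[0, self.prompt_len:], skip_special_tokens=True
        )
        return self.marker in text


# Wiring it into chat_fn (sketch): after tokenizing the prompt, add
#   stop = StoppingCriteriaList(
#       [StopOnMarker(tokenizer, "<|user|>", inputs["input_ids"].shape[1])]
#   )
#   gen_kwargs["stopping_criteria"] = stop
# The streamed text can still include the marker itself, so the yield loop
# should also trim `partial` at the first occurrence of "<|user|>".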