"""Gradio chat UI that streams replies from TheDrummer/Tiger-Gemma-9B-v3."""

from threading import Thread

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "TheDrummer/Tiger-Gemma-9B-v3"

# Tokenizer and model are loaded once at import time and shared by every
# request handled by `respond`.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)


def _content_text(content):
    """Return the plain text of a message's content, or None if it has none.

    Accepts either a string or a list of typed parts (dicts carrying a
    "text" entry); any other content (e.g. file attachments) yields None.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        texts = [
            part["text"]
            for part in content
            if isinstance(part, dict) and isinstance(part.get("text"), str)
        ]
        if texts:
            return "\n".join(texts)
    return None


def _history_to_messages(history):
    """Convert chat history into a list of chat-template message dicts.

    Two history layouts are supported:

    * ``(user_msg, bot_msg)`` pairs, and
    * ``{"role": ..., "content": ...}`` dicts.

    Only "user" and "assistant" turns with text content are kept; no system
    prompt is ever emitted. An incomplete pair (either side missing) is
    dropped as a whole so that user/assistant turns keep alternating.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            text = _content_text(turn.get("content"))
            if turn.get("role") in ("user", "assistant") and text is not None:
                messages.append({"role": turn["role"], "content": text})
            continue

        user_msg, bot_msg = turn
        if isinstance(user_msg, str) and isinstance(bot_msg, str):
            messages.append({"role": "user", "content": user_msg})
            messages.append({"role": "assistant", "content": bot_msg})
    return messages


def respond(message, history):
    """Stream the model's reply to ``message`` given the prior ``history``.

    Args:
        message: The new user message.
        history: Earlier turns, either as ``(user, assistant)`` pairs or as
            ``{"role", "content"}`` dicts.

    Yields:
        The reply accumulated so far, growing with each decoded chunk.

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread,
            re-raised here once streaming has stopped.
    """
    # Build conversation (NO system prompt)
    messages = _history_to_messages(history)
    messages.append({"role": "user", "content": message})

    # return_dict=True also yields the attention mask, which is forwarded to
    # generate() together with the input ids.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )
    failures = []

    def _generate():
        # Runs in a worker thread so the caller can consume the streamer
        # while tokens are still being produced.
        try:
            model.generate(
                **inputs,
                max_new_tokens=512,
                temperature=0.7,
                do_sample=True,
                streamer=streamer,
            )
        except Exception as exc:  # thread boundary: hand the error to the caller
            failures.append(exc)
            # Without the end-of-stream signal the consumer loop below would
            # block forever waiting for the next chunk.
            streamer.end()

    worker = Thread(target=_generate)
    worker.start()

    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial

    worker.join()
    if failures:
        raise failures[0]


demo = gr.ChatInterface(
    fn=respond,
    title="Tiger-Gemma 9B Chat",
    description="Powered by TheDrummer/Tiger-Gemma-9B-v3",
)

if __name__ == "__main__":
    # share=True asks Gradio for a publicly reachable share link.
    demo.launch(share=True)