# Hugging Face Space (status: Sleeping) - file size: 1,380 bytes
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
import torch
from threading import Thread
# Hub identifier of the checkpoint; used for both the tokenizer and the model.
model_id = "TheDrummer/Tiger-Gemma-9B-v3"

# Tokenizer is loaded once at import time and shared by every request.
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the weights in bfloat16 and let the library choose device placement.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
def _history_to_messages(history):
    """Convert chat-UI history into a list of role/content message dicts."""
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            # Messages-style history: keep only the fields the chat template
            # needs and drop any extra UI keys.
            content = turn.get("content")
            if isinstance(content, list):
                # NOTE(review): assumes list-style content holds strings or
                # {"text": ...} parts - confirm against the Gradio version used.
                content = "".join(
                    part.get("text", "") if isinstance(part, dict) else str(part)
                    for part in content
                )
            messages.append({"role": turn.get("role"), "content": content})
        else:
            # Pair-style history: one (user, assistant) entry per exchange.
            user_msg, bot_msg = turn
            messages.append({"role": "user", "content": user_msg})
            messages.append({"role": "assistant", "content": bot_msg})
    return messages


def respond(message, history):
    """Stream the model's reply to ``message`` given the earlier ``history``.

    Args:
        message: Text of the newest user message.
        history: Earlier turns from the chat UI, either as
            ``(user, assistant)`` pairs or as ``{"role", "content"}`` dicts.

    Yields:
        The reply accumulated so far; each yielded string extends the
        previous one by the newly decoded chunk.
    """
    # Build conversation (NO system prompt)
    messages = _history_to_messages(history)
    messages.append({"role": "user", "content": message})

    # Ask for a dict explicitly so the result does not depend on the library's
    # default return type, and so the attention mask reaches generate().
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    # skip_prompt=True keeps the echoed conversation out of the streamed text.
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    # generate() blocks until finished, so it runs on a worker thread while
    # this generator consumes the streamer.
    worker = Thread(
        target=model.generate,
        kwargs=dict(
            **inputs,
            max_new_tokens=512,
            temperature=0.7,
            do_sample=True,
            streamer=streamer,
        ),
    )
    worker.start()

    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial

    # The streamer is exhausted once generation is done; reap the worker.
    worker.join()
# Build the chat UI around respond() and start serving it.
# share=True asks Gradio for a public share link in addition to the local URL.
gr.ChatInterface(
    fn=respond,
    title="Tiger-Gemma 9B Chat",
    description="Powered by TheDrummer/Tiger-Gemma-9B-v3",
).launch(share=True)