|
|
import gradio as gr |
|
|
from unsloth import FastLanguageModel |
|
|
from transformers import TextIteratorStreamer |
|
|
from threading import Thread |
|
|
|
|
|
|
|
|
# Load the fine-tuned model and its tokenizer from the current directory,
# then switch unsloth's optimized inference mode on.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=".",       # weights are expected in the current working directory
    max_seq_length=8192,  # context window; matches the max_new_tokens budget used below
    dtype='bf16',         # NOTE(review): unsloth examples pass torch.bfloat16 or None — confirm the 'bf16' string is accepted by the installed version
    load_in_4bit=False,   # full-precision weights; no 4-bit quantization
)

# Enables inference-only fast paths (disables training-specific behavior).
FastLanguageModel.for_inference(model)
|
|
|
|
|
def get_streaming_generator(model, tokenizer, history, max_new_tokens=8192):
    """Start generation in a background thread and return a text streamer.

    Args:
        model: Causal LM (unsloth/transformers) used for generation.
        tokenizer: Matching tokenizer; must define a chat template.
        history: List of [user_message, assistant_message] pairs. A missing or
            empty assistant slot means the model should answer that turn.
        max_new_tokens: Maximum number of tokens to generate for the reply.

    Returns:
        A TextIteratorStreamer yielding decoded text chunks as the
        background generation thread produces them.
    """
    # Convert Gradio's pair-based history into chat-template message dicts.
    formatted_history = []
    for exchange in history:
        formatted_history.append({"role": "user", "content": exchange[0]})
        if len(exchange) > 1 and exchange[1]:
            formatted_history.append({"role": "assistant", "content": exchange[1]})

    inputs = tokenizer(
        [
            tokenizer.apply_chat_template(
                formatted_history,
                tokenize=False,
                # Append the assistant prefix so the model produces a reply.
                add_generation_prompt=True,
            ),
        ],
        return_tensors="pt",
        padding=True,
        return_attention_mask=True,
    ).to("cuda")  # assumes a CUDA device is available — TODO confirm for CPU deployments

    # skip_prompt=True avoids re-emitting the input prompt;
    # skip_special_tokens=True keeps markers like </s> / <|eot_id|> out of the UI.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        streamer=streamer,
        max_new_tokens=max_new_tokens,
    )

    # daemon=True so a stuck generate() call cannot block interpreter shutdown.
    thread = Thread(target=model.generate, kwargs=generation_kwargs, daemon=True)
    thread.start()

    return streamer
|
|
|
|
|
def predict(message, history):
    """Generator for Gradio: stream the model's reply into the chat history.

    Appends a new [user, assistant] pair to the history and yields the whole
    history after each streamed chunk so the UI updates incrementally.
    """
    chat = history if history else []
    chat.append([message, ""])

    token_stream = get_streaming_generator(model, tokenizer, chat)

    pieces = []
    for chunk in token_stream:
        pieces.append(chunk)
        # Rewrite the assistant slot of the latest turn with everything so far.
        chat[-1][1] = "".join(pieces)
        yield chat
|
|
|
|
|
def clear_chat():
    """Reset the UI state: empty chat history plus a blank message box."""
    empty_history, empty_message = [], ""
    return empty_history, empty_message
|
|
|
|
|
|
|
|
# Build the chat UI: a markdown/LaTeX-capable chatbot pane, a multi-line
# input box, and submit/clear buttons wired to the streaming `predict`
# generator. The CSS preserves whitespace inside chat bubbles.
with gr.Blocks(css=".message { white-space: pre-wrap; }") as iface:
    chatbot = gr.Chatbot(
        show_label=False,
        container=True,
        height=600,
        bubble_full_width=False,  # NOTE(review): deprecated/removed in recent Gradio releases — confirm installed version
        render_markdown=True,
        # Render $$...$$ as display math and $...$ as inline math.
        latex_delimiters=[
            {"left": "$$", "right": "$$", "display": True},
            {"left": "$", "right": "$", "display": False},
        ],
    )
    msg = gr.Textbox(
        label="Message",
        placeholder="Type your message here... (Markdown supported)",
        lines=2  # multi-line input box
    )
    submit = gr.Button("Submit")
    clear = gr.Button("Clear")

    # Pressing Enter in the textbox streams the reply into the chatbot,
    # then clears the textbox once the generator finishes.
    msg.submit(
        predict,
        [msg, chatbot],
        [chatbot],
        api_name="predict"  # exposes this event on the Gradio API
    ).then(
        lambda: "", None, [msg]
    )

    # The Submit button mirrors the textbox's Enter behavior.
    submit.click(
        predict,
        [msg, chatbot],
        [chatbot]
    ).then(
        lambda: "", None, [msg]
    )

    # The Clear button resets both the chatbot history and the textbox.
    clear.click(
        clear_chat,
        None,
        [chatbot, msg],
        queue=False  # run immediately, bypassing the request queue
    )
|
|
|
|
|
# Launch the Gradio server when run as a script (default host/port).
if __name__ == "__main__":
    iface.launch()