import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from threading import Thread

# Model checkpoint pulled from the Hugging Face Hub.
MODEL_NAME = "S1mp1eXXX/Nimi-1b-thinking"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Half precision on GPU to halve memory use; full precision on CPU, where
# float16 is poorly supported. device_map="auto" lets accelerate place the
# weights on whatever device is available.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)

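# Hedged alternative for memory-constrained GPUs: the same checkpoint could be
# loaded in 4-bit via bitsandbytes instead (assumes bitsandbytes is installed
# and a CUDA device is present), roughly:
#
#   from transformers import BitsAndBytesConfig
#   model = AutoModelForCausalLM.from_pretrained(
#       MODEL_NAME,
#       quantization_config=BitsAndBytesConfig(load_in_4bit=True),
#       device_map="auto",
#   )
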

def respond(message, history, system_message, max_tokens, temperature, top_p):
    # Flatten the system message, prior turns, and the new user message into a
    # plain "role: content" prompt.
    prompt = system_message + "\n"
    for h in history:  # history arrives as [{"role": ..., "content": ...}, ...]
        prompt += f"{h['role']}: {h['content']}\n"
    prompt += f"user: {message}\nassistant:"

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # The streamer yields decoded text as it is generated; skip_prompt keeps
    # the echoed input out of the reply.
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    generation_kwargs = dict(
        **inputs,
        max_new_tokens=max_tokens,
        do_sample=True,  # without this, temperature and top_p are ignored
        temperature=temperature,
        top_p=top_p,
        streamer=streamer,
    )

    # generate() blocks until it finishes, so run it on a background thread
    # and consume the streamer from this one.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Yield the accumulated reply so the UI re-renders it as it grows.
    partial_output = ""
    for new_token in streamer:
        partial_output += new_token
        yield partial_output

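
# Hedged alternative to the manual "role: content" prompt in respond(): if the
# checkpoint ships a chat template, apply_chat_template formats the
# conversation the way the model was trained to see it. build_prompt is a
# hypothetical helper for illustration, not wired into the app; it falls back
# to the manual format when no template is defined.
def build_prompt(message, history, system_message):
    msgs = (
        [{"role": "system", "content": system_message}]
        + list(history)
        + [{"role": "user", "content": message}]
    )
    if tokenizer.chat_template is not None:
        return tokenizer.apply_chat_template(
            msgs, tokenize=False, add_generation_prompt=True
        )
    return "\n".join(f"{m['role']}: {m['content']}" for m in msgs) + "\nassistant:"
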

# type="messages" makes Gradio pass history as a list of {"role", "content"}
# dicts, matching the loop in respond().
chatbot = gr.ChatInterface(
    respond,
    type="messages",
    additional_inputs=[
        gr.Textbox(value="You are a helpful assistant.", label="System message"),
        gr.Slider(1, 2048, value=512, step=1, label="Max new tokens"),
        gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p"),
    ],
)


if __name__ == "__main__":
    chatbot.launch()
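    # Hedged note: launch() also accepts server_name/server_port for remote
    # access and share=True for a temporary public link; the defaults above
    # are assumed fine for a local run or a Hugging Face Space.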