| | import spaces |
| | import threading |
| | import gradio as gr |
| | from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer |
| |
|
| | |
# Hugging Face Hub checkpoint served by this app.
model_name = "kz919/QwQ-0.5B-Distilled-SFT"

# Load the tokenizer and the causal-LM weights once at import time so every
# request reuses the same objects.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
# Move the weights onto the CUDA device (same effect as chaining `.to("cuda")`
# directly on the `from_pretrained` call).
model = model.to("cuda")
| |
|
| | |
@spaces.GPU
def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p):
    """Stream the model's reply to ``message`` for the Gradio chat UI.

    Args:
        message: The newest user message.
        history: Earlier turns as ``(user_text, assistant_text)`` pairs, oldest
            first.
        system_message: Text placed in the leading ``system`` chat turn.
        max_tokens: Upper bound on the number of *newly generated* tokens
            (the prompt length is not counted against it).
        temperature: Sampling temperature forwarded to ``model.generate``.
        top_p: Nucleus-sampling threshold forwarded to ``model.generate``.

    Yields:
        The reply accumulated so far, growing by one decoded chunk per yield.
    """
    # Rebuild the whole conversation in chat-template message form.
    msg = [{"role": "system", "content": system_message}]
    for user_input, assistant_response in history:
        msg.append({"role": "user", "content": user_input})
        msg.append({"role": "assistant", "content": assistant_response})
    msg.append({"role": "user", "content": message})

    # Render the messages to a prompt string that ends with the assistant
    # generation prefix, then tokenize it onto the GPU.
    prompt = tokenizer.apply_chat_template(
        msg,
        tokenize=False,
        add_generation_prompt=True,
    )
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    # The streamer hands decoded text chunks from the generation thread to
    # this generator; the prompt itself and special tokens are skipped.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        # Forward every tokenizer output (input_ids AND attention_mask).
        # Passing only input_ids drops the mask, which cannot be inferred
        # reliably when the pad token id equals the EOS token id.
        inputs,
        # `max_new_tokens` limits only the completion. The previous
        # `max_length` also counted the prompt, so a long history could leave
        # little or no budget for the answer despite the "Max new tokens" UI
        # label. The slider value is coerced to int as a token count.
        max_new_tokens=int(max_tokens),
        streamer=streamer,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        pad_token_id=tokenizer.eos_token_id,
    )

    # `generate` blocks until completion, so it runs in a worker thread while
    # this generator consumes the streamer.
    generation_thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    generation_thread.start()

    text_buffer = ""
    for new_text in streamer:
        text_buffer += new_text
        yield text_buffer

    # The streamer is exhausted, so generation is finishing; reap the worker
    # instead of leaving it dangling. Deliberately not in a `finally`: if the
    # consumer closes this generator early we must not block on the thread.
    generation_thread.join()
| |
|
| |
|
| | |
# Extra controls shown beneath the chat box. Their values are passed to
# `respond` after (message, history), in the order listed in
# `additional_inputs` below.
system_message_box = gr.Textbox(
    value="You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.",
    label="System message",
)
max_tokens_slider = gr.Slider(
    minimum=1,
    maximum=16384,
    value=512,
    step=1,
    label="Max new tokens",
)
temperature_slider = gr.Slider(
    minimum=0.1,
    maximum=4.0,
    value=0.7,
    step=0.1,
    label="Temperature",
)
top_p_slider = gr.Slider(
    minimum=0.1,
    maximum=1.0,
    value=0.95,
    step=0.05,
    label="Top-p (nucleus sampling)",
)

# Chat UI that streams whatever `respond` yields.
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        system_message_box,
        max_tokens_slider,
        temperature_slider,
        top_p_slider,
    ],
)


# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
| |
|
| |
|