def chat(message, history):
    """Stream an assistant reply for *message*, given the prior *history*.

    Args:
        message: The latest user message (str).
        history: Prior turns as (user, assistant) pairs — presumably
            Gradio tuple-style history; TODO confirm against the caller.

    Yields:
        str: The accumulated partial reply, re-emitted after each new token.
    """
    messages = [{"role": "system", "content": "You are a lightning-fast assistant."}]
    for user_turn, assistant_turn in history:
        messages.append({"role": "user", "content": user_turn})
        messages.append({"role": "assistant", "content": assistant_turn})
    messages.append({"role": "user", "content": message})

    # Render the conversation with the model's chat template and move the
    # resulting input-id tensor onto the model's device.
    model_inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    # The streamer yields decoded text pieces as generate() produces them;
    # skip_prompt keeps the echoed input out of the output.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generate_kwargs = dict(
        input_ids=model_inputs,
        streamer=streamer,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
    )

    # Run generation on a worker thread so this generator can consume the
    # streamer concurrently.
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()

    partial_message = ""
    for new_token in streamer:
        partial_message += new_token
        yield partial_message

    # Fix: join the worker thread once the streamer is exhausted — the
    # original never joined it, leaving the generation thread dangling.
    t.join()