def chat(message, history):
    """Stream an assistant reply for *message*, given prior chat *history*.

    Args:
        message: The latest user message (str).
        history: Prior turns; each item is indexable as (user_text, assistant_text).
            NOTE(review): assumes the pair-style history format — confirm against caller.

    Yields:
        The reply accumulated so far (str), once per generated token, so the
        caller can render a progressively growing response.
    """
    conversation = [{"role": "system", "content": "You are a lightning-fast assistant."}]
    for pair in history:
        conversation.extend(
            (
                {"role": "user", "content": pair[0]},
                {"role": "assistant", "content": pair[1]},
            )
        )
    conversation.append({"role": "user", "content": message})

    # Tokenize via the model's chat template; move tensors to the model's
    # device so generate() can consume them directly.
    prompt_ids = tokenizer.apply_chat_template(
        conversation,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    # Streamer hands back decoded text token-by-token; skip the prompt echo
    # and any special tokens.
    token_stream = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    # generate() blocks, so run it on a worker thread and consume the
    # streamer on this one.
    worker = Thread(
        target=model.generate,
        kwargs=dict(
            input_ids=prompt_ids,
            streamer=token_stream,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
        ),
    )
    worker.start()

    reply = ""
    for token_text in token_stream:
        reply += token_text
        yield reply