from transformers import LlamaForCausalLM, AutoTokenizer
import torch

model_path = "./result"
model = LlamaForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

def chat():
    print("Chat")
    print("Type 'exit' to quit.")
    chat_history = ""
    while True:
        user_input = input("You: ").strip()
        if user_input.lower() in ["exit", "quit"]:
            print("Bye")
            break
        # Append the user turn, then prompt the model to continue as "Bot:".
        chat_history += f"You: {user_input}\n"
        prompt = chat_history + "Bot:"
        inputs = tokenizer(prompt, add_special_tokens=False, return_tensors="pt").to(model.device)
        with torch.no_grad():
            tokens = model.generate(
                **inputs,
                max_new_tokens=150,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                pad_token_id=tokenizer.eos_token_id,  # silences the missing-pad-token warning
            )
        # Strip the prompt by token count rather than character count: decoding
        # the full sequence and slicing with len(prompt) can misalign whenever
        # the tokenizer does not round-trip the prompt text exactly.
        new_tokens = tokens[0][inputs["input_ids"].shape[-1]:]
        bot_response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
        print(f"Bot: {bot_response}")
        # Keep the exchange so later turns see the full conversation so far.
        chat_history += f"Bot: {bot_response}\n"

if __name__ == "__main__":
    chat()