import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the fine-tuned tokenizer and model from the checkpoint directory
output_dir = "./checkpoint-2"
tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForCausalLM.from_pretrained(output_dir, device_map="auto")
# We use the tokenizer's chat template to format each message - see https://huggingface.co/docs/transformers/main/en/chat_templating
messages = [
    {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
]

# Prepare the messages for the model and move them to the model's device
# (using model.device keeps this working with device_map="auto")
input_ids = tokenizer.apply_chat_template(
    messages, truncation=True, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

# Inference with sampling
outputs = model.generate(
    input_ids=input_ids,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])