"""Sample a text continuation from a local Llama checkpoint.

Loads a tokenizer from ``tokenizer.json`` and a ``LlamaForCausalLM`` from the
current directory, samples up to 150 new tokens for a fixed prompt, and prints
the decoded result with special tokens stripped.
"""
import torch
from tokenizers import Tokenizer
from transformers import LlamaForCausalLM, PreTrainedTokenizerFast

# Use the GPU when one is present; otherwise fall back to CPU instead of
# crashing on `.to("cuda")`.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Wrap the raw `tokenizers` object so it exposes the transformers tokenizer
# API, declaring the special tokens explicitly.
tk = PreTrainedTokenizerFast(
    tokenizer_object=Tokenizer.from_file("tokenizer.json"),
    bos_token="<|bos|>",
    eos_token="<|eos|>",
    unk_token="<|unk|>",
    pad_token="<|pad|>",
)

# Weights are read from the current working directory and held in bfloat16.
md = LlamaForCausalLM.from_pretrained(".", torch_dtype=torch.bfloat16).to(device)

# NOTE(review): the prompt embeds a literal "<|bos|>" and encode() runs with
# its default special-token handling. If tokenizer.json's post-processor also
# prepends BOS, the sequence would start with two BOS tokens -- confirm.
ids = tk.encode("<|bos|>The future of AI is", return_tensors="pt").to(device)

gen = md.generate(
    ids,
    # Single, unpadded prompt: every position is a real token, so the mask is
    # all ones. Passing it explicitly keeps generate() from having to guess.
    attention_mask=torch.ones_like(ids),
    max_new_tokens=150,
    do_sample=True,
    temperature=0.7,
    repetition_penalty=1.2,
    no_repeat_ngram_size=3,
    # Take the pad id from the tokenizer rather than assuming it is 0.
    pad_token_id=tk.pad_token_id,
)

print(tk.decode(gen[0], skip_special_tokens=True))