"""Interactive prompt loop for a causal-LM checkpoint with an attached adapter.

Run as a script: loads the 8-bit model, adapter and tokenizer from
``save_dir``, then reads prompts from stdin until the user types ``exit``
(or sends EOF / Ctrl-C) and prints one sampled completion per prompt.
"""

# Directory passed to every ``from_pretrained`` / ``load_adapter`` call below.
save_dir = "checkpoint-1750"

# Prompts longer than this many tokens are truncated before generation.
MAX_PROMPT_TOKENS = 256
# ``max_length`` handed to ``generate`` (bounds prompt + generated tokens).
MAX_TOTAL_TOKENS = 1024


def strip_echoed_question(text, response):
    """Return *response* with a leading repeat of the question removed.

    The decoded output of a causal LM normally starts with the prompt itself,
    so the question would otherwise be printed back to the user.

    Args:
        text: The prompt exactly as the user typed it.
        response: The decoded model output.

    Returns:
        ``response`` without the echoed question, or ``response`` unchanged
        when no echo is recognised.
    """
    prompt = text.strip()
    # Whole-prompt echo. Matching the full prompt (rather than cutting at the
    # first '?') also handles prompts that contain several question marks or
    # trailing whitespace.
    if prompt.endswith("?") and response.startswith(prompt):
        return response[len(prompt):]

    # Legacy heuristic, kept for backward compatibility: the response repeats
    # the prompt up to its first '?', and the prompt is no longer than that.
    q_pos = response.find("?")
    if q_pos != -1 and len(text) - 1 <= q_pos and response[:q_pos] == text[:q_pos]:
        return response[q_pos + 1:]

    return response


def main(checkpoint_dir=save_dir):
    """Load the model, adapter and tokenizer, then run the interactive loop.

    Args:
        checkpoint_dir: Directory to load the model, adapter and tokenizer
            from. Defaults to the module-level ``save_dir``.
    """
    # Imported here so the module (and ``strip_echoed_question``) can be
    # imported without pulling in the heavy ``transformers`` dependency.
    from transformers import AutoModelForCausalLM, AutoTokenizer

    print("Loading model from checkpoint...")
    model = AutoModelForCausalLM.from_pretrained(checkpoint_dir, load_in_8bit=True)
    print("Attaching adapter...")
    model.load_adapter(checkpoint_dir, adapter_name="Adapter1")
    print("Loading tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)

    while True:
        try:
            text = input(">>> ")
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C at the prompt ends the session without a traceback.
            print()
            break
        if text == "exit":
            break

        # ``truncation=True`` is required for ``max_length`` to take effect;
        # without it the tokenizer leaves over-long prompts untouched.
        model_inputs = tokenizer(
            [text],
            return_tensors="pt",
            truncation=True,
            max_length=MAX_PROMPT_TOKENS,
        ).to(model.device)  # follow the model's device instead of assuming "cuda"

        generated_ids = model.generate(
            **model_inputs,
            max_length=MAX_TOTAL_TOKENS,
            temperature=0.1,
            do_sample=True,
            # Pad with EOS; presumably the tokenizer defines no pad token.
            pad_token_id=tokenizer.eos_token_id,
        )
        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        response = strip_echoed_question(text, response)
        print(f"\n\t<<< {response} >>>\n")


if __name__ == "__main__":
    main()