import fire
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
def main(
    base_model: str, instruction: str, lora_weights: str = None, device: str = "cuda:0"
):
    """Generate a completion for *instruction* with a causal LM, optionally LoRA-adapted.

    Args:
        base_model: Hugging Face model id or local path of the base causal LM.
        instruction: Prompt text fed verbatim to the tokenizer.
        lora_weights: Optional path/id of PEFT LoRA adapter weights. When None,
            the base model is used as-is (the original code crashed in this case
            by passing None to PeftModel.from_pretrained).
        device: Torch device string for both the model and the input ids.
    """
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    model = AutoModelForCausalLM.from_pretrained(
        base_model,
        torch_dtype=torch.bfloat16,
        device_map=device,
    )
    if lora_weights is not None:
        # BUG FIX: only attach the adapter when weights were actually given.
        # NOTE(review): adapter dtype (float32) differs from the bfloat16 base —
        # kept as in the original, but confirm this mix is intentional.
        model = PeftModel.from_pretrained(
            model, lora_weights, torch_dtype=torch.float32
        )
    model.eval()  # disable dropout etc. for deterministic inference
    input_ids = tokenizer(instruction, return_tensors="pt").input_ids.to(device)
    with torch.inference_mode():
        outputs = model.generate(
            input_ids=input_ids,
            max_new_tokens=128,
        )
    # BUG FIX: the original decoded first and then sliced the *string* by the
    # prompt's *token* count, which mangles the result (tokens != characters).
    # Strip the prompt at the token level, then decode only the new tokens.
    generated_token_ids = outputs[0][input_ids.shape[-1]:]
    output = tokenizer.decode(generated_token_ids, skip_special_tokens=True)
    print(f"Prompt:\n{instruction}\n")
    print(f"Generated:\n{output}")
if __name__ == "__main__":
    # CLI entry point: python-fire exposes main()'s parameters as command-line
    # arguments/flags (e.g. --base_model ... --instruction ...).
    fire.Fire(main)