---
library_name: transformers
license: llama3.1
language:
- en
base_model: unsloth/llama-3-8b-bnb-4bit
---

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
from peft import PeftModel, PeftConfig
from unsloth import FastLanguageModel
import torch

alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Load the PEFT (LoRA adapter) configuration
config = PeftConfig.from_pretrained("umairimran/medical_chatbot_model_trained_on_ruslanmv_ai-medical-chatbot")

# Load the base model in float16 and let Accelerate place it on the available devices
base_model = AutoModelForCausalLM.from_pretrained(
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    torch_dtype=torch.float16,
    device_map="auto",
)

# Attach the LoRA adapter to the base model
model = PeftModel.from_pretrained(base_model, "umairimran/medical_chatbot_model_trained_on_ruslanmv_ai-medical-chatbot")

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("unsloth/Meta-Llama-3.1-8B-bnb-4bit")

# Optimize the model for inference with Unsloth
FastLanguageModel.for_inference(model)

# Prepare the input; no explicit .to("cuda") is needed because device_map="auto" handles placement
inputs = tokenizer(
    alpaca_prompt.format(
        "Hello doctor, can you understand me? I want to know about the flu.",  # instruction
        "I don't know much about the flu.",                                    # input
        "",                                                                    # output - leave this blank for generation!
    ),
    return_tensors="pt",
)

# Stream the generated tokens to stdout as they are produced
text_streamer = TextStreamer(tokenizer)

# Generate the response
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)
```
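
If you want to capture the response as a string instead of streaming it to stdout, you can generate without the streamer and decode the output tokens. This is a minimal sketch that reuses the `model`, `tokenizer`, and `inputs` objects defined above; the generation settings (such as `max_new_tokens=128`) are simply carried over from the example and can be tuned.

```python
# Minimal sketch: generate without the streamer and decode the full output.
# Assumes `model`, `tokenizer`, and `inputs` are already defined as in the snippet above.
output_ids = model.generate(**inputs, max_new_tokens=128)

# Decode the generated tokens, dropping special tokens from the text
response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(response)
```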