---
library_name: transformers
license: llama3.1
language:
- en
base_model: unsloth/llama-3-8b-bnb-4bit
---
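
The snippet below shows how to run the model for inference: it loads the 4-bit Llama 3.1 base model, attaches the LoRA adapter from this repository, and streams a generation using an Alpaca-style prompt. It assumes `transformers`, `peft`, `bitsandbytes`, `unsloth`, and a CUDA-capable GPU are available.
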
```python
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
from peft import PeftModel, PeftConfig
from unsloth import FastLanguageModel
import torch

# Alpaca-style prompt template: instruction, optional input, and the response to generate
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}
### Input:
{}
### Response:
{}"""

# Load the PEFT (LoRA adapter) configuration
config = PeftConfig.from_pretrained("umairimran/medical_chatbot_model_trained_on_ruslanmv_ai-medical-chatbot")

# Load the 4-bit base model in float16 and let accelerate place it on the available devices
base_model = AutoModelForCausalLM.from_pretrained(
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    torch_dtype=torch.float16,
    device_map="auto",
)

# Attach the LoRA adapter to the base model
model = PeftModel.from_pretrained(base_model, "umairimran/medical_chatbot_model_trained_on_ruslanmv_ai-medical-chatbot")

# Initialize the tokenizer from the same base model
tokenizer = AutoTokenizer.from_pretrained("unsloth/Meta-Llama-3.1-8B-bnb-4bit")

# Optimize the model for inference with Unsloth
FastLanguageModel.for_inference(model)

# Build the prompt; device_map="auto" handles device placement, so no explicit .to("cuda") is needed
inputs = tokenizer(
    alpaca_prompt.format(
        "Hello doctor, can you tell me about the flu?",  # instruction
        "I don't know anything about the flu.",          # input
        "",  # response - leave this blank so the model generates it
    ),
    return_tensors="pt",
)

# Stream generated tokens to stdout as they are produced
text_streamer = TextStreamer(tokenizer)

# Generate the response
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)
```
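
The response slot in the prompt is left empty so the model fills it in during generation; increase `max_new_tokens` for longer answers.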