---
library_name: transformers
license: llama3.1
language:
- en
base_model: unsloth/Meta-Llama-3.1-8B-bnb-4bit
---
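This repository contains a LoRA adapter for Llama 3.1 8B, fine-tuned for medical question answering on the ruslanmv/ai-medical-chatbot dataset. The example below loads the 4-bit quantized base model, attaches the adapter with PEFT, and streams a generated answer.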
```python
from unsloth import FastLanguageModel  # import unsloth first so its patches apply before transformers loads
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
from peft import PeftModel, PeftConfig
import torch

# Alpaca-style prompt template used during fine-tuning.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""
```
```python
# Load the adapter configuration (optional; PeftModel.from_pretrained reads it itself).
config = PeftConfig.from_pretrained(
    "umairimran/medical_chatbot_model_trained_on_ruslanmv_ai-medical-chatbot"
)

# Load the 4-bit quantized base model in float16 and map it across available devices.
base_model = AutoModelForCausalLM.from_pretrained(
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    torch_dtype=torch.float16,
    device_map="auto",
)

# Attach the LoRA adapter to the base model.
model = PeftModel.from_pretrained(
    base_model,
    "umairimran/medical_chatbot_model_trained_on_ruslanmv_ai-medical-chatbot",
)

# Initialize the tokenizer from the same base checkpoint.
tokenizer = AutoTokenizer.from_pretrained("unsloth/Meta-Llama-3.1-8B-bnb-4bit")

# Optimize the model for inference with Unsloth.
FastLanguageModel.for_inference(model)
```
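`FastLanguageModel.for_inference` enables Unsloth's optimized generation path. With the model ready, format a prompt, tokenize it, and generate; `TextStreamer` prints tokens to stdout as they are produced, so the answer appears incrementally: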
```python
# Build the prompt and move the input tensors to the model's device.
inputs = tokenizer(
    alpaca_prompt.format(
        "Hello doctor, can you help me? I want to know about the flu.",  # instruction
        "I don't know anything about the flu.",  # input
        "",  # response - leave this blank for generation!
    ),
    return_tensors="pt",
).to(model.device)

# Stream tokens to stdout as they are generated.
text_streamer = TextStreamer(tokenizer)

# Generate the answer.
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=128)
```
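Alternatively, Unsloth can usually load the base model and adapter in one step by pointing `FastLanguageModel.from_pretrained` at the adapter repository. This is a minimal sketch rather than the card's original recipe; `max_seq_length=2048` is an assumed value, so adjust it to your context length.

```python
from unsloth import FastLanguageModel

# Minimal alternative sketch (assumption: Unsloth detects the LoRA weights
# in the adapter repo automatically). max_seq_length=2048 is an assumed
# value, not taken from this card.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="umairimran/medical_chatbot_model_trained_on_ruslanmv_ai-medical-chatbot",
    max_seq_length=2048,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)  # enable Unsloth's fast inference path
```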