---
base_model: unsloth/llama-3.2-11b-vision-instruct-bnb-4bit
tags:
- text-generation-inference
- transformers
- unsloth
- mllama
license: apache-2.0
language:
- en
---

# Uploaded model

- **Developed by:** pollitoconpapass
- **License:** apache-2.0
- **Finetuned from model:** unsloth/llama-3.2-11b-vision-instruct-bnb-4bit

This mllama model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Hugging Face's TRL library.

[<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)

# Implementation

The snippet below loads the model in 4-bit, attaches LoRA adapters, converts the Radiology_mini dataset into chat-format conversations, and runs streamed inference.

```py
from datasets import load_dataset
from unsloth import FastVisionModel

model, tokenizer = FastVisionModel.from_pretrained(
    # "unsloth/Llama-3.2-11B-Vision-Instruct",
    "pollitoconpapass/Llama-3.2-11B-Vision-Radiology-mini",
    load_in_4bit = True,                     # Use 4bit to reduce memory use. False for 16bit LoRA.
    use_gradient_checkpointing = "unsloth",  # True or "unsloth" for long context
)

model = FastVisionModel.get_peft_model(
    model,
    finetune_vision_layers = True,      # False if not finetuning vision layers
    finetune_language_layers = True,    # False if not finetuning language layers
    finetune_attention_modules = True,  # False if not finetuning attention layers
    finetune_mlp_modules = True,        # False if not finetuning MLP layers

    r = 16,           # The larger, the higher the accuracy, but might overfit
    lora_alpha = 16,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
    use_rslora = False,   # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
    # target_modules = "all-linear",  # Optional now! Can specify a list if needed
)

dataset = load_dataset("unsloth/Radiology_mini", split = "train")
instruction = "You are an expert radiographer. Describe accurately what you see in this image."

def convert_to_conversation(sample):
    # Pair each image with the instruction, using the dataset caption as the target answer.
    conversation = [
        {"role": "user",
         "content": [
             {"type": "text", "text": instruction},
             {"type": "image", "image": sample["image"]}]
        },
        {"role": "assistant",
         "content": [
             {"type": "text", "text": sample["caption"]}]
        },
    ]
    return {"messages": conversation}

converted_dataset = [convert_to_conversation(sample) for sample in dataset]
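
# The steps above prepare `converted_dataset`, but the original snippet omits the
# training call itself. The sketch below is a hedged reconstruction of that step,
# modeled on Unsloth's vision finetuning tutorial; the hyperparameters are
# illustrative assumptions, not the exact values used for this model.
from unsloth import is_bf16_supported
from unsloth.trainer import UnslothVisionDataCollator
from trl import SFTTrainer, SFTConfig

FastVisionModel.for_training(model)  # Enable for training!

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    data_collator = UnslothVisionDataCollator(model, tokenizer),  # Required for vision models
    train_dataset = converted_dataset,
    args = SFTConfig(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 30,  # Illustrative; use num_train_epochs = 1 for a full run
        learning_rate = 2e-4,
        fp16 = not is_bf16_supported(),
        bf16 = is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",
        # These settings are required for vision finetuning:
        remove_unused_columns = False,
        dataset_text_field = "",
        dataset_kwargs = {"skip_prepare_dataset": True},
        dataset_num_proc = 4,
        max_seq_length = 2048,
    ),
)
trainer.train()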

FastVisionModel.for_inference(model)  # Enable for inference!

image = dataset[0]["image"]
instruction = "You are an expert radiographer. Describe accurately what you see in this image."

messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction}
    ]}
]
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens = False,
    return_tensors = "pt",
).to("cuda")

from transformers import TextStreamer

text_streamer = TextStreamer(tokenizer, skip_prompt = True)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 500,
                   use_cache = True, temperature = 1.5, min_p = 0.1)
```
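
After training, the LoRA adapters can be saved locally or pushed to the Hub. A minimal sketch, assuming the standard PEFT-style `save_pretrained`/`push_to_hub` methods (the repo name and token below are placeholders):

```py
# Save only the LoRA adapter weights locally (not the full merged model)
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

# Or push the adapters to the Hugging Face Hub (placeholder repo and token)
# model.push_to_hub("your-username/your-repo", token = "hf_...")
# tokenizer.push_to_hub("your-username/your-repo", token = "hf_...")
```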