---
base_model:
- meta-llama/Llama-3.2-11B-Vision-Instruct
tags:
- text-generation-inference
- transformers
- mllama
- trl
license: apache-2.0
language:
- en
datasets:
- unsloth/RLAIF-V-Dataset
pipeline_tag: image-text-to-text
---
# Uploaded model

- **Developed by:** saishshinde15
- **License:** apache-2.0
- **Finetuned from model:** meta-llama/Llama-3.2-11B-Vision-Instruct
## How to use this model

- Use Unsloth for faster model downloads and faster inference. You can also use the `transformers` library from Hugging Face; a minimal `transformers`-only sketch follows the Unsloth example below.
```python
from unsloth import FastVisionModel
from PIL import Image
import requests
from transformers import TextStreamer

# Load the model and tokenizer
model, tokenizer = FastVisionModel.from_pretrained(
    model_name="saishshinde15/VisionAI",  # the fine-tuned model repo
    load_in_4bit=False,  # set to False for 16-bit LoRA
)

# Enable the model for inference
FastVisionModel.for_inference(model)

# Load the image from a URL
url = 'your image url'
image = Image.open(requests.get(url, stream=True).raw)

# Define the instruction and user query
instruction = (
    "You are an expert in answering questions related to the image provided: "
    "Answer the questions given by the user accurately by referring to the image."
)
query = "What is this image about?"

# Create the chat message structure
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": instruction},
        {"type": "text", "text": query},
    ]}
]

# Build the prompt using the tokenizer's chat template
input_text = tokenizer.apply_chat_template(messages, add_generation_prompt=True)

# Tokenize the image and text together
inputs = tokenizer(
    image,
    input_text,
    add_special_tokens=False,
    return_tensors="pt",
).to("cuda")

# Stream generated tokens to stdout as they are produced
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

# Generate the response
_ = model.generate(
    **inputs,
    streamer=text_streamer,
    max_new_tokens=128,
    use_cache=True,
    temperature=1.5,
    min_p=0.1,
)
```
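
If you prefer plain `transformers`, the snippet below is a minimal sketch following the standard mllama usage pattern. It assumes this repository hosts fully merged weights; if it only contains LoRA adapters, load the base model first and attach the adapter with `peft` instead.

```python
# Minimal sketch using plain transformers (no Unsloth).
# Assumes "saishshinde15/VisionAI" contains merged weights, not just a LoRA adapter.
import torch
import requests
from PIL import Image
from transformers import MllamaForConditionalGeneration, AutoProcessor

model_id = "saishshinde15/VisionAI"
model = MllamaForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # use float16 if bfloat16 is unsupported
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(model_id)

# Load the image from a URL
url = 'your image url'
image = Image.open(requests.get(url, stream=True).raw)

messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "What is this image about?"},
    ]}
]

# Build the prompt and preprocess image + text together
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(image, input_text, add_special_tokens=False, return_tensors="pt").to(model.device)

# Generate, then decode only the newly generated tokens
output = model.generate(**inputs, max_new_tokens=128)
print(processor.decode(output[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True))
```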