---
library_name: peft
language:
- ar
- en
base_model:
- ALLaM-AI/ALLaM-7B-Instruct-preview
---

## ArabVLM: Vision Language Model

- **Repository:** https://github.com/BigData-KSU/ArabVLM
- **Demo:** Coming soon.

## How to Get Started with the Model

### Install

1. Clone this repository and navigate to the ArabVLM folder:

```
git clone https://github.com/BigData-KSU/ArabVLM.git
cd ArabVLM
```

2. Install the required packages:

```
pip install -r requirements.txt
```

---
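
The inference script below loads ArabVLM from a local path. If you have not downloaded the weights yet, here is a minimal sketch for fetching them from the Hugging Face Hub with `huggingface_hub` (the `local_dir` value is just an example; adjust it to your setup):

```python
# Minimal sketch: fetch the ArabVLM weights to a local folder before inference.
from huggingface_hub import snapshot_download

local_path = snapshot_download(
    repo_id='BigData-KSU/ArabVLM',  # this model's Hub repo
    local_dir='./ArabVLM-weights',  # example target folder; change as needed
)
print(f'Weights downloaded to: {local_path}')
```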

### Inference

Use the code below to get started with the model.

```python
from PIL import Image
import os
import torch
from vllm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from vllm.conversation import conv_templates, SeparatorStyle
from vllm.model.builder import load_pretrained_model
from vllm.utils import disable_torch_init
from vllm.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
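# Note: the `vllm` modules imported above are the LLaVA-style utilities bundled
# with the ArabVLM repository cloned during installation (not the standalone
# vLLM serving library), so run this script from the repository root.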

# Main model: ArabVLM weights on top of the ALLaM-7B-Instruct base
model_path = '/BigData-KSU/ArabVLM'  # local path to the downloaded ArabVLM weights
model_base = 'ALLaM-AI/ALLaM-7B-Instruct-preview'
conv_mode = 'llava_llama_2'

disable_torch_init()
model_path = os.path.abspath(model_path)
model_name = get_model_name_from_path(model_path)
print(f'Model path: {model_path}')
print(f'Model name: {model_name}')
print(f'Model base: {model_base}')

# Returns the tokenizer, the model, a dict of modality processors, and the context length
tokenizer, model, processor, context_len = load_pretrained_model(model_path, model_base, model_name, device='cuda:0')

def chat_with_ArabVLM(cur_prompt, image_name):
    # Load the image and grab the image processor from the processor dict
    image_mem = Image.open(image_name).convert('RGB')
    image_processor = processor['image']
    conv = conv_templates[conv_mode].copy()
    roles = conv.roles

    # Preprocess the image into a float16 tensor on the model's device
    image_tensor = image_processor.preprocess(image_mem, return_tensors='pt')['pixel_values']
    tensor = image_tensor.to(model.device, dtype=torch.float16)

    # Prepend the image token so the prompt references the image
    print(f"{roles[1]}: {cur_prompt}")
    cur_prompt = DEFAULT_IMAGE_TOKEN + '\n' + cur_prompt
    conv.append_message(conv.roles[0], cur_prompt)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    # Tokenize the prompt (mapping the image placeholder to IMAGE_TOKEN_INDEX)
    # and stop generation once the conversation separator is produced
    input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

    # Greedy decoding, passing the image tensor alongside the prompt tokens
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=tensor,
            do_sample=False,
            max_new_tokens=1024,
            use_cache=True,
            stopping_criteria=[stopping_criteria])

    # Decode only the newly generated tokens (everything after the input prompt)
    response = tokenizer.decode(output_ids[0, input_ids.shape[1]:], skip_special_tokens=True).strip()
    return response

if __name__ == "__main__":
    cur_prompt = 'وصف الصورة بالتفصيل'  # "Describe the image in detail"
    image_name = 'path/to/image'
    outputs = chat_with_ArabVLM(cur_prompt, image_name)
    print('Model Response.....')
    print(outputs)
```
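
The same function can answer free-form questions about an image in Arabic or English. A minimal follow-up sketch (the question and image path are placeholders):

```python
# Hypothetical visual question answering query reusing chat_with_ArabVLM from above.
question = 'ما هي الألوان الرئيسية في الصورة؟'  # "What are the main colors in the image?"
answer = chat_with_ArabVLM(question, 'path/to/image')
print(answer)
```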

### Framework versions

- PEFT 0.4.0