---
library_name: peft
language:
- ar
- en
base_model:
- ALLaM-AI/ALLaM-7B-Instruct-preview
---
## ArabVLM: Vision Language Model
- **Repository:** https://github.com/BigData-KSU/ArabVLM
- **Demo:** Coming soon.
## How to Get Started with the Model
### Install
1. Clone this repository and navigate to the ArabVLM folder
```
git clone https://github.com/BigData-KSU/ArabVLM.git
cd ArabVLM
```
2. Install Packages
```
pip install -r requirements.txt
```
---
### Inference
Use the code below to get started with the model.
```python
from PIL import Image
import os
import torch
from vllm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from vllm.conversation import conv_templates, SeparatorStyle
from vllm.model.builder import load_pretrained_model
from vllm.utils import disable_torch_init
from vllm.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
# --- Model configuration and one-time loading -------------------------------
# NOTE(review): the leading "/" makes this a filesystem path rather than a
# Hub repo id — confirm that a local checkout at this location is intended.
model_path = '/BigData-KSU/ArabVLM'
model_base = 'ALLaM-AI/ALLaM-7B-Instruct-preview'
conv_mode = 'llava_llama_2'  # key into conv_templates used when chatting

# Presumably skips default weight initialisation to speed up loading —
# confirm against vllm.utils.
disable_torch_init()

# Resolve the checkpoint location to an absolute path and report the settings.
model_path = os.path.abspath(model_path)
print('model path', model_path, sep='\n')
model_name = get_model_name_from_path(model_path)
print('model name', model_name, sep='\n')
print('model base', model_base, sep='\n')

# Load everything onto the first CUDA device; `processor` is indexed by
# modality later on (processor['image']).
tokenizer, model, processor, context_len = load_pretrained_model(
    model_path,
    model_base,
    model_name,
    device='cuda:0',
)
def chat_with_Vision_BioLLM(cur_prompt: str, image_name: str) -> str:
    """Generate the model's reply to a text prompt about a single image.

    Relies on the module-level ``tokenizer``, ``model``, ``processor`` and
    ``conv_mode`` that are set up when the model is loaded.

    Args:
        cur_prompt: The user's question or instruction about the image.
        image_name: Path of the image file to open.

    Returns:
        The decoded text of the tokens that follow the prompt in the
        generated sequence. Decoding is done without
        ``skip_special_tokens``, so the text may still contain the stop
        string or other special tokens.
    """
    # convert() returns an independent, fully loaded copy, so the underlying
    # file handle can be released as soon as the conversion is done.
    with Image.open(image_name) as raw_image:
        image_mem = raw_image.convert('RGB')

    image_processor = processor['image']
    conv = conv_templates[conv_mode].copy()
    print(image_mem)

    image_tensor = image_processor.preprocess(image_mem, return_tensors='pt')['pixel_values']
    tensor = image_tensor.to(model.device, dtype=torch.float16)

    # Echo the prompt under the role it is appended with below (roles[0]).
    print(f"{conv.roles[0]}: {cur_prompt}")

    # Prepare the input text: prefix it with the image placeholder token.
    cur_prompt = DEFAULT_IMAGE_TOKEN + '\n' + cur_prompt
    conv.append_message(conv.roles[0], cur_prompt)
    # An empty second-role turn leaves the prompt open for the model's reply.
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    # Keep the token ids on the same device as the model and image tensor
    # instead of whatever the current default CUDA device happens to be.
    input_ids = tokenizer_image_token(
        prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt'
    ).unsqueeze(0).to(model.device)

    # Stop generating once the template's separator string is produced.
    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    keywords = [stop_str]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=tensor,
            do_sample=False,  # greedy decoding
            max_new_tokens=1024,
            use_cache=True,
            stopping_criteria=[stopping_criteria],
        )

    # Drop the prompt tokens and decode only what was generated after them.
    response = tokenizer.decode(output_ids[0, input_ids.shape[1]:])
    return response
if __name__ == "__main__":
    # Example run: ask (in Arabic) for a detailed description of one image.
    outputs = chat_with_Vision_BioLLM(
        cur_prompt='وصف الصورة بالتفصيل ',
        image_name='path/to/image',
    )
    print('Model Response.....', outputs, sep='\n')
```
- PEFT 0.4.0