File size: 3,042 Bytes
cd0cb4c
 
b5f7c86
 
 
 
 
cd0cb4c
2cef2a1
cd0cb4c
4672b8a
66fedb0
cd0cb4c
 
66fedb0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c2b8c8e
66fedb0
 
 
 
 
 
 
 
b5f7c86
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
---
library_name: peft
language:
- ar
- en
base_model:
- ALLaM-AI/ALLaM-7B-Instruct-preview
---
## ArabVLM: Vision Language Model

- **Repository:** https://github.com/BigData-KSU/ArabVLM
- **Demo:** Coming soon.


## How to Get Started with the Model

### Install

1. Clone this repository and navigate to the ArabVLM folder

```
git clone https://github.com/BigData-KSU/ArabVLM.git
cd ArabVLM
```

2. Install Packages

```
pip install -r requirements.txt
```

---

### Inference

Use the code below to get started with the model.


```python 

from PIL import Image
import os
import torch
from vllm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from vllm.conversation import conv_templates, SeparatorStyle
from vllm.model.builder import load_pretrained_model
from vllm.utils import disable_torch_init
from vllm.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria


### Main model....
# Base LLM checkpoint and the ArabVLM weights that are loaded on top of it.
model_base = 'ALLaM-AI/ALLaM-7B-Instruct-preview'
model_path = '/BigData-KSU/ArabVLM'

# Key into `conv_templates` selecting the conversation/prompt format.
conv_mode = 'llava_llama_2'

# NOTE(review): presumably disables default torch weight initialisation to
# speed up loading -- confirm in vllm.utils.
disable_torch_init()

# Work with an absolute path from here on; the model name is derived from it.
model_path = os.path.abspath(model_path)
print('model path', model_path, sep='\n')
model_name = get_model_name_from_path(model_path)
print('model name', model_name, sep='\n')
print('model base', model_base, sep='\n')

# `processor` is indexed by modality later on (processor['image']).
tokenizer, model, processor, context_len = load_pretrained_model(
    model_path,
    model_base,
    model_name,
    device='cuda:0',
)


def chat_with_Vision_BioLLM(cur_prompt, image_name):
    """Generate the model's reply to a text prompt about a single image.

    Uses the module-level ``tokenizer``, ``model``, ``processor`` and
    ``conv_mode`` created by the setup code above.

    Args:
        cur_prompt: The user's question or instruction. The image
            placeholder token is prepended here, so pass plain text.
        image_name: Image location handed to ``PIL.Image.open``.

    Returns:
        The decoded text of the generated ids that follow the prompt
        positions (``output_ids[0, input_ids.shape[1]:]``).
    """
    # Load the image and turn it into a half-precision tensor on the model's device.
    image_mem = Image.open(image_name).convert('RGB')
    print(image_mem)
    image_processor = processor['image']
    image_tensor = image_processor.preprocess(image_mem, return_tensors='pt')['pixel_values']
    tensor = image_tensor.to(model.device, dtype=torch.float16)

    # Build a single-turn conversation: user message with the image token,
    # followed by an empty assistant slot for the model to fill.
    conv = conv_templates[conv_mode].copy()
    roles = conv.roles
    # The echoed text is the user's turn, so label it with the same role the
    # message is appended under (roles[0]), not the responder's role.
    print(f"{roles[0]}: {cur_prompt}")
    cur_prompt = DEFAULT_IMAGE_TOKEN + '\n' + cur_prompt
    conv.append_message(roles[0], cur_prompt)
    conv.append_message(roles[1], None)
    prompt = conv.get_prompt()

    # Keep the token ids on the same device as the model and the image tensor
    # instead of whatever the current default CUDA device happens to be.
    input_ids = tokenizer_image_token(
        prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt'
    ).unsqueeze(0).to(model.device)

    # Stop generation once the conversation separator is produced.
    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    stopping_criteria = KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)

    # Generation runs unconditionally: the image was just loaded above, and a
    # skipped branch would leave `output_ids` undefined at the decode step.
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=tensor,
            do_sample=False,
            max_new_tokens=1024,
            use_cache=True,
            stopping_criteria=[stopping_criteria],
        )

    # Drop the prompt positions so only the newly generated part is decoded.
    # NOTE(review): assumes generate() returns prompt ids followed by new ids -- confirm.
    return tokenizer.decode(output_ids[0, input_ids.shape[1]:])


if __name__ == "__main__":
    # Example run: ask the model (in Arabic) to describe an image in detail.
    # Replace the placeholder path with a real image file before running.
    image_name = 'path/to/image'
    cur_prompt = 'وصف الصورة بالتفصيل '
    outputs = chat_with_Vision_BioLLM(cur_prompt=cur_prompt, image_name=image_name)
    print('Model Response.....', outputs, sep='\n')




```
### Framework versions

- PEFT 0.4.0