Update README.md

---
license: mit
---

```python
# Demo: zero-shot image-text matching with InternVL-14B-224px.
# Scores three images against three multilingual captions using the model's
# 'InternVL-C' and 'InternVL-G' scoring modes.
import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer, CLIPImageProcessor

MODEL_ID = 'OpenGVLab/InternVL-14B-224px'

# Load the model in bfloat16, move it to the GPU, and switch to eval mode.
model = AutoModel.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True).cuda().eval()

image_processor = CLIPImageProcessor.from_pretrained(MODEL_ID)

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID, use_fast=False, add_eos_token=True)
tokenizer.pad_token_id = 0  # set pad_token_id to 0

# Three candidate images, decoded as RGB.
images = [
    Image.open(f'./examples/image{idx}.jpg').convert('RGB')
    for idx in (1, 2, 3)
]

# Three candidate captions, each carrying the 'summarize:' prefix.
prefix = 'summarize:'
captions = [
    'a photo of a red panda',  # English
    '一张熊猫的照片',  # Chinese
    '二匹の猫の写真',  # Japanese
]
texts = [prefix + caption for caption in captions]

# Preprocess both modalities and move the tensors to the GPU.
pixel_values = image_processor(images=images, return_tensors='pt').pixel_values
pixel_values = pixel_values.to(torch.bfloat16).cuda()
input_ids = tokenizer(texts, return_tensors='pt', max_length=80,
                      truncation=True, padding='max_length').input_ids.cuda()

# InternVL-C scoring mode.
logits_per_image, logits_per_text = model(
    image=pixel_values, text=input_ids, mode='InternVL-C')
probs = logits_per_image.softmax(dim=-1)
# tensor([[9.9609e-01, 5.2185e-03, 6.0070e-08],
#         [2.2949e-02, 9.7656e-01, 5.9903e-06],
#         [3.2932e-06, 7.4863e-05, 1.0000e+00]], device='cuda:0',
#        dtype=torch.bfloat16, grad_fn=<SoftmaxBackward0>)

# InternVL-G scoring mode.
logits_per_image, logits_per_text = model(
    image=pixel_values, text=input_ids, mode='InternVL-G')
probs = logits_per_image.softmax(dim=-1)
# tensor([[9.9609e-01, 3.1738e-03, 3.6322e-08],
#         [8.6060e-03, 9.9219e-01, 2.8759e-06],
#         [1.7583e-06, 3.1233e-05, 1.0000e+00]], device='cuda:0',
#        dtype=torch.bfloat16, grad_fn=<SoftmaxBackward0>)
```