Add files using upload-large-folder tool
This view is limited to 50 files because it contains too many changes. See the raw diff for the complete change set.
- VLMEvalKit_old/InternVL/classification/configs/intern_vit_6b_1k_224.yaml +35 -0
- VLMEvalKit_old/InternVL/classification/configs/intern_vit_6b_1k_224_test_imagenet_a.yaml +36 -0
- VLMEvalKit_old/InternVL/classification/configs/intern_vit_6b_1k_224_test_imagenet_r.yaml +36 -0
- VLMEvalKit_old/InternVL/classification/configs/intern_vit_6b_1k_224_test_imagenet_real.yaml +36 -0
- VLMEvalKit_old/InternVL/classification/configs/intern_vit_6b_1k_224_test_imagenet_sketch.yaml +36 -0
- VLMEvalKit_old/InternVL/classification/configs/intern_vit_6b_1k_224_test_imagenetv2.yaml +36 -0
- VLMEvalKit_old/InternVL/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_a.yaml +36 -0
- VLMEvalKit_old/InternVL/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml +36 -0
- VLMEvalKit_old/InternVL/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_a.yaml +37 -0
- VLMEvalKit_old/InternVL/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_r.yaml +37 -0
- VLMEvalKit_old/InternVL/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_sketch.yaml +37 -0
- VLMEvalKit_old/InternVL/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenetv2.yaml +37 -0
- VLMEvalKit_old/InternVL/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_r.yaml +37 -0
- VLMEvalKit_old/InternVL/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_real.yaml +37 -0
- VLMEvalKit_old/InternVL/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_sketch.yaml +37 -0
- VLMEvalKit_old/InternVL/classification/dataset/build.py +332 -0
- VLMEvalKit_old/InternVL/classification/dataset/imagenet_a_r_indices.py +295 -0
- VLMEvalKit_old/InternVL/classification/dataset/imagenetv2.py +59 -0
- VLMEvalKit_old/InternVL/classification/dataset/samplers.py +116 -0
- VLMEvalKit_old/InternVL/classification/dataset/zipreader.py +102 -0
- VLMEvalKit_old/InternVL/classification/meta_data/22k_class_to_idx.json +0 -0
- VLMEvalKit_old/InternVL/classification/meta_data/imagenet_classes.json +1002 -0
- VLMEvalKit_old/InternVL/classification/meta_data/map22kto1k.txt +1000 -0
- VLMEvalKit_old/InternVL/classification/meta_data/real.json +0 -0
- VLMEvalKit_old/InternVL/classification/models/__init__.py +7 -0
- VLMEvalKit_old/InternVL/internvl_chat_llava/llava/eval/table/answer/answer_bard.jsonl +0 -0
- VLMEvalKit_old/InternVL/internvl_chat_llava/llava/eval/table/results/test_sqa_llava_13b_v0.json +0 -0
- VLMEvalKit_old/InternVL/internvl_chat_llava/llava/eval/table/results/test_sqa_llava_lcs_558k_sqa_12e_vicuna_v1_3_13b.json +0 -0
- VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/language_model/llava_llama.py +140 -0
- VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/language_model/llava_mpt.py +97 -0
- VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/language_model/mpt/adapt_tokenizer.py +41 -0
- VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/language_model/mpt/attention.py +300 -0
- VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/language_model/mpt/blocks.py +41 -0
- VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/language_model/mpt/configuration_mpt.py +118 -0
- VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/language_model/mpt/custom_embedding.py +11 -0
- VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/language_model/mpt/flash_attn_triton.py +484 -0
- VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/language_model/mpt/hf_prefixlm_converter.py +415 -0
- VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/language_model/mpt/meta_init_context.py +94 -0
- VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/language_model/mpt/modeling_mpt.py +331 -0
- VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/language_model/mpt/norm.py +56 -0
- VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/language_model/mpt/param_init_fns.py +181 -0
- VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/multimodal_encoder/builder.py +12 -0
- VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/multimodal_encoder/clip_encoder.py +134 -0
- VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/multimodal_encoder/eva_clip/configuration_evaclip.py +425 -0
- VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/multimodal_encoder/eva_clip/modeling_evaclip.py +1428 -0
- VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/multimodal_encoder/intern_vit_6b/configuration_intern_vit.py +117 -0
- VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/multimodal_encoder/intern_vit_6b/flash_attention.py +75 -0
- VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/multimodal_encoder/intern_vit_6b/modeling_intern_vit.py +354 -0
- VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/multimodal_encoder/internvl_14b/__init__.py +87 -0
- VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/multimodal_encoder/internvl_14b/configuration_intern_vit.py +117 -0
VLMEvalKit_old/InternVL/classification/configs/intern_vit_6b_1k_224.yaml
ADDED
@@ -0,0 +1,35 @@
+DATA:
+  IMG_ON_MEMORY: False
+  BATCH_SIZE: 128
+  TRANSFORM: 'build_transform_for_linear_probe'
+  DATA_PATH: './data/imagenet-1k'
+MODEL:
+  TYPE: intern_vit_6b
+  DROP_PATH_RATE: 0.0
+  INTERN_VIT_6B:
+    FREEZE_VIT: True
+    PATCH_SIZE: 14
+    PRETRAIN_SIZE: 224
+    QKV_BIAS: False
+    EMBED_DIM: 3200
+    NUM_HEADS: 25
+    MLP_RATIO: 4
+    INIT_VALUES: 0.1
+    QK_NORMALIZATION: True
+    DEPTH: 48
+    USE_FLASH_ATTN: True
+    PRETRAINED: "./pretrained/intern_vit_6b_224px.pth"
+    CLS_TARGET: 'cls_patch_concat'
+TRAIN:
+  EMA:
+    ENABLE: False
+    DECAY: 0.998
+  EPOCHS: 10
+  WARMUP_EPOCHS: 1
+  WEIGHT_DECAY: 0.0
+  BASE_LR: 0.1 # 512
+  WARMUP_LR: .0
+  MIN_LR: .0
+  LR_LAYER_DECAY: false
+  OPTIMIZER:
+    NAME: 'sgd'
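
Note (editor's sketch, not part of this commit): the configs above follow a yacs/Swin-style key tree. A minimal way to inspect one of them with plain PyYAML, assuming the working directory is VLMEvalKit_old/InternVL/classification and that the key nesting shown in the reconstruction above is accurate:

import yaml

with open('configs/intern_vit_6b_1k_224.yaml') as f:
    cfg = yaml.safe_load(f)

print(cfg['DATA']['BATCH_SIZE'])               # 128
print(cfg['MODEL']['INTERN_VIT_6B']['DEPTH'])  # 48
print(cfg['TRAIN']['OPTIMIZER']['NAME'])       # sgd
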
VLMEvalKit_old/InternVL/classification/configs/intern_vit_6b_1k_224_test_imagenet_a.yaml
ADDED
@@ -0,0 +1,36 @@
+DATA:
+  IMG_ON_MEMORY: False
+  BATCH_SIZE: 128
+  DATASET: 'imagenet_a'
+  TRANSFORM: 'build_transform_for_linear_probe'
+  DATA_PATH: './data/imagenet-a'
+MODEL:
+  TYPE: intern_vit_6b
+  DROP_PATH_RATE: 0.0
+  INTERN_VIT_6B:
+    FREEZE_VIT: True
+    PATCH_SIZE: 14
+    PRETRAIN_SIZE: 224
+    QKV_BIAS: False
+    EMBED_DIM: 3200
+    NUM_HEADS: 25
+    MLP_RATIO: 4
+    INIT_VALUES: 0.1
+    QK_NORMALIZATION: True
+    DEPTH: 48
+    USE_FLASH_ATTN: True
+    PRETRAINED: "./pretrained/intern_vit_6b_224px.pth"
+    CLS_TARGET: 'cls_patch_concat'
+TRAIN:
+  EMA:
+    ENABLE: False
+    DECAY: 0.998
+  EPOCHS: 10
+  WARMUP_EPOCHS: 1
+  WEIGHT_DECAY: 0.0
+  BASE_LR: 0.1 # 512
+  WARMUP_LR: .0
+  MIN_LR: .0
+  LR_LAYER_DECAY: false
+  OPTIMIZER:
+    NAME: 'sgd'
VLMEvalKit_old/InternVL/classification/configs/intern_vit_6b_1k_224_test_imagenet_r.yaml
ADDED
@@ -0,0 +1,36 @@
+DATA:
+  IMG_ON_MEMORY: False
+  BATCH_SIZE: 128
+  DATASET: 'imagenet_r'
+  TRANSFORM: 'build_transform_for_linear_probe'
+  DATA_PATH: './data/imagenet-r'
+MODEL:
+  TYPE: intern_vit_6b
+  DROP_PATH_RATE: 0.0
+  INTERN_VIT_6B:
+    FREEZE_VIT: True
+    PATCH_SIZE: 14
+    PRETRAIN_SIZE: 224
+    QKV_BIAS: False
+    EMBED_DIM: 3200
+    NUM_HEADS: 25
+    MLP_RATIO: 4
+    INIT_VALUES: 0.1
+    QK_NORMALIZATION: True
+    DEPTH: 48
+    USE_FLASH_ATTN: True
+    PRETRAINED: "./pretrained/intern_vit_6b_224px.pth"
+    CLS_TARGET: 'cls_patch_concat'
+TRAIN:
+  EMA:
+    ENABLE: False
+    DECAY: 0.998
+  EPOCHS: 10
+  WARMUP_EPOCHS: 1
+  WEIGHT_DECAY: 0.0
+  BASE_LR: 0.1 # 512
+  WARMUP_LR: .0
+  MIN_LR: .0
+  LR_LAYER_DECAY: false
+  OPTIMIZER:
+    NAME: 'sgd'
VLMEvalKit_old/InternVL/classification/configs/intern_vit_6b_1k_224_test_imagenet_real.yaml
ADDED
@@ -0,0 +1,36 @@
+DATA:
+  IMG_ON_MEMORY: False
+  BATCH_SIZE: 128
+  DATASET: 'imagenet-real'
+  TRANSFORM: 'build_transform_for_linear_probe'
+  DATA_PATH: './data/imagenet-1k'
+MODEL:
+  TYPE: intern_vit_6b
+  DROP_PATH_RATE: 0.0
+  INTERN_VIT_6B:
+    FREEZE_VIT: True
+    PATCH_SIZE: 14
+    PRETRAIN_SIZE: 224
+    QKV_BIAS: False
+    EMBED_DIM: 3200
+    NUM_HEADS: 25
+    MLP_RATIO: 4
+    INIT_VALUES: 0.1
+    QK_NORMALIZATION: True
+    DEPTH: 48
+    USE_FLASH_ATTN: True
+    PRETRAINED: "./pretrained/intern_vit_6b_224px.pth"
+    CLS_TARGET: 'cls_patch_concat'
+TRAIN:
+  EMA:
+    ENABLE: False
+    DECAY: 0.998
+  EPOCHS: 10
+  WARMUP_EPOCHS: 1
+  WEIGHT_DECAY: 0.0
+  BASE_LR: 0.1 # 512
+  WARMUP_LR: .0
+  MIN_LR: .0
+  LR_LAYER_DECAY: false
+  OPTIMIZER:
+    NAME: 'sgd'
VLMEvalKit_old/InternVL/classification/configs/intern_vit_6b_1k_224_test_imagenet_sketch.yaml
ADDED
@@ -0,0 +1,36 @@
+DATA:
+  IMG_ON_MEMORY: False
+  BATCH_SIZE: 128
+  DATASET: 'imagenet_sketch'
+  TRANSFORM: 'build_transform_for_linear_probe'
+  DATA_PATH: './data/imagenet-sketch'
+MODEL:
+  TYPE: intern_vit_6b
+  DROP_PATH_RATE: 0.0
+  INTERN_VIT_6B:
+    FREEZE_VIT: True
+    PATCH_SIZE: 14
+    PRETRAIN_SIZE: 224
+    QKV_BIAS: False
+    EMBED_DIM: 3200
+    NUM_HEADS: 25
+    MLP_RATIO: 4
+    INIT_VALUES: 0.1
+    QK_NORMALIZATION: True
+    DEPTH: 48
+    USE_FLASH_ATTN: True
+    PRETRAINED: "./pretrained/intern_vit_6b_224px.pth"
+    CLS_TARGET: 'cls_patch_concat'
+TRAIN:
+  EMA:
+    ENABLE: False
+    DECAY: 0.998
+  EPOCHS: 10
+  WARMUP_EPOCHS: 1
+  WEIGHT_DECAY: 0.0
+  BASE_LR: 0.1 # 512
+  WARMUP_LR: .0
+  MIN_LR: .0
+  LR_LAYER_DECAY: false
+  OPTIMIZER:
+    NAME: 'sgd'
VLMEvalKit_old/InternVL/classification/configs/intern_vit_6b_1k_224_test_imagenetv2.yaml
ADDED
@@ -0,0 +1,36 @@
+DATA:
+  IMG_ON_MEMORY: False
+  BATCH_SIZE: 128
+  DATASET: 'imagenetv2'
+  TRANSFORM: 'build_transform_for_linear_probe'
+  DATA_PATH: './data/imagenetv2'
+MODEL:
+  TYPE: intern_vit_6b
+  DROP_PATH_RATE: 0.0
+  INTERN_VIT_6B:
+    FREEZE_VIT: True
+    PATCH_SIZE: 14
+    PRETRAIN_SIZE: 224
+    QKV_BIAS: False
+    EMBED_DIM: 3200
+    NUM_HEADS: 25
+    MLP_RATIO: 4
+    INIT_VALUES: 0.1
+    QK_NORMALIZATION: True
+    DEPTH: 48
+    USE_FLASH_ATTN: True
+    PRETRAINED: "./pretrained/intern_vit_6b_224px.pth"
+    CLS_TARGET: 'cls_patch_concat'
+TRAIN:
+  EMA:
+    ENABLE: False
+    DECAY: 0.998
+  EPOCHS: 10
+  WARMUP_EPOCHS: 1
+  WEIGHT_DECAY: 0.0
+  BASE_LR: 0.1 # 512
+  WARMUP_LR: .0
+  MIN_LR: .0
+  LR_LAYER_DECAY: false
+  OPTIMIZER:
+    NAME: 'sgd'
VLMEvalKit_old/InternVL/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224_64gpu_imagenet_a.yaml
ADDED
@@ -0,0 +1,36 @@
+DATA:
+  IMG_ON_MEMORY: False
+  BATCH_SIZE: 16 # single GPU batch size
+  DATASET: 'imagenet_a'
+  TRANSFORM: 'build_transform_for_linear_probe'
+  DATA_PATH: './data/imagenet-a'
+MODEL:
+  TYPE: intern_vit_6b
+  DROP_PATH_RATE: 0.0
+  INTERN_VIT_6B:
+    FREEZE_VIT: True
+    PATCH_SIZE: 14
+    PRETRAIN_SIZE: 224
+    QKV_BIAS: False
+    EMBED_DIM: 3200
+    NUM_HEADS: 25
+    MLP_RATIO: 4
+    INIT_VALUES: 0.1
+    QK_NORMALIZATION: True
+    DEPTH: 48
+    USE_FLASH_ATTN: True
+    PRETRAINED: "./pretrained/intern_vit_6b_224px.pth"
+    CLS_TARGET: 'cls_patch_concat'
+TRAIN:
+  EMA:
+    ENABLE: True
+    DECAY: 0.998
+  EPOCHS: 10
+  WARMUP_EPOCHS: 1
+  WEIGHT_DECAY: 0.0
+  BASE_LR: 0.1 # 512
+  WARMUP_LR: .0
+  MIN_LR: .0
+  LR_LAYER_DECAY: false
+  OPTIMIZER:
+    NAME: 'sgd'
VLMEvalKit_old/InternVL/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu.yaml
ADDED
@@ -0,0 +1,36 @@
+DATA:
+  IMG_ON_MEMORY: False
+  BATCH_SIZE: 16 # single GPU batch size
+  TRANSFORM: 'build_transform_for_linear_probe'
+  DATA_PATH: './data/imagenet-1k'
+  IMG_SIZE: 448
+MODEL:
+  TYPE: intern_vit_6b
+  DROP_PATH_RATE: 0.0
+  INTERN_VIT_6B:
+    FREEZE_VIT: True
+    PATCH_SIZE: 14
+    PRETRAIN_SIZE: 224
+    QKV_BIAS: False
+    EMBED_DIM: 3200
+    NUM_HEADS: 25
+    MLP_RATIO: 4
+    INIT_VALUES: 0.1
+    QK_NORMALIZATION: True
+    DEPTH: 48
+    USE_FLASH_ATTN: True
+    PRETRAINED: "./pretrained/intern_vit_6b_224px.pth"
+    CLS_TARGET: 'cls_patch_concat'
+TRAIN:
+  EMA:
+    ENABLE: True
+    DECAY: 0.998
+  EPOCHS: 10
+  WARMUP_EPOCHS: 1
+  WEIGHT_DECAY: 0.0
+  BASE_LR: 0.1 # 512
+  WARMUP_LR: .0
+  MIN_LR: .0
+  LR_LAYER_DECAY: false
+  OPTIMIZER:
+    NAME: 'sgd'
VLMEvalKit_old/InternVL/classification/configs/linear_probing/linear_probing_intern_vit_6b_224px_in1k_224to448_64gpu_imagenet_a.yaml
ADDED
@@ -0,0 +1,37 @@
+DATA:
+  IMG_ON_MEMORY: False
+  BATCH_SIZE: 16 # single GPU batch size
+  DATASET: 'imagenet_a'
+  TRANSFORM: 'build_transform_for_linear_probe'
+  DATA_PATH: './data/imagenet-a'
+  IMG_SIZE: 448
+MODEL:
+  TYPE: intern_vit_6b
+  DROP_PATH_RATE: 0.0
+  INTERN_VIT_6B:
+    FREEZE_VIT: True
+    PATCH_SIZE: 14
+    PRETRAIN_SIZE: 224
+    QKV_BIAS: False
+    EMBED_DIM: 3200
+    NUM_HEADS: 25
+    MLP_RATIO: 4
+    INIT_VALUES: 0.1
+    QK_NORMALIZATION: True
+    DEPTH: 48
+    USE_FLASH_ATTN: True
+    PRETRAINED: "./pretrained/intern_vit_6b_224px.pth"
+    CLS_TARGET: 'cls_patch_concat'
+TRAIN:
+  EMA:
+    ENABLE: True
+    DECAY: 0.998
+  EPOCHS: 10
+  WARMUP_EPOCHS: 1
+  WEIGHT_DECAY: 0.0
+  BASE_LR: 0.1 # 512
+  WARMUP_LR: .0
+  MIN_LR: .0
+  LR_LAYER_DECAY: false
+  OPTIMIZER:
+    NAME: 'sgd'
VLMEvalKit_old/InternVL/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_0_in1k_448_64gpu_imagenet_r.yaml
ADDED
@@ -0,0 +1,37 @@
+DATA:
+  IMG_ON_MEMORY: False
+  BATCH_SIZE: 16 # single GPU batch size
+  DATASET: 'imagenet_r'
+  TRANSFORM: 'build_transform_for_linear_probe'
+  DATA_PATH: './data/imagenet-r'
+  IMG_SIZE: 448
+MODEL:
+  TYPE: intern_vit_6b
+  DROP_PATH_RATE: 0.0
+  INTERN_VIT_6B:
+    FREEZE_VIT: True
+    PATCH_SIZE: 14
+    PRETRAIN_SIZE: 448
+    QKV_BIAS: False
+    EMBED_DIM: 3200
+    NUM_HEADS: 25
+    MLP_RATIO: 4
+    INIT_VALUES: 0.1
+    QK_NORMALIZATION: True
+    DEPTH: 45
+    USE_FLASH_ATTN: True
+    PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_0.pth"
+    CLS_TARGET: 'cls_patch_concat'
+TRAIN:
+  EMA:
+    ENABLE: True
+    DECAY: 0.998
+  EPOCHS: 10
+  WARMUP_EPOCHS: 1
+  WEIGHT_DECAY: 0.0
+  BASE_LR: 0.1 # 512
+  WARMUP_LR: .0
+  MIN_LR: .0
+  LR_LAYER_DECAY: false
+  OPTIMIZER:
+    NAME: 'sgd'
VLMEvalKit_old/InternVL/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenet_sketch.yaml
ADDED
@@ -0,0 +1,37 @@
+DATA:
+  IMG_ON_MEMORY: False
+  BATCH_SIZE: 16 # single GPU batch size
+  DATASET: 'imagenet_sketch'
+  TRANSFORM: 'build_transform_for_linear_probe'
+  DATA_PATH: './data/imagenet-sketch'
+  IMG_SIZE: 448
+MODEL:
+  TYPE: intern_vit_6b
+  DROP_PATH_RATE: 0.0
+  INTERN_VIT_6B:
+    FREEZE_VIT: True
+    PATCH_SIZE: 14
+    PRETRAIN_SIZE: 448
+    QKV_BIAS: False
+    EMBED_DIM: 3200
+    NUM_HEADS: 25
+    MLP_RATIO: 4
+    INIT_VALUES: 0.1
+    QK_NORMALIZATION: True
+    DEPTH: 45
+    USE_FLASH_ATTN: True
+    PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth"
+    CLS_TARGET: 'cls_patch_concat'
+TRAIN:
+  EMA:
+    ENABLE: True
+    DECAY: 0.998
+  EPOCHS: 10
+  WARMUP_EPOCHS: 1
+  WEIGHT_DECAY: 0.0
+  BASE_LR: 0.1 # 512
+  WARMUP_LR: .0
+  MIN_LR: .0
+  LR_LAYER_DECAY: false
+  OPTIMIZER:
+    NAME: 'sgd'
VLMEvalKit_old/InternVL/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v1_2_in1k_448_64gpu_imagenetv2.yaml
ADDED
@@ -0,0 +1,37 @@
+DATA:
+  IMG_ON_MEMORY: False
+  BATCH_SIZE: 16 # single GPU batch size
+  DATASET: 'imagenetv2'
+  TRANSFORM: 'build_transform_for_linear_probe'
+  DATA_PATH: './data/imagenetv2'
+  IMG_SIZE: 448
+MODEL:
+  TYPE: intern_vit_6b
+  DROP_PATH_RATE: 0.0
+  INTERN_VIT_6B:
+    FREEZE_VIT: True
+    PATCH_SIZE: 14
+    PRETRAIN_SIZE: 448
+    QKV_BIAS: False
+    EMBED_DIM: 3200
+    NUM_HEADS: 25
+    MLP_RATIO: 4
+    INIT_VALUES: 0.1
+    QK_NORMALIZATION: True
+    DEPTH: 45
+    USE_FLASH_ATTN: True
+    PRETRAINED: "./pretrained/intern_vit_6b_448px_v1_2.pth"
+    CLS_TARGET: 'cls_patch_concat'
+TRAIN:
+  EMA:
+    ENABLE: True
+    DECAY: 0.998
+  EPOCHS: 10
+  WARMUP_EPOCHS: 1
+  WEIGHT_DECAY: 0.0
+  BASE_LR: 0.1 # 512
+  WARMUP_LR: .0
+  MIN_LR: .0
+  LR_LAYER_DECAY: false
+  OPTIMIZER:
+    NAME: 'sgd'
VLMEvalKit_old/InternVL/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_r.yaml
ADDED
@@ -0,0 +1,37 @@
+DATA:
+  IMG_ON_MEMORY: False
+  BATCH_SIZE: 16 # single GPU batch size
+  DATASET: 'imagenet_r'
+  TRANSFORM: 'build_transform_for_linear_probe'
+  DATA_PATH: './data/imagenet-r'
+  IMG_SIZE: 448
+MODEL:
+  TYPE: intern_vit_6b
+  DROP_PATH_RATE: 0.0
+  INTERN_VIT_6B:
+    FREEZE_VIT: True
+    PATCH_SIZE: 14
+    PRETRAIN_SIZE: 448
+    QKV_BIAS: False
+    EMBED_DIM: 3200
+    NUM_HEADS: 25
+    MLP_RATIO: 4
+    INIT_VALUES: 0.1
+    QK_NORMALIZATION: True
+    DEPTH: 45
+    USE_FLASH_ATTN: True
+    PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth"
+    CLS_TARGET: 'cls_patch_concat'
+TRAIN:
+  EMA:
+    ENABLE: True
+    DECAY: 0.998
+  EPOCHS: 10
+  WARMUP_EPOCHS: 1
+  WEIGHT_DECAY: 0.0
+  BASE_LR: 0.1 # 512
+  WARMUP_LR: .0
+  MIN_LR: .0
+  LR_LAYER_DECAY: false
+  OPTIMIZER:
+    NAME: 'sgd'
VLMEvalKit_old/InternVL/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_real.yaml
ADDED
@@ -0,0 +1,37 @@
+DATA:
+  IMG_ON_MEMORY: False
+  BATCH_SIZE: 16 # single GPU batch size
+  DATASET: 'imagenet-real'
+  TRANSFORM: 'build_transform_for_linear_probe'
+  DATA_PATH: './data/imagenet-1k'
+  IMG_SIZE: 448
+MODEL:
+  TYPE: intern_vit_6b
+  DROP_PATH_RATE: 0.0
+  INTERN_VIT_6B:
+    FREEZE_VIT: True
+    PATCH_SIZE: 14
+    PRETRAIN_SIZE: 448
+    QKV_BIAS: False
+    EMBED_DIM: 3200
+    NUM_HEADS: 25
+    MLP_RATIO: 4
+    INIT_VALUES: 0.1
+    QK_NORMALIZATION: True
+    DEPTH: 45
+    USE_FLASH_ATTN: True
+    PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth"
+    CLS_TARGET: 'cls_patch_concat'
+TRAIN:
+  EMA:
+    ENABLE: True
+    DECAY: 0.998
+  EPOCHS: 10
+  WARMUP_EPOCHS: 1
+  WEIGHT_DECAY: 0.0
+  BASE_LR: 0.1 # 512
+  WARMUP_LR: .0
+  MIN_LR: .0
+  LR_LAYER_DECAY: false
+  OPTIMIZER:
+    NAME: 'sgd'
VLMEvalKit_old/InternVL/classification/configs/linear_probing/linear_probing_intern_vit_6b_448px_v2_5_in1k_448_64gpu_imagenet_sketch.yaml
ADDED
@@ -0,0 +1,37 @@
+DATA:
+  IMG_ON_MEMORY: False
+  BATCH_SIZE: 16 # single GPU batch size
+  DATASET: 'imagenet_sketch'
+  TRANSFORM: 'build_transform_for_linear_probe'
+  DATA_PATH: './data/imagenet-sketch'
+  IMG_SIZE: 448
+MODEL:
+  TYPE: intern_vit_6b
+  DROP_PATH_RATE: 0.0
+  INTERN_VIT_6B:
+    FREEZE_VIT: True
+    PATCH_SIZE: 14
+    PRETRAIN_SIZE: 448
+    QKV_BIAS: False
+    EMBED_DIM: 3200
+    NUM_HEADS: 25
+    MLP_RATIO: 4
+    INIT_VALUES: 0.1
+    QK_NORMALIZATION: True
+    DEPTH: 45
+    USE_FLASH_ATTN: True
+    PRETRAINED: "./pretrained/intern_vit_6b_448px_v2_5.pth"
+    CLS_TARGET: 'cls_patch_concat'
+TRAIN:
+  EMA:
+    ENABLE: True
+    DECAY: 0.998
+  EPOCHS: 10
+  WARMUP_EPOCHS: 1
+  WEIGHT_DECAY: 0.0
+  BASE_LR: 0.1 # 512
+  WARMUP_LR: .0
+  MIN_LR: .0
+  LR_LAYER_DECAY: false
+  OPTIMIZER:
+    NAME: 'sgd'
VLMEvalKit_old/InternVL/classification/dataset/build.py
ADDED
@@ -0,0 +1,332 @@
+# --------------------------------------------------------
+# InternVL
+# Copyright (c) 2023 OpenGVLab
+# Licensed under The MIT License [see LICENSE for details]
+# --------------------------------------------------------
+
+import os
+
+import numpy as np
+import torch
+import torch.distributed as dist
+from timm.data import Mixup, create_transform
+from torchvision import transforms
+from torchvision.datasets import ImageFolder
+
+from .cached_image_folder import ImageCephDataset
+from .samplers import NodeDistributedSampler, SubsetRandomSampler
+
+try:
+    from torchvision.transforms import InterpolationMode
+
+    def _pil_interp(method):
+        if method == 'bicubic':
+            return InterpolationMode.BICUBIC
+        elif method == 'lanczos':
+            return InterpolationMode.LANCZOS
+        elif method == 'hamming':
+            return InterpolationMode.HAMMING
+        else:
+            return InterpolationMode.BILINEAR
+except:
+    from timm.data.transforms import _pil_interp
+
+
+class TTA(torch.nn.Module):
+
+    def __init__(self, size, scales=[1.0, 1.05, 1.1]):
+        super().__init__()
+        self.size = size
+        self.scales = scales
+
+    def forward(self, img):
+        out = []
+        cc = transforms.CenterCrop(self.size)
+        for scale in self.scales:
+            size_ = int(scale * self.size)
+            rs = transforms.Resize(size_, interpolation=_pil_interp('bicubic'))
+            img_ = rs(img)
+            img_ = cc(img_)
+            out.append(img_)
+
+        return out
+
+    def __repr__(self) -> str:
+        return f'{self.__class__.__name__}(size={self.size}, scale={self.scales})'
+
+
+def build_loader(config):
+    config.defrost()
+    dataset_train, config.MODEL.NUM_CLASSES = build_dataset('train', config=config)
+    config.freeze()
+    print(f'local rank {config.LOCAL_RANK} / global rank {dist.get_rank()}'
+          'successfully build train dataset')
+
+    dataset_val, _ = build_dataset('val', config=config)
+    print(f'local rank {config.LOCAL_RANK} / global rank {dist.get_rank()}'
+          'successfully build val dataset')
+
+    dataset_test, _ = build_dataset('test', config=config)
+    print(f'local rank {config.LOCAL_RANK} / global rank {dist.get_rank()}'
+          'successfully build test dataset')
+
+    num_tasks = dist.get_world_size()
+    global_rank = dist.get_rank()
+
+    if dataset_train is not None:
+        if config.DATA.IMG_ON_MEMORY:
+            sampler_train = NodeDistributedSampler(dataset_train)
+        else:
+            if config.DATA.ZIP_MODE and config.DATA.CACHE_MODE == 'part':
+                indices = np.arange(dist.get_rank(), len(dataset_train), dist.get_world_size())
+                sampler_train = SubsetRandomSampler(indices)
+            else:
+                sampler_train = torch.utils.data.DistributedSampler(
+                    dataset_train,
+                    num_replicas=num_tasks,
+                    rank=global_rank,
+                    shuffle=True)
+
+    if dataset_val is not None:
+        if config.TEST.SEQUENTIAL:
+            sampler_val = torch.utils.data.SequentialSampler(dataset_val)
+        else:
+            sampler_val = torch.utils.data.distributed.DistributedSampler(dataset_val, shuffle=False)
+
+    if dataset_test is not None:
+        if config.TEST.SEQUENTIAL:
+            sampler_test = torch.utils.data.SequentialSampler(dataset_test)
+        else:
+            sampler_test = torch.utils.data.distributed.DistributedSampler(dataset_test, shuffle=False)
+
+    data_loader_train = torch.utils.data.DataLoader(
+        dataset_train,
+        sampler=sampler_train,
+        batch_size=config.DATA.BATCH_SIZE,
+        num_workers=config.DATA.NUM_WORKERS,
+        pin_memory=config.DATA.PIN_MEMORY,
+        drop_last=True,
+        persistent_workers=True) if dataset_train is not None else None
+
+    data_loader_val = torch.utils.data.DataLoader(
+        dataset_val,
+        sampler=sampler_val,
+        batch_size=config.DATA.BATCH_SIZE,
+        shuffle=False,
+        num_workers=config.DATA.NUM_WORKERS,
+        pin_memory=config.DATA.PIN_MEMORY,
+        drop_last=False,
+        persistent_workers=True) if dataset_val is not None else None
+
+    data_loader_test = torch.utils.data.DataLoader(
+        dataset_test,
+        sampler=sampler_test,
+        batch_size=config.DATA.BATCH_SIZE,
+        shuffle=False,
+        num_workers=config.DATA.NUM_WORKERS,
+        pin_memory=config.DATA.PIN_MEMORY,
+        drop_last=False,
+        persistent_workers=True) if dataset_test is not None else None
+
+    # setup mixup / cutmix
+    mixup_fn = None
+    mixup_active = config.AUG.MIXUP > 0 or config.AUG.CUTMIX > 0. or config.AUG.CUTMIX_MINMAX is not None
+    if mixup_active:
+        mixup_fn = Mixup(mixup_alpha=config.AUG.MIXUP,
+                         cutmix_alpha=config.AUG.CUTMIX,
+                         cutmix_minmax=config.AUG.CUTMIX_MINMAX,
+                         prob=config.AUG.MIXUP_PROB,
+                         switch_prob=config.AUG.MIXUP_SWITCH_PROB,
+                         mode=config.AUG.MIXUP_MODE,
+                         label_smoothing=config.MODEL.LABEL_SMOOTHING,
+                         num_classes=config.MODEL.NUM_CLASSES)
+
+    return dataset_train, dataset_val, dataset_test, data_loader_train, \
+        data_loader_val, data_loader_test, mixup_fn
+
+
+def build_loader2(config):
+    config.defrost()
+    dataset_train, config.MODEL.NUM_CLASSES = build_dataset('train', config=config)
+    config.freeze()
+    dataset_val, _ = build_dataset('val', config=config)
+    dataset_test, _ = build_dataset('test', config=config)
+
+    data_loader_train = torch.utils.data.DataLoader(
+        dataset_train,
+        shuffle=True,
+        batch_size=config.DATA.BATCH_SIZE,
+        num_workers=config.DATA.NUM_WORKERS,
+        pin_memory=config.DATA.PIN_MEMORY,
+        drop_last=True,
+        persistent_workers=True) if dataset_train is not None else None
+
+    data_loader_val = torch.utils.data.DataLoader(
+        dataset_val,
+        batch_size=config.DATA.BATCH_SIZE,
+        shuffle=False,
+        num_workers=config.DATA.NUM_WORKERS,
+        pin_memory=config.DATA.PIN_MEMORY,
+        drop_last=False,
+        persistent_workers=True) if dataset_val is not None else None
+
+    data_loader_test = torch.utils.data.DataLoader(
+        dataset_test,
+        batch_size=config.DATA.BATCH_SIZE,
+        shuffle=False,
+        num_workers=config.DATA.NUM_WORKERS,
+        pin_memory=config.DATA.PIN_MEMORY,
+        drop_last=False,
+        persistent_workers=True) if dataset_test is not None else None
+
+    # setup mixup / cutmix
+    mixup_fn = None
+    mixup_active = config.AUG.MIXUP > 0 or config.AUG.CUTMIX > 0. or config.AUG.CUTMIX_MINMAX is not None
+    if mixup_active:
+        mixup_fn = Mixup(mixup_alpha=config.AUG.MIXUP,
+                         cutmix_alpha=config.AUG.CUTMIX,
+                         cutmix_minmax=config.AUG.CUTMIX_MINMAX,
+                         prob=config.AUG.MIXUP_PROB,
+                         switch_prob=config.AUG.MIXUP_SWITCH_PROB,
+                         mode=config.AUG.MIXUP_MODE,
+                         label_smoothing=config.MODEL.LABEL_SMOOTHING,
+                         num_classes=config.MODEL.NUM_CLASSES)
+
+    return dataset_train, dataset_val, dataset_test, data_loader_train, \
+        data_loader_val, data_loader_test, mixup_fn
+
+
+def build_dataset(split, config):
+    if config.DATA.TRANSFORM == 'build_transform':
+        transform = build_transform(split == 'train', config)
+    elif config.DATA.TRANSFORM == 'build_transform_for_linear_probe':
+        transform = build_transform_for_linear_probe(split == 'train', config)
+    else:
+        raise NotImplementedError
+    print(split, transform)
+    dataset = None
+    nb_classes = None
+    prefix = split
+    if config.DATA.DATASET == 'imagenet' or config.DATA.DATASET == 'imagenet-real':
+        if prefix == 'train' and not config.EVAL_MODE:
+            root = os.path.join(config.DATA.DATA_PATH, 'train')
+            dataset = ImageCephDataset(root, 'train',
+                                       transform=transform,
+                                       on_memory=config.DATA.IMG_ON_MEMORY)
+        elif prefix == 'val':
+            root = os.path.join(config.DATA.DATA_PATH, 'val')
+            dataset = ImageCephDataset(root, 'val', transform=transform)
+        nb_classes = 1000
+    elif config.DATA.DATASET == 'imagenet22K':
+        if prefix == 'train':
+            if not config.EVAL_MODE:
+                root = config.DATA.DATA_PATH
+                dataset = ImageCephDataset(root, 'train',
+                                           transform=transform,
+                                           on_memory=config.DATA.IMG_ON_MEMORY)
+            nb_classes = 21841
+        elif prefix == 'val':
+            root = os.path.join(config.DATA.DATA_PATH, 'val')
+            dataset = ImageCephDataset(root, 'val', transform=transform)
+            nb_classes = 1000
+    elif config.DATA.DATASET == 'imagenetv2':
+        from .imagenetv2 import ImageNetV2Dataset
+        if prefix == 'train' and not config.EVAL_MODE:
+            print(f'Only test split available for {config.DATA.DATASET}')
+        else:
+            dataset = ImageNetV2Dataset(variant='matched-frequency',
+                                        transform=transform,
+                                        location=config.DATA.DATA_PATH)
+        nb_classes = 1000
+    elif config.DATA.DATASET == 'imagenet_sketch':
+        if prefix == 'train' and not config.EVAL_MODE:
+            print(f'Only test split available for {config.DATA.DATASET}')
+        else:
+            dataset = ImageFolder(root=config.DATA.DATA_PATH, transform=transform)
+        nb_classes = 1000
+    elif config.DATA.DATASET == 'imagenet_a':
+        if prefix == 'train' and not config.EVAL_MODE:
+            print(f'Only test split available for {config.DATA.DATASET}')
+        else:
+            dataset = ImageFolder(root=config.DATA.DATA_PATH, transform=transform)
+        nb_classes = 1000 # actual number of classes is 200
+    elif config.DATA.DATASET == 'imagenet_r':
+        if prefix == 'train' and not config.EVAL_MODE:
+            print(f'Only test split available for {config.DATA.DATASET}')
+        else:
+            dataset = ImageFolder(root=config.DATA.DATA_PATH, transform=transform)
+        nb_classes = 1000 # actual number of classes is 200
+    else:
+        raise NotImplementedError(
+            f'build_dataset does support {config.DATA.DATASET}')
+
+    return dataset, nb_classes
+
+
+def build_transform_for_linear_probe(is_train, config):
+    # linear probe: weak augmentation
+    if is_train:
+        transform = transforms.Compose([
+            transforms.RandomResizedCrop(
+                config.DATA.IMG_SIZE, interpolation=transforms.InterpolationMode.BICUBIC),
+            transforms.RandomHorizontalFlip(),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=config.AUG.MEAN, std=config.AUG.STD)
+        ])
+    else:
+        transform = transforms.Compose([
+            transforms.Resize(
+                config.DATA.IMG_SIZE, interpolation=transforms.InterpolationMode.BICUBIC),
+            transforms.CenterCrop(config.DATA.IMG_SIZE),
+            transforms.ToTensor(),
+            transforms.Normalize(mean=config.AUG.MEAN, std=config.AUG.STD)
+        ])
+    return transform
+
+
+def build_transform(is_train, config):
+    resize_im = config.DATA.IMG_SIZE > 32
+    if is_train:
+        # this should always dispatch to transforms_imagenet_train
+        transform = create_transform(
+            input_size=config.DATA.IMG_SIZE,
+            is_training=True,
+            color_jitter=config.AUG.COLOR_JITTER
+            if config.AUG.COLOR_JITTER > 0 else None,
+            auto_augment=config.AUG.AUTO_AUGMENT
+            if config.AUG.AUTO_AUGMENT != 'none' else None,
+            re_prob=config.AUG.REPROB,
+            re_mode=config.AUG.REMODE,
+            re_count=config.AUG.RECOUNT,
+            interpolation=config.DATA.INTERPOLATION,
+        )
+        if not resize_im:
+            # replace RandomResizedCropAndInterpolation with
+            # RandomCrop
+            transform.transforms[0] = transforms.RandomCrop(config.DATA.IMG_SIZE, padding=4)
+
+        return transform
+
+    t = []
+    if resize_im:
+        if config.TEST.CROP:
+            size = int(1.0 * config.DATA.IMG_SIZE)
+            t.append(
+                transforms.Resize(size, interpolation=_pil_interp(config.DATA.INTERPOLATION)),
+                # to maintain same ratio w.r.t. 224 images
+            )
+            t.append(transforms.CenterCrop(config.DATA.IMG_SIZE))
+        elif config.AUG.RANDOM_RESIZED_CROP:
+            t.append(
+                transforms.RandomResizedCrop(
+                    (config.DATA.IMG_SIZE, config.DATA.IMG_SIZE),
+                    interpolation=_pil_interp(config.DATA.INTERPOLATION)))
+        else:
+            t.append(
+                transforms.Resize(
+                    (config.DATA.IMG_SIZE, config.DATA.IMG_SIZE),
+                    interpolation=_pil_interp(config.DATA.INTERPOLATION)))
+    t.append(transforms.ToTensor())
+    t.append(transforms.Normalize(config.AUG.MEAN, config.AUG.STD))
+
+    return transforms.Compose(t)
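
Note (editor's sketch, not part of this commit): the is_train=False branch of build_transform_for_linear_probe above reduces to a fixed resize / center-crop / normalize pipeline. A self-contained approximation using a SimpleNamespace stand-in for the repo's config object (field names mirror the YAML keys; the ImageNet-default mean/std values are placeholders, not taken from this commit):

from types import SimpleNamespace

from PIL import Image
from torchvision import transforms

# Stand-in config: only the fields the eval transform actually reads.
cfg = SimpleNamespace(
    DATA=SimpleNamespace(IMG_SIZE=224),
    AUG=SimpleNamespace(MEAN=(0.485, 0.456, 0.406), STD=(0.229, 0.224, 0.225)),
)

# Same steps as the eval branch of build_transform_for_linear_probe.
eval_transform = transforms.Compose([
    transforms.Resize(cfg.DATA.IMG_SIZE, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(cfg.DATA.IMG_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(mean=cfg.AUG.MEAN, std=cfg.AUG.STD),
])

img = Image.new('RGB', (500, 375))  # dummy image stands in for an ImageNet sample
x = eval_transform(img)             # tensor of shape [3, 224, 224]
print(x.shape)
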
VLMEvalKit_old/InternVL/classification/dataset/imagenet_a_r_indices.py
ADDED
@@ -0,0 +1,295 @@
+"""Code from https://github.com/baaivision/EVA/blob/master/EVA-02/asuka/imagenet_a_r_indices.py
+Thanks to the authors of EVA."""
+
+all_wnids = [
+    'n01440764', 'n01443537', 'n01484850', 'n01491361', 'n01494475',
+    'n01496331', 'n01498041', 'n01514668', 'n01514859', 'n01518878',
+    'n01530575', 'n01531178', 'n01532829', 'n01534433', 'n01537544',
+    'n01558993', 'n01560419', 'n01580077', 'n01582220', 'n01592084',
+    'n01601694', 'n01608432', 'n01614925', 'n01616318', 'n01622779',
+    'n01629819', 'n01630670', 'n01631663', 'n01632458', 'n01632777',
+    'n01641577', 'n01644373', 'n01644900', 'n01664065', 'n01665541',
+    'n01667114', 'n01667778', 'n01669191', 'n01675722', 'n01677366',
+    'n01682714', 'n01685808', 'n01687978', 'n01688243', 'n01689811',
+    'n01692333', 'n01693334', 'n01694178', 'n01695060', 'n01697457',
+    'n01698640', 'n01704323', 'n01728572', 'n01728920', 'n01729322',
+    'n01729977', 'n01734418', 'n01735189', 'n01737021', 'n01739381',
+    'n01740131', 'n01742172', 'n01744401', 'n01748264', 'n01749939',
+    'n01751748', 'n01753488', 'n01755581', 'n01756291', 'n01768244',
+    'n01770081', 'n01770393', 'n01773157', 'n01773549', 'n01773797',
+    'n01774384', 'n01774750', 'n01775062', 'n01776313', 'n01784675',
+    'n01795545', 'n01796340', 'n01797886', 'n01798484', 'n01806143',
+    'n01806567', 'n01807496', 'n01817953', 'n01818515', 'n01819313',
+    'n01820546', 'n01824575', 'n01828970', 'n01829413', 'n01833805',
+    'n01843065', 'n01843383', 'n01847000', 'n01855032', 'n01855672',
+    'n01860187', 'n01871265', 'n01872401', 'n01873310', 'n01877812',
+    'n01882714', 'n01883070', 'n01910747', 'n01914609', 'n01917289',
+    'n01924916', 'n01930112', 'n01943899', 'n01944390', 'n01945685',
+    'n01950731', 'n01955084', 'n01968897', 'n01978287', 'n01978455',
+    'n01980166', 'n01981276', 'n01983481', 'n01984695', 'n01985128',
+    'n01986214', 'n01990800', 'n02002556', 'n02002724', 'n02006656',
+    'n02007558', 'n02009229', 'n02009912', 'n02011460', 'n02012849',
+    'n02013706', 'n02017213', 'n02018207', 'n02018795', 'n02025239',
+    'n02027492', 'n02028035', 'n02033041', 'n02037110', 'n02051845',
+    'n02056570', 'n02058221', 'n02066245', 'n02071294', 'n02074367',
+    'n02077923', 'n02085620', 'n02085782', 'n02085936', 'n02086079',
+    'n02086240', 'n02086646', 'n02086910', 'n02087046', 'n02087394',
+    'n02088094', 'n02088238', 'n02088364', 'n02088466', 'n02088632',
+    'n02089078', 'n02089867', 'n02089973', 'n02090379', 'n02090622',
+    'n02090721', 'n02091032', 'n02091134', 'n02091244', 'n02091467',
+    'n02091635', 'n02091831', 'n02092002', 'n02092339', 'n02093256',
+    'n02093428', 'n02093647', 'n02093754', 'n02093859', 'n02093991',
+    'n02094114', 'n02094258', 'n02094433', 'n02095314', 'n02095570',
+    'n02095889', 'n02096051', 'n02096177', 'n02096294', 'n02096437',
+    'n02096585', 'n02097047', 'n02097130', 'n02097209', 'n02097298',
+    'n02097474', 'n02097658', 'n02098105', 'n02098286', 'n02098413',
+    'n02099267', 'n02099429', 'n02099601', 'n02099712', 'n02099849',
+    'n02100236', 'n02100583', 'n02100735', 'n02100877', 'n02101006',
+    'n02101388', 'n02101556', 'n02102040', 'n02102177', 'n02102318',
+    'n02102480', 'n02102973', 'n02104029', 'n02104365', 'n02105056',
+    'n02105162', 'n02105251', 'n02105412', 'n02105505', 'n02105641',
+    'n02105855', 'n02106030', 'n02106166', 'n02106382', 'n02106550',
+    'n02106662', 'n02107142', 'n02107312', 'n02107574', 'n02107683',
+    'n02107908', 'n02108000', 'n02108089', 'n02108422', 'n02108551',
+    'n02108915', 'n02109047', 'n02109525', 'n02109961', 'n02110063',
+    'n02110185', 'n02110341', 'n02110627', 'n02110806', 'n02110958',
+    'n02111129', 'n02111277', 'n02111500', 'n02111889', 'n02112018',
+    'n02112137', 'n02112350', 'n02112706', 'n02113023', 'n02113186',
+    'n02113624', 'n02113712', 'n02113799', 'n02113978', 'n02114367',
+    'n02114548', 'n02114712', 'n02114855', 'n02115641', 'n02115913',
+    'n02116738', 'n02117135', 'n02119022', 'n02119789', 'n02120079',
+    'n02120505', 'n02123045', 'n02123159', 'n02123394', 'n02123597',
+    'n02124075', 'n02125311', 'n02127052', 'n02128385', 'n02128757',
+    'n02128925', 'n02129165', 'n02129604', 'n02130308', 'n02132136',
+    'n02133161', 'n02134084', 'n02134418', 'n02137549', 'n02138441',
+    'n02165105', 'n02165456', 'n02167151', 'n02168699', 'n02169497',
+    'n02172182', 'n02174001', 'n02177972', 'n02190166', 'n02206856',
+    'n02219486', 'n02226429', 'n02229544', 'n02231487', 'n02233338',
+    'n02236044', 'n02256656', 'n02259212', 'n02264363', 'n02268443',
+    'n02268853', 'n02276258', 'n02277742', 'n02279972', 'n02280649',
+    'n02281406', 'n02281787', 'n02317335', 'n02319095', 'n02321529',
+    'n02325366', 'n02326432', 'n02328150', 'n02342885', 'n02346627',
+    'n02356798', 'n02361337', 'n02363005', 'n02364673', 'n02389026',
+    'n02391049', 'n02395406', 'n02396427', 'n02397096', 'n02398521',
+    'n02403003', 'n02408429', 'n02410509', 'n02412080', 'n02415577',
+    'n02417914', 'n02422106', 'n02422699', 'n02423022', 'n02437312',
+    'n02437616', 'n02441942', 'n02442845', 'n02443114', 'n02443484',
+    'n02444819', 'n02445715', 'n02447366', 'n02454379', 'n02457408',
+    'n02480495', 'n02480855', 'n02481823', 'n02483362', 'n02483708',
+    'n02484975', 'n02486261', 'n02486410', 'n02487347', 'n02488291',
+    'n02488702', 'n02489166', 'n02490219', 'n02492035', 'n02492660',
+    'n02493509', 'n02493793', 'n02494079', 'n02497673', 'n02500267',
+    'n02504013', 'n02504458', 'n02509815', 'n02510455', 'n02514041',
+    'n02526121', 'n02536864', 'n02606052', 'n02607072', 'n02640242',
+    'n02641379', 'n02643566', 'n02655020', 'n02666196', 'n02667093',
+    'n02669723', 'n02672831', 'n02676566', 'n02687172', 'n02690373',
+    'n02692877', 'n02699494', 'n02701002', 'n02704792', 'n02708093',
+    'n02727426', 'n02730930', 'n02747177', 'n02749479', 'n02769748',
+    'n02776631', 'n02777292', 'n02782093', 'n02783161', 'n02786058',
+    'n02787622', 'n02788148', 'n02790996', 'n02791124', 'n02791270',
+    'n02793495', 'n02794156', 'n02795169', 'n02797295', 'n02799071',
+    'n02802426', 'n02804414', 'n02804610', 'n02807133', 'n02808304',
+    'n02808440', 'n02814533', 'n02814860', 'n02815834', 'n02817516',
+    'n02823428', 'n02823750', 'n02825657', 'n02834397', 'n02835271',
+    'n02837789', 'n02840245', 'n02841315', 'n02843684', 'n02859443',
+    'n02860847', 'n02865351', 'n02869837', 'n02870880', 'n02871525',
+    'n02877765', 'n02879718', 'n02883205', 'n02892201', 'n02892767',
+    'n02894605', 'n02895154', 'n02906734', 'n02909870', 'n02910353',
+    'n02916936', 'n02917067', 'n02927161', 'n02930766', 'n02939185',
+    'n02948072', 'n02950826', 'n02951358', 'n02951585', 'n02963159',
+    'n02965783', 'n02966193', 'n02966687', 'n02971356', 'n02974003',
+    'n02977058', 'n02978881', 'n02979186', 'n02980441', 'n02981792',
+    'n02988304', 'n02992211', 'n02992529', 'n02999410', 'n03000134',
+    'n03000247', 'n03000684', 'n03014705', 'n03016953', 'n03017168',
+    'n03018349', 'n03026506', 'n03028079', 'n03032252', 'n03041632',
+    'n03042490', 'n03045698', 'n03047690', 'n03062245', 'n03063599',
+    'n03063689', 'n03065424', 'n03075370', 'n03085013', 'n03089624',
+    'n03095699', 'n03100240', 'n03109150', 'n03110669', 'n03124043',
+    'n03124170', 'n03125729', 'n03126707', 'n03127747', 'n03127925',
+    'n03131574', 'n03133878', 'n03134739', 'n03141823', 'n03146219',
+    'n03160309', 'n03179701', 'n03180011', 'n03187595', 'n03188531',
+    'n03196217', 'n03197337', 'n03201208', 'n03207743', 'n03207941',
+    'n03208938', 'n03216828', 'n03218198', 'n03220513', 'n03223299',
+    'n03240683', 'n03249569', 'n03250847', 'n03255030', 'n03259280',
+    'n03271574', 'n03272010', 'n03272562', 'n03290653', 'n03291819',
+    'n03297495', 'n03314780', 'n03325584', 'n03337140', 'n03344393',
+    'n03345487', 'n03347037', 'n03355925', 'n03372029', 'n03376595',
+    'n03379051', 'n03384352', 'n03388043', 'n03388183', 'n03388549',
+    'n03393912', 'n03394916', 'n03400231', 'n03404251', 'n03417042',
+    'n03424325', 'n03425413', 'n03443371', 'n03444034', 'n03445777',
+    'n03445924', 'n03447447', 'n03447721', 'n03450230', 'n03452741',
+    'n03457902', 'n03459775', 'n03461385', 'n03467068', 'n03476684',
+    'n03476991', 'n03478589', 'n03481172', 'n03482405', 'n03483316',
+    'n03485407', 'n03485794', 'n03492542', 'n03494278', 'n03495258',
+    'n03496892', 'n03498962', 'n03527444', 'n03529860', 'n03530642',
+    'n03532672', 'n03534580', 'n03535780', 'n03538406', 'n03544143',
+    'n03584254', 'n03584829', 'n03590841', 'n03594734', 'n03594945',
+    'n03595614', 'n03598930', 'n03599486', 'n03602883', 'n03617480',
+    'n03623198', 'n03627232', 'n03630383', 'n03633091', 'n03637318',
+    'n03642806', 'n03649909', 'n03657121', 'n03658185', 'n03661043',
+    'n03662601', 'n03666591', 'n03670208', 'n03673027', 'n03676483',
+    'n03680355', 'n03690938', 'n03691459', 'n03692522', 'n03697007',
+    'n03706229', 'n03709823', 'n03710193', 'n03710637', 'n03710721',
+    'n03717622', 'n03720891', 'n03721384', 'n03724870', 'n03729826',
+    'n03733131', 'n03733281', 'n03733805', 'n03742115', 'n03743016',
+    'n03759954', 'n03761084', 'n03763968', 'n03764736', 'n03769881',
+    'n03770439', 'n03770679', 'n03773504', 'n03775071', 'n03775546',
+    'n03776460', 'n03777568', 'n03777754', 'n03781244', 'n03782006',
+    'n03785016', 'n03786901', 'n03787032', 'n03788195', 'n03788365',
+    'n03791053', 'n03792782', 'n03792972', 'n03793489', 'n03794056',
+    'n03796401', 'n03803284', 'n03804744', 'n03814639', 'n03814906',
+    'n03825788', 'n03832673', 'n03837869', 'n03838899', 'n03840681',
+    'n03841143', 'n03843555', 'n03854065', 'n03857828', 'n03866082',
+    'n03868242', 'n03868863', 'n03871628', 'n03873416', 'n03874293',
+    'n03874599', 'n03876231', 'n03877472', 'n03877845', 'n03884397',
+    'n03887697', 'n03888257', 'n03888605', 'n03891251', 'n03891332',
+    'n03895866', 'n03899768', 'n03902125', 'n03903868', 'n03908618',
+    'n03908714', 'n03916031', 'n03920288', 'n03924679', 'n03929660',
+    'n03929855', 'n03930313', 'n03930630', 'n03933933', 'n03935335',
+    'n03937543', 'n03938244', 'n03942813', 'n03944341', 'n03947888',
+    'n03950228', 'n03954731', 'n03956157', 'n03958227', 'n03961711',
+    'n03967562', 'n03970156', 'n03976467', 'n03976657', 'n03977966',
+    'n03980874', 'n03982430', 'n03983396', 'n03991062', 'n03992509',
+    'n03995372', 'n03998194', 'n04004767', 'n04005630', 'n04008634',
+    'n04009552', 'n04019541', 'n04023962', 'n04026417', 'n04033901',
+    'n04033995', 'n04037443', 'n04039381', 'n04040759', 'n04041544',
+    'n04044716', 'n04049303', 'n04065272', 'n04067472', 'n04069434',
+    'n04070727', 'n04074963', 'n04081281', 'n04086273', 'n04090263',
+    'n04099969', 'n04111531', 'n04116512', 'n04118538', 'n04118776',
+    'n04120489', 'n04125021', 'n04127249', 'n04131690', 'n04133789',
+    'n04136333', 'n04141076', 'n04141327', 'n04141975', 'n04146614',
+    'n04147183', 'n04149813', 'n04152593', 'n04153751', 'n04154565',
+    'n04162706', 'n04179913', 'n04192698', 'n04200800', 'n04201297',
+    'n04204238', 'n04204347', 'n04208210', 'n04209133', 'n04209239',
+    'n04228054', 'n04229816', 'n04235860', 'n04238763', 'n04239074',
+    'n04243546', 'n04251144', 'n04252077', 'n04252225', 'n04254120',
+    'n04254680', 'n04254777', 'n04258138', 'n04259630', 'n04263257',
+    'n04264628', 'n04265275', 'n04266014', 'n04270147', 'n04273569',
+    'n04275548', 'n04277352', 'n04285008', 'n04286575', 'n04296562',
+    'n04310018', 'n04311004', 'n04311174', 'n04317175', 'n04325704',
+    'n04326547', 'n04328186', 'n04330267', 'n04332243', 'n04335435',
+    'n04336792', 'n04344873', 'n04346328', 'n04347754', 'n04350905',
+    'n04355338', 'n04355933', 'n04356056', 'n04357314', 'n04366367',
+    'n04367480', 'n04370456', 'n04371430', 'n04371774', 'n04372370',
+    'n04376876', 'n04380533', 'n04389033', 'n04392985', 'n04398044',
+    'n04399382', 'n04404412', 'n04409515', 'n04417672', 'n04418357',
+    'n04423845', 'n04428191', 'n04429376', 'n04435653', 'n04442312',
+    'n04443257', 'n04447861', 'n04456115', 'n04458633', 'n04461696',
+    'n04462240', 'n04465501', 'n04467665', 'n04476259', 'n04479046',
+    'n04482393', 'n04483307', 'n04485082', 'n04486054', 'n04487081',
+    'n04487394', 'n04493381', 'n04501370', 'n04505470', 'n04507155',
+    'n04509417', 'n04515003', 'n04517823', 'n04522168', 'n04523525',
+    'n04525038', 'n04525305', 'n04532106', 'n04532670', 'n04536866',
+    'n04540053', 'n04542943', 'n04548280', 'n04548362', 'n04550184',
+    'n04552348', 'n04553703', 'n04554684', 'n04557648', 'n04560804',
+    'n04562935', 'n04579145', 'n04579432', 'n04584207', 'n04589890',
+    'n04590129', 'n04591157', 'n04591713', 'n04592741', 'n04596742',
+    'n04597913', 'n04599235', 'n04604644', 'n04606251', 'n04612504',
+    'n04613696', 'n06359193', 'n06596364', 'n06785654', 'n06794110',
+    'n06874185', 'n07248320', 'n07565083', 'n07579787', 'n07583066',
+    'n07584110', 'n07590611', 'n07613480', 'n07614500', 'n07615774',
+    'n07684084', 'n07693725', 'n07695742', 'n07697313', 'n07697537',
+    'n07711569', 'n07714571', 'n07714990', 'n07715103', 'n07716358',
+    'n07716906', 'n07717410', 'n07717556', 'n07718472', 'n07718747',
+    'n07720875', 'n07730033', 'n07734744', 'n07742313', 'n07745940',
+    'n07747607', 'n07749582', 'n07753113', 'n07753275', 'n07753592',
+    'n07754684', 'n07760859', 'n07768694', 'n07802026', 'n07831146',
+    'n07836838', 'n07860988', 'n07871810', 'n07873807', 'n07875152',
+    'n07880968', 'n07892512', 'n07920052', 'n07930864', 'n07932039',
+    'n09193705', 'n09229709', 'n09246464', 'n09256479', 'n09288635',
+    'n09332890', 'n09399592', 'n09421951', 'n09428293', 'n09468604',
+    'n09472597', 'n09835506', 'n10148035', 'n10565667', 'n11879895',
+    'n11939491', 'n12057211', 'n12144580', 'n12267677', 'n12620546',
+    'n12768682', 'n12985857', 'n12998815', 'n13037406', 'n13040303',
+    'n13044778', 'n13052670', 'n13054560', 'n13133613', 'n15075141'
+]
+
|
| 207 |
+
imagenet_a_wnids = [
|
| 208 |
+
'n01498041', 'n01531178', 'n01534433', 'n01558993', 'n01580077',
|
| 209 |
+
'n01614925', 'n01616318', 'n01631663', 'n01641577', 'n01669191',
|
| 210 |
+
'n01677366', 'n01687978', 'n01694178', 'n01698640', 'n01735189',
|
| 211 |
+
'n01770081', 'n01770393', 'n01774750', 'n01784675', 'n01819313',
|
| 212 |
+
'n01820546', 'n01833805', 'n01843383', 'n01847000', 'n01855672',
|
| 213 |
+
'n01882714', 'n01910747', 'n01914609', 'n01924916', 'n01944390',
|
| 214 |
+
'n01985128', 'n01986214', 'n02007558', 'n02009912', 'n02037110',
|
| 215 |
+
'n02051845', 'n02077923', 'n02085620', 'n02099601', 'n02106550',
|
| 216 |
+
'n02106662', 'n02110958', 'n02119022', 'n02123394', 'n02127052',
|
| 217 |
+
'n02129165', 'n02133161', 'n02137549', 'n02165456', 'n02174001',
|
| 218 |
+
'n02177972', 'n02190166', 'n02206856', 'n02219486', 'n02226429',
|
| 219 |
+
'n02231487', 'n02233338', 'n02236044', 'n02259212', 'n02268443',
|
| 220 |
+
'n02279972', 'n02280649', 'n02281787', 'n02317335', 'n02325366',
|
| 221 |
+
'n02346627', 'n02356798', 'n02361337', 'n02410509', 'n02445715',
|
| 222 |
+
'n02454379', 'n02486410', 'n02492035', 'n02504458', 'n02655020',
|
| 223 |
+
'n02669723', 'n02672831', 'n02676566', 'n02690373', 'n02701002',
|
| 224 |
+
'n02730930', 'n02777292', 'n02782093', 'n02787622', 'n02793495',
|
| 225 |
+
'n02797295', 'n02802426', 'n02814860', 'n02815834', 'n02837789',
|
| 226 |
+
'n02879718', 'n02883205', 'n02895154', 'n02906734', 'n02948072',
|
| 227 |
+
'n02951358', 'n02980441', 'n02992211', 'n02999410', 'n03014705',
|
| 228 |
+
'n03026506', 'n03124043', 'n03125729', 'n03187595', 'n03196217',
|
| 229 |
+
'n03223299', 'n03250847', 'n03255030', 'n03291819', 'n03325584',
|
| 230 |
+
'n03355925', 'n03384352', 'n03388043', 'n03417042', 'n03443371',
|
| 231 |
+
'n03444034', 'n03445924', 'n03452741', 'n03483316', 'n03584829',
|
| 232 |
+
'n03590841', 'n03594945', 'n03617480', 'n03666591', 'n03670208',
|
| 233 |
+
'n03717622', 'n03720891', 'n03721384', 'n03724870', 'n03775071',
|
| 234 |
+
'n03788195', 'n03804744', 'n03837869', 'n03840681', 'n03854065',
|
| 235 |
+
'n03888257', 'n03891332', 'n03935335', 'n03982430', 'n04019541',
|
| 236 |
+
'n04033901', 'n04039381', 'n04067472', 'n04086273', 'n04099969',
|
| 237 |
+
'n04118538', 'n04131690', 'n04133789', 'n04141076', 'n04146614',
|
| 238 |
+
'n04147183', 'n04179913', 'n04208210', 'n04235860', 'n04252077',
|
| 239 |
+
'n04252225', 'n04254120', 'n04270147', 'n04275548', 'n04310018',
|
| 240 |
+
'n04317175', 'n04344873', 'n04347754', 'n04355338', 'n04366367',
|
| 241 |
+
'n04376876', 'n04389033', 'n04399382', 'n04442312', 'n04456115',
|
| 242 |
+
'n04482393', 'n04507155', 'n04509417', 'n04532670', 'n04540053',
|
| 243 |
+
'n04554684', 'n04562935', 'n04591713', 'n04606251', 'n07583066',
|
| 244 |
+
'n07695742', 'n07697313', 'n07697537', 'n07714990', 'n07718472',
|
| 245 |
+
'n07720875', 'n07734744', 'n07749582', 'n07753592', 'n07760859',
|
| 246 |
+
'n07768694', 'n07831146', 'n09229709', 'n09246464', 'n09472597',
|
| 247 |
+
'n09835506', 'n11879895', 'n12057211', 'n12144580', 'n12267677'
|
| 248 |
+
]
|
| 249 |
+
|
| 250 |
+
imagenet_a_mask = [wnid in set(imagenet_a_wnids) for wnid in all_wnids]
|
| 251 |
+
|
| 252 |
+
imagenet_r_wnids = {
|
| 253 |
+
'n01443537', 'n01484850', 'n01494475', 'n01498041', 'n01514859',
|
| 254 |
+
'n01518878', 'n01531178', 'n01534433', 'n01614925', 'n01616318',
|
| 255 |
+
'n01630670', 'n01632777', 'n01644373', 'n01677366', 'n01694178',
|
| 256 |
+
'n01748264', 'n01770393', 'n01774750', 'n01784675', 'n01806143',
|
| 257 |
+
'n01820546', 'n01833805', 'n01843383', 'n01847000', 'n01855672',
|
| 258 |
+
'n01860187', 'n01882714', 'n01910747', 'n01944390', 'n01983481',
|
| 259 |
+
'n01986214', 'n02007558', 'n02009912', 'n02051845', 'n02056570',
|
| 260 |
+
'n02066245', 'n02071294', 'n02077923', 'n02085620', 'n02086240',
|
| 261 |
+
'n02088094', 'n02088238', 'n02088364', 'n02088466', 'n02091032',
|
| 262 |
+
'n02091134', 'n02092339', 'n02094433', 'n02096585', 'n02097298',
|
| 263 |
+
'n02098286', 'n02099601', 'n02099712', 'n02102318', 'n02106030',
|
| 264 |
+
'n02106166', 'n02106550', 'n02106662', 'n02108089', 'n02108915',
|
| 265 |
+
'n02109525', 'n02110185', 'n02110341', 'n02110958', 'n02112018',
|
| 266 |
+
'n02112137', 'n02113023', 'n02113624', 'n02113799', 'n02114367',
|
| 267 |
+
'n02117135', 'n02119022', 'n02123045', 'n02128385', 'n02128757',
|
| 268 |
+
'n02129165', 'n02129604', 'n02130308', 'n02134084', 'n02138441',
|
| 269 |
+
'n02165456', 'n02190166', 'n02206856', 'n02219486', 'n02226429',
|
| 270 |
+
'n02233338', 'n02236044', 'n02268443', 'n02279972', 'n02317335',
|
| 271 |
+
'n02325366', 'n02346627', 'n02356798', 'n02363005', 'n02364673',
|
| 272 |
+
'n02391049', 'n02395406', 'n02398521', 'n02410509', 'n02423022',
|
| 273 |
+
'n02437616', 'n02445715', 'n02447366', 'n02480495', 'n02480855',
|
| 274 |
+
'n02481823', 'n02483362', 'n02486410', 'n02510455', 'n02526121',
|
| 275 |
+
'n02607072', 'n02655020', 'n02672831', 'n02701002', 'n02749479',
|
| 276 |
+
'n02769748', 'n02793495', 'n02797295', 'n02802426', 'n02808440',
|
| 277 |
+
'n02814860', 'n02823750', 'n02841315', 'n02843684', 'n02883205',
|
| 278 |
+
'n02906734', 'n02909870', 'n02939185', 'n02948072', 'n02950826',
|
| 279 |
+
'n02951358', 'n02966193', 'n02980441', 'n02992529', 'n03124170',
|
| 280 |
+
'n03272010', 'n03345487', 'n03372029', 'n03424325', 'n03452741',
|
| 281 |
+
'n03467068', 'n03481172', 'n03494278', 'n03495258', 'n03498962',
|
| 282 |
+
'n03594945', 'n03602883', 'n03630383', 'n03649909', 'n03676483',
|
| 283 |
+
'n03710193', 'n03773504', 'n03775071', 'n03888257', 'n03930630',
|
| 284 |
+
'n03947888', 'n04086273', 'n04118538', 'n04133789', 'n04141076',
|
| 285 |
+
'n04146614', 'n04147183', 'n04192698', 'n04254680', 'n04266014',
|
| 286 |
+
'n04275548', 'n04310018', 'n04325704', 'n04347754', 'n04389033',
|
| 287 |
+
'n04409515', 'n04465501', 'n04487394', 'n04522168', 'n04536866',
|
| 288 |
+
'n04552348', 'n04591713', 'n07614500', 'n07693725', 'n07695742',
|
| 289 |
+
'n07697313', 'n07697537', 'n07714571', 'n07714990', 'n07718472',
|
| 290 |
+
'n07720875', 'n07734744', 'n07742313', 'n07745940', 'n07749582',
|
| 291 |
+
'n07753275', 'n07753592', 'n07768694', 'n07873807', 'n07880968',
|
| 292 |
+
'n07920052', 'n09472597', 'n09835506', 'n10565667', 'n12267677'
|
| 293 |
+
}
|
| 294 |
+
|
| 295 |
+
imagenet_r_mask = [wnid in imagenet_r_wnids for wnid in all_wnids]
|
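The two masks above each contain one boolean per ImageNet-1k class, in `all_wnids` order. A minimal usage sketch of how such a mask is typically applied to restrict 1000-way predictions to the ImageNet-A subset; the logits tensor, batch size and variable names are placeholders for illustration and are not part of this file (the same pattern applies to `imagenet_r_mask`):

import torch

logits = torch.randn(4, 1000)                # placeholder (batch, 1000) classifier scores
mask = torch.tensor(imagenet_a_mask)         # 1000-dim boolean mask defined above
sub_logits = logits[:, mask]                 # keep only the 200 ImageNet-A classes
pred_in_subset = sub_logits.argmax(dim=1)    # indices within the 200-class space
# Map subset indices back to the original ImageNet-1k class ids if needed.
subset_to_full = torch.nonzero(mask).squeeze(1)
pred_full = subset_to_full[pred_in_subset]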
VLMEvalKit_old/InternVL/classification/dataset/imagenetv2.py
ADDED
|
@@ -0,0 +1,59 @@
"""Code from https://github.com/mlfoundations/wise-ft/blob/master/src/datasets/imagenetv2.py
Thanks to the authors of wise-ft."""
import pathlib
import shutil
import tarfile

import requests
from PIL import Image
from torch.utils.data import Dataset
from tqdm import tqdm

URLS = {'matched-frequency': 'https://imagenetv2public.s3-us-west-2.amazonaws.com/imagenetv2-matched-frequency.tar.gz',
        'threshold-0.7': 'https://imagenetv2public.s3-us-west-2.amazonaws.com/imagenetv2-threshold0.7.tar.gz',
        'top-images': 'https://imagenetv2public.s3-us-west-2.amazonaws.com/imagenetv2-top-images.tar.gz',
        'val': 'https://imagenetv2public.s3-us-west-2.amazonaws.com/imagenet_validation.tar.gz'}

FNAMES = {'matched-frequency': 'imagenetv2-matched-frequency-format-val',
          'threshold-0.7': 'imagenetv2-threshold0.7-format-val',
          'top-images': 'imagenetv2-top-images-format-val',
          'val': 'imagenet_validation'}

V2_DATASET_SIZE = 10000
VAL_DATASET_SIZE = 50000


class ImageNetV2Dataset(Dataset):
    def __init__(self, variant='matched-frequency', transform=None, location='.'):
        self.dataset_root = pathlib.Path(f'{location}/ImageNetV2-{variant}/')
        self.tar_root = pathlib.Path(f'{location}/ImageNetV2-{variant}.tar.gz')
        self.fnames = list(self.dataset_root.glob('**/*.jpeg'))
        self.transform = transform
        assert variant in URLS, f'unknown V2 Variant: {variant}'
        if not self.dataset_root.exists() or len(self.fnames) != V2_DATASET_SIZE:
            if not self.tar_root.exists():
                print(f'Dataset {variant} not found on disk, downloading....')
                response = requests.get(URLS[variant], stream=True)
                total_size_in_bytes = int(response.headers.get('content-length', 0))
                block_size = 1024  # 1 Kibibyte
                progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
                with open(self.tar_root, 'wb') as f:
                    for data in response.iter_content(block_size):
                        progress_bar.update(len(data))
                        f.write(data)
                progress_bar.close()
                if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
                    assert False, f'Downloading from {URLS[variant]} failed'
            print('Extracting....')
            tarfile.open(self.tar_root).extractall(f'{location}')
            shutil.move(f'{location}/{FNAMES[variant]}', self.dataset_root)
            self.fnames = list(self.dataset_root.glob('**/*.jpeg'))

    def __len__(self):
        return len(self.fnames)

    def __getitem__(self, i):
        img, label = Image.open(self.fnames[i]), int(self.fnames[i].parent.name)
        if self.transform is not None:
            img = self.transform(img)
        return img, label
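A minimal sketch of how `ImageNetV2Dataset` might be consumed for evaluation; the transform, `location` and batch size below are assumptions chosen for illustration, not values taken from this repository (note that the constructor downloads and extracts the tarball if the data is not already on disk):

from torch.utils.data import DataLoader
from torchvision import transforms

preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
])
dataset = ImageNetV2Dataset(variant='matched-frequency',
                            transform=preprocess,
                            location='./data')          # hypothetical cache directory
loader = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=4)
for images, labels in loader:
    pass  # run the classifier and accumulate accuracy here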
VLMEvalKit_old/InternVL/classification/dataset/samplers.py
ADDED
|
@@ -0,0 +1,116 @@
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------

import math
import os

import numpy as np
import torch
import torch.distributed as dist
from torch.utils.data.sampler import Sampler


class SubsetRandomSampler(torch.utils.data.Sampler):
    """Samples elements randomly from a given list of indices, without
    replacement.

    Arguments:
        indices (sequence): a sequence of indices
    """

    def __init__(self, indices):
        self.epoch = 0
        self.indices = indices

    def __iter__(self):
        return (self.indices[i] for i in torch.randperm(len(self.indices)))

    def __len__(self):
        return len(self.indices)

    def set_epoch(self, epoch):
        self.epoch = epoch


class NodeDistributedSampler(Sampler):
    """Sampler that restricts data loading to a subset of the dataset.
    It is especially useful in conjunction with
    :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
    process can pass a DistributedSampler instance as a DataLoader sampler,
    and load a subset of the original dataset that is exclusive to it.
    .. note::
        Dataset is assumed to be of constant size.
    Arguments:
        dataset: Dataset used for sampling.
        num_replicas (optional): Number of processes participating in
            distributed training.
        rank (optional): Rank of the current process within num_replicas.
    """

    def __init__(self,
                 dataset,
                 num_replicas=None,
                 rank=None,
                 local_rank=None,
                 local_size=None):
        if num_replicas is None:
            if not dist.is_available():
                raise RuntimeError(
                    'Requires distributed package to be available')
            num_replicas = dist.get_world_size()
        if rank is None:
            if not dist.is_available():
                raise RuntimeError(
                    'Requires distributed package to be available')
            rank = dist.get_rank()
        if local_rank is None:
            local_rank = int(os.environ.get('LOCAL_RANK', 0))
        if local_size is None:
            local_size = int(os.environ.get('LOCAL_SIZE', 1))
        self.dataset = dataset
        self.num_replicas = num_replicas
        self.num_parts = local_size
        self.rank = rank
        self.local_rank = local_rank
        self.epoch = 0
        self.num_samples = int(
            math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
        self.total_size = self.num_samples * self.num_replicas

        self.total_size_parts = self.num_samples * self.num_replicas // self.num_parts

    def __iter__(self):
        # deterministically shuffle based on epoch
        g = torch.Generator()
        g.manual_seed(self.epoch)

        t = torch.Generator()
        t.manual_seed(0)

        indices = torch.randperm(len(self.dataset), generator=t).tolist()
        # indices = range(len(self.dataset))
        indices = [i for i in indices if i % self.num_parts == self.local_rank]

        # add extra samples to make it evenly divisible
        indices += indices[:(self.total_size_parts - len(indices))]
        assert len(indices) == self.total_size_parts

        # subsample
        indices = indices[self.rank // self.num_parts:self.
                          total_size_parts:self.num_replicas // self.num_parts]

        index = torch.randperm(len(indices), generator=g).tolist()
        indices = list(np.array(indices)[index])

        assert len(indices) == self.num_samples

        return iter(indices)

    def __len__(self):
        return self.num_samples

    def set_epoch(self, epoch):
        self.epoch = epoch
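A minimal wiring sketch for `NodeDistributedSampler`, assuming `torch.distributed` has already been initialised and that `LOCAL_RANK`/`LOCAL_SIZE` are set by the launcher; the dataset, batch size and epoch count are placeholders for illustration:

import torch
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(torch.arange(1000))        # placeholder dataset
sampler = NodeDistributedSampler(dataset)          # picks up world size / rank from dist
loader = DataLoader(dataset, batch_size=16, sampler=sampler, num_workers=2)

for epoch in range(3):
    sampler.set_epoch(epoch)                       # reshuffle differently each epoch
    for (batch,) in loader:
        pass                                       # training step here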
VLMEvalKit_old/InternVL/classification/dataset/zipreader.py
ADDED
|
@@ -0,0 +1,102 @@
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------

import io
import os
import zipfile

import numpy as np
from PIL import Image, ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True


def is_zip_path(img_or_path):
    """judge if this is a zip path."""
    return '.zip@' in img_or_path


class ZipReader(object):
    """A class to read zipped files."""
    zip_bank = dict()

    def __init__(self):
        super(ZipReader, self).__init__()

    @staticmethod
    def get_zipfile(path):
        zip_bank = ZipReader.zip_bank
        if path not in zip_bank:
            zfile = zipfile.ZipFile(path, 'r')
            zip_bank[path] = zfile
        return zip_bank[path]

    @staticmethod
    def split_zip_style_path(path):
        pos_at = path.index('@')
        assert pos_at != -1, "character '@' is not found from the given path '%s'" % path

        zip_path = path[0:pos_at]
        folder_path = path[pos_at + 1:]
        folder_path = str.strip(folder_path, '/')
        return zip_path, folder_path

    @staticmethod
    def list_folder(path):
        zip_path, folder_path = ZipReader.split_zip_style_path(path)

        zfile = ZipReader.get_zipfile(zip_path)
        folder_list = []
        for file_foler_name in zfile.namelist():
            file_foler_name = str.strip(file_foler_name, '/')
            if file_foler_name.startswith(folder_path) and \
               len(os.path.splitext(file_foler_name)[-1]) == 0 and \
               file_foler_name != folder_path:
                if len(folder_path) == 0:
                    folder_list.append(file_foler_name)
                else:
                    folder_list.append(file_foler_name[len(folder_path) + 1:])

        return folder_list

    @staticmethod
    def list_files(path, extension=None):
        if extension is None:
            extension = ['.*']
        zip_path, folder_path = ZipReader.split_zip_style_path(path)

        zfile = ZipReader.get_zipfile(zip_path)
        file_lists = []
        for file_foler_name in zfile.namelist():
            file_foler_name = str.strip(file_foler_name, '/')
            if file_foler_name.startswith(folder_path) and \
               str.lower(os.path.splitext(file_foler_name)[-1]) in extension:
                if len(folder_path) == 0:
                    file_lists.append(file_foler_name)
                else:
                    file_lists.append(file_foler_name[len(folder_path) + 1:])

        return file_lists

    @staticmethod
    def read(path):
        zip_path, path_img = ZipReader.split_zip_style_path(path)
        zfile = ZipReader.get_zipfile(zip_path)
        data = zfile.read(path_img)
        return data

    @staticmethod
    def imread(path):
        zip_path, path_img = ZipReader.split_zip_style_path(path)
        zfile = ZipReader.get_zipfile(zip_path)
        data = zfile.read(path_img)
        try:
            im = Image.open(io.BytesIO(data))
        except:
            print('ERROR IMG LOADED: ', path_img)
            random_img = np.random.rand(224, 224, 3) * 255
            im = Image.fromarray(np.uint8(random_img))
        return im
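A minimal usage sketch for `ZipReader`; the archive name and member path are hypothetical, the only requirement being the `archive.zip@inner/path` convention that `is_zip_path` and `split_zip_style_path` expect:

# Hypothetical zip archive and member path for illustration.
path = 'data/train.zip@n01440764/n01440764_10026.JPEG'

if is_zip_path(path):
    img = ZipReader.imread(path)     # PIL.Image decoded from the zip member
    raw = ZipReader.read(path)       # raw bytes of the same member
    # Extensions are compared lower-cased, so pass them in lower case.
    files = ZipReader.list_files('data/train.zip@n01440764', extension=['.jpeg', '.jpg'])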
VLMEvalKit_old/InternVL/classification/meta_data/22k_class_to_idx.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
VLMEvalKit_old/InternVL/classification/meta_data/imagenet_classes.json
ADDED
|
@@ -0,0 +1,1002 @@
| 1 |
+
{
|
| 2 |
+
"n01440764": 0,
|
| 3 |
+
"n01443537": 1,
|
| 4 |
+
"n01484850": 2,
|
| 5 |
+
"n01491361": 3,
|
| 6 |
+
"n01494475": 4,
|
| 7 |
+
"n01496331": 5,
|
| 8 |
+
"n01498041": 6,
|
| 9 |
+
"n01514668": 7,
|
| 10 |
+
"n01514859": 8,
|
| 11 |
+
"n01518878": 9,
|
| 12 |
+
"n01530575": 10,
|
| 13 |
+
"n01531178": 11,
|
| 14 |
+
"n01532829": 12,
|
| 15 |
+
"n01534433": 13,
|
| 16 |
+
"n01537544": 14,
|
| 17 |
+
"n01558993": 15,
|
| 18 |
+
"n01560419": 16,
|
| 19 |
+
"n01580077": 17,
|
| 20 |
+
"n01582220": 18,
|
| 21 |
+
"n01592084": 19,
|
| 22 |
+
"n01601694": 20,
|
| 23 |
+
"n01608432": 21,
|
| 24 |
+
"n01614925": 22,
|
| 25 |
+
"n01616318": 23,
|
| 26 |
+
"n01622779": 24,
|
| 27 |
+
"n01629819": 25,
|
| 28 |
+
"n01630670": 26,
|
| 29 |
+
"n01631663": 27,
|
| 30 |
+
"n01632458": 28,
|
| 31 |
+
"n01632777": 29,
|
| 32 |
+
"n01641577": 30,
|
| 33 |
+
"n01644373": 31,
|
| 34 |
+
"n01644900": 32,
|
| 35 |
+
"n01664065": 33,
|
| 36 |
+
"n01665541": 34,
|
| 37 |
+
"n01667114": 35,
|
| 38 |
+
"n01667778": 36,
|
| 39 |
+
"n01669191": 37,
|
| 40 |
+
"n01675722": 38,
|
| 41 |
+
"n01677366": 39,
|
| 42 |
+
"n01682714": 40,
|
| 43 |
+
"n01685808": 41,
|
| 44 |
+
"n01687978": 42,
|
| 45 |
+
"n01688243": 43,
|
| 46 |
+
"n01689811": 44,
|
| 47 |
+
"n01692333": 45,
|
| 48 |
+
"n01693334": 46,
|
| 49 |
+
"n01694178": 47,
|
| 50 |
+
"n01695060": 48,
|
| 51 |
+
"n01697457": 49,
|
| 52 |
+
"n01698640": 50,
|
| 53 |
+
"n01704323": 51,
|
| 54 |
+
"n01728572": 52,
|
| 55 |
+
"n01728920": 53,
|
| 56 |
+
"n01729322": 54,
|
| 57 |
+
"n01729977": 55,
|
| 58 |
+
"n01734418": 56,
|
| 59 |
+
"n01735189": 57,
|
| 60 |
+
"n01737021": 58,
|
| 61 |
+
"n01739381": 59,
|
| 62 |
+
"n01740131": 60,
|
| 63 |
+
"n01742172": 61,
|
| 64 |
+
"n01744401": 62,
|
| 65 |
+
"n01748264": 63,
|
| 66 |
+
"n01749939": 64,
|
| 67 |
+
"n01751748": 65,
|
| 68 |
+
"n01753488": 66,
|
| 69 |
+
"n01755581": 67,
|
| 70 |
+
"n01756291": 68,
|
| 71 |
+
"n01768244": 69,
|
| 72 |
+
"n01770081": 70,
|
| 73 |
+
"n01770393": 71,
|
| 74 |
+
"n01773157": 72,
|
| 75 |
+
"n01773549": 73,
|
| 76 |
+
"n01773797": 74,
|
| 77 |
+
"n01774384": 75,
|
| 78 |
+
"n01774750": 76,
|
| 79 |
+
"n01775062": 77,
|
| 80 |
+
"n01776313": 78,
|
| 81 |
+
"n01784675": 79,
|
| 82 |
+
"n01795545": 80,
|
| 83 |
+
"n01796340": 81,
|
| 84 |
+
"n01797886": 82,
|
| 85 |
+
"n01798484": 83,
|
| 86 |
+
"n01806143": 84,
|
| 87 |
+
"n01806567": 85,
|
| 88 |
+
"n01807496": 86,
|
| 89 |
+
"n01817953": 87,
|
| 90 |
+
"n01818515": 88,
|
| 91 |
+
"n01819313": 89,
|
| 92 |
+
"n01820546": 90,
|
| 93 |
+
"n01824575": 91,
|
| 94 |
+
"n01828970": 92,
|
| 95 |
+
"n01829413": 93,
|
| 96 |
+
"n01833805": 94,
|
| 97 |
+
"n01843065": 95,
|
| 98 |
+
"n01843383": 96,
|
| 99 |
+
"n01847000": 97,
|
| 100 |
+
"n01855032": 98,
|
| 101 |
+
"n01855672": 99,
|
| 102 |
+
"n01860187": 100,
|
| 103 |
+
"n01871265": 101,
|
| 104 |
+
"n01872401": 102,
|
| 105 |
+
"n01873310": 103,
|
| 106 |
+
"n01877812": 104,
|
| 107 |
+
"n01882714": 105,
|
| 108 |
+
"n01883070": 106,
|
| 109 |
+
"n01910747": 107,
|
| 110 |
+
"n01914609": 108,
|
| 111 |
+
"n01917289": 109,
|
| 112 |
+
"n01924916": 110,
|
| 113 |
+
"n01930112": 111,
|
| 114 |
+
"n01943899": 112,
|
| 115 |
+
"n01944390": 113,
|
| 116 |
+
"n01945685": 114,
|
| 117 |
+
"n01950731": 115,
|
| 118 |
+
"n01955084": 116,
|
| 119 |
+
"n01968897": 117,
|
| 120 |
+
"n01978287": 118,
|
| 121 |
+
"n01978455": 119,
|
| 122 |
+
"n01980166": 120,
|
| 123 |
+
"n01981276": 121,
|
| 124 |
+
"n01983481": 122,
|
| 125 |
+
"n01984695": 123,
|
| 126 |
+
"n01985128": 124,
|
| 127 |
+
"n01986214": 125,
|
| 128 |
+
"n01990800": 126,
|
| 129 |
+
"n02002556": 127,
|
| 130 |
+
"n02002724": 128,
|
| 131 |
+
"n02006656": 129,
|
| 132 |
+
"n02007558": 130,
|
| 133 |
+
"n02009229": 131,
|
| 134 |
+
"n02009912": 132,
|
| 135 |
+
"n02011460": 133,
|
| 136 |
+
"n02012849": 134,
|
| 137 |
+
"n02013706": 135,
|
| 138 |
+
"n02017213": 136,
|
| 139 |
+
"n02018207": 137,
|
| 140 |
+
"n02018795": 138,
|
| 141 |
+
"n02025239": 139,
|
| 142 |
+
"n02027492": 140,
|
| 143 |
+
"n02028035": 141,
|
| 144 |
+
"n02033041": 142,
|
| 145 |
+
"n02037110": 143,
|
| 146 |
+
"n02051845": 144,
|
| 147 |
+
"n02056570": 145,
|
| 148 |
+
"n02058221": 146,
|
| 149 |
+
"n02066245": 147,
|
| 150 |
+
"n02071294": 148,
|
| 151 |
+
"n02074367": 149,
|
| 152 |
+
"n02077923": 150,
|
| 153 |
+
"n02085620": 151,
|
| 154 |
+
"n02085782": 152,
|
| 155 |
+
"n02085936": 153,
|
| 156 |
+
"n02086079": 154,
|
| 157 |
+
"n02086240": 155,
|
| 158 |
+
"n02086646": 156,
|
| 159 |
+
"n02086910": 157,
|
| 160 |
+
"n02087046": 158,
|
| 161 |
+
"n02087394": 159,
|
| 162 |
+
"n02088094": 160,
|
| 163 |
+
"n02088238": 161,
|
| 164 |
+
"n02088364": 162,
|
| 165 |
+
"n02088466": 163,
|
| 166 |
+
"n02088632": 164,
|
| 167 |
+
"n02089078": 165,
|
| 168 |
+
"n02089867": 166,
|
| 169 |
+
"n02089973": 167,
|
| 170 |
+
"n02090379": 168,
|
| 171 |
+
"n02090622": 169,
|
| 172 |
+
"n02090721": 170,
|
| 173 |
+
"n02091032": 171,
|
| 174 |
+
"n02091134": 172,
|
| 175 |
+
"n02091244": 173,
|
| 176 |
+
"n02091467": 174,
|
| 177 |
+
"n02091635": 175,
|
| 178 |
+
"n02091831": 176,
|
| 179 |
+
"n02092002": 177,
|
| 180 |
+
"n02092339": 178,
|
| 181 |
+
"n02093256": 179,
|
| 182 |
+
"n02093428": 180,
|
| 183 |
+
"n02093647": 181,
|
| 184 |
+
"n02093754": 182,
|
| 185 |
+
"n02093859": 183,
|
| 186 |
+
"n02093991": 184,
|
| 187 |
+
"n02094114": 185,
|
| 188 |
+
"n02094258": 186,
|
| 189 |
+
"n02094433": 187,
|
| 190 |
+
"n02095314": 188,
|
| 191 |
+
"n02095570": 189,
|
| 192 |
+
"n02095889": 190,
|
| 193 |
+
"n02096051": 191,
|
| 194 |
+
"n02096177": 192,
|
| 195 |
+
"n02096294": 193,
|
| 196 |
+
"n02096437": 194,
|
| 197 |
+
"n02096585": 195,
|
| 198 |
+
"n02097047": 196,
|
| 199 |
+
"n02097130": 197,
|
| 200 |
+
"n02097209": 198,
|
| 201 |
+
"n02097298": 199,
|
| 202 |
+
"n02097474": 200,
|
| 203 |
+
"n02097658": 201,
|
| 204 |
+
"n02098105": 202,
|
| 205 |
+
"n02098286": 203,
|
| 206 |
+
"n02098413": 204,
|
| 207 |
+
"n02099267": 205,
|
| 208 |
+
"n02099429": 206,
|
| 209 |
+
"n02099601": 207,
|
| 210 |
+
"n02099712": 208,
|
| 211 |
+
"n02099849": 209,
|
| 212 |
+
"n02100236": 210,
|
| 213 |
+
"n02100583": 211,
|
| 214 |
+
"n02100735": 212,
|
| 215 |
+
"n02100877": 213,
|
| 216 |
+
"n02101006": 214,
|
| 217 |
+
"n02101388": 215,
|
| 218 |
+
"n02101556": 216,
|
| 219 |
+
"n02102040": 217,
|
| 220 |
+
"n02102177": 218,
|
| 221 |
+
"n02102318": 219,
|
| 222 |
+
"n02102480": 220,
|
| 223 |
+
"n02102973": 221,
|
| 224 |
+
"n02104029": 222,
|
| 225 |
+
"n02104365": 223,
|
| 226 |
+
"n02105056": 224,
|
| 227 |
+
"n02105162": 225,
|
| 228 |
+
"n02105251": 226,
|
| 229 |
+
"n02105412": 227,
|
| 230 |
+
"n02105505": 228,
|
| 231 |
+
"n02105641": 229,
|
| 232 |
+
"n02105855": 230,
|
| 233 |
+
"n02106030": 231,
|
| 234 |
+
"n02106166": 232,
|
| 235 |
+
"n02106382": 233,
|
| 236 |
+
"n02106550": 234,
|
| 237 |
+
"n02106662": 235,
|
| 238 |
+
"n02107142": 236,
|
| 239 |
+
"n02107312": 237,
|
| 240 |
+
"n02107574": 238,
|
| 241 |
+
"n02107683": 239,
|
| 242 |
+
"n02107908": 240,
|
| 243 |
+
"n02108000": 241,
|
| 244 |
+
"n02108089": 242,
|
| 245 |
+
"n02108422": 243,
|
| 246 |
+
"n02108551": 244,
|
| 247 |
+
"n02108915": 245,
|
| 248 |
+
"n02109047": 246,
|
| 249 |
+
"n02109525": 247,
|
| 250 |
+
"n02109961": 248,
|
| 251 |
+
"n02110063": 249,
|
| 252 |
+
"n02110185": 250,
|
| 253 |
+
"n02110341": 251,
|
| 254 |
+
"n02110627": 252,
|
| 255 |
+
"n02110806": 253,
|
| 256 |
+
"n02110958": 254,
|
| 257 |
+
"n02111129": 255,
|
| 258 |
+
"n02111277": 256,
|
| 259 |
+
"n02111500": 257,
|
| 260 |
+
"n02111889": 258,
|
| 261 |
+
"n02112018": 259,
|
| 262 |
+
"n02112137": 260,
|
| 263 |
+
"n02112350": 261,
|
| 264 |
+
"n02112706": 262,
|
| 265 |
+
"n02113023": 263,
|
| 266 |
+
"n02113186": 264,
|
| 267 |
+
"n02113624": 265,
|
| 268 |
+
"n02113712": 266,
|
| 269 |
+
"n02113799": 267,
|
| 270 |
+
"n02113978": 268,
|
| 271 |
+
"n02114367": 269,
|
| 272 |
+
"n02114548": 270,
|
| 273 |
+
"n02114712": 271,
|
| 274 |
+
"n02114855": 272,
|
| 275 |
+
"n02115641": 273,
|
| 276 |
+
"n02115913": 274,
|
| 277 |
+
"n02116738": 275,
|
| 278 |
+
"n02117135": 276,
|
| 279 |
+
"n02119022": 277,
|
| 280 |
+
"n02119789": 278,
|
| 281 |
+
"n02120079": 279,
|
| 282 |
+
"n02120505": 280,
|
| 283 |
+
"n02123045": 281,
|
| 284 |
+
"n02123159": 282,
|
| 285 |
+
"n02123394": 283,
|
| 286 |
+
"n02123597": 284,
|
| 287 |
+
"n02124075": 285,
|
| 288 |
+
"n02125311": 286,
|
| 289 |
+
"n02127052": 287,
|
| 290 |
+
"n02128385": 288,
|
| 291 |
+
"n02128757": 289,
|
| 292 |
+
"n02128925": 290,
|
| 293 |
+
"n02129165": 291,
|
| 294 |
+
"n02129604": 292,
|
| 295 |
+
"n02130308": 293,
|
| 296 |
+
"n02132136": 294,
|
| 297 |
+
"n02133161": 295,
|
| 298 |
+
"n02134084": 296,
|
| 299 |
+
"n02134418": 297,
|
| 300 |
+
"n02137549": 298,
|
| 301 |
+
"n02138441": 299,
|
| 302 |
+
"n02165105": 300,
|
| 303 |
+
"n02165456": 301,
|
| 304 |
+
"n02167151": 302,
|
| 305 |
+
"n02168699": 303,
|
| 306 |
+
"n02169497": 304,
|
| 307 |
+
"n02172182": 305,
|
| 308 |
+
"n02174001": 306,
|
| 309 |
+
"n02177972": 307,
|
| 310 |
+
"n02190166": 308,
|
| 311 |
+
"n02206856": 309,
|
| 312 |
+
"n02219486": 310,
|
| 313 |
+
"n02226429": 311,
|
| 314 |
+
"n02229544": 312,
|
| 315 |
+
"n02231487": 313,
|
| 316 |
+
"n02233338": 314,
|
| 317 |
+
"n02236044": 315,
|
| 318 |
+
"n02256656": 316,
|
| 319 |
+
"n02259212": 317,
|
| 320 |
+
"n02264363": 318,
|
| 321 |
+
"n02268443": 319,
|
| 322 |
+
"n02268853": 320,
|
| 323 |
+
"n02276258": 321,
|
| 324 |
+
"n02277742": 322,
|
| 325 |
+
"n02279972": 323,
|
| 326 |
+
"n02280649": 324,
|
| 327 |
+
"n02281406": 325,
|
| 328 |
+
"n02281787": 326,
|
| 329 |
+
"n02317335": 327,
|
| 330 |
+
"n02319095": 328,
|
| 331 |
+
"n02321529": 329,
|
| 332 |
+
"n02325366": 330,
|
| 333 |
+
"n02326432": 331,
|
| 334 |
+
"n02328150": 332,
|
| 335 |
+
"n02342885": 333,
|
| 336 |
+
"n02346627": 334,
|
| 337 |
+
"n02356798": 335,
|
| 338 |
+
"n02361337": 336,
|
| 339 |
+
"n02363005": 337,
|
| 340 |
+
"n02364673": 338,
|
| 341 |
+
"n02389026": 339,
|
| 342 |
+
"n02391049": 340,
|
| 343 |
+
"n02395406": 341,
|
| 344 |
+
"n02396427": 342,
|
| 345 |
+
"n02397096": 343,
|
| 346 |
+
"n02398521": 344,
|
| 347 |
+
"n02403003": 345,
|
| 348 |
+
"n02408429": 346,
|
| 349 |
+
"n02410509": 347,
|
| 350 |
+
"n02412080": 348,
|
| 351 |
+
"n02415577": 349,
|
| 352 |
+
"n02417914": 350,
|
| 353 |
+
"n02422106": 351,
|
| 354 |
+
"n02422699": 352,
|
| 355 |
+
"n02423022": 353,
|
| 356 |
+
"n02437312": 354,
|
| 357 |
+
"n02437616": 355,
|
| 358 |
+
"n02441942": 356,
|
| 359 |
+
"n02442845": 357,
|
| 360 |
+
"n02443114": 358,
|
| 361 |
+
"n02443484": 359,
|
| 362 |
+
"n02444819": 360,
|
| 363 |
+
"n02445715": 361,
|
| 364 |
+
"n02447366": 362,
|
| 365 |
+
"n02454379": 363,
|
| 366 |
+
"n02457408": 364,
|
| 367 |
+
"n02480495": 365,
|
| 368 |
+
"n02480855": 366,
|
| 369 |
+
"n02481823": 367,
|
| 370 |
+
"n02483362": 368,
|
| 371 |
+
"n02483708": 369,
|
| 372 |
+
"n02484975": 370,
|
| 373 |
+
"n02486261": 371,
|
| 374 |
+
"n02486410": 372,
|
| 375 |
+
"n02487347": 373,
|
| 376 |
+
"n02488291": 374,
|
| 377 |
+
"n02488702": 375,
|
| 378 |
+
"n02489166": 376,
|
| 379 |
+
"n02490219": 377,
|
| 380 |
+
"n02492035": 378,
|
| 381 |
+
"n02492660": 379,
|
| 382 |
+
"n02493509": 380,
|
| 383 |
+
"n02493793": 381,
|
| 384 |
+
"n02494079": 382,
|
| 385 |
+
"n02497673": 383,
|
| 386 |
+
"n02500267": 384,
|
| 387 |
+
"n02504013": 385,
|
| 388 |
+
"n02504458": 386,
|
| 389 |
+
"n02509815": 387,
|
| 390 |
+
"n02510455": 388,
|
| 391 |
+
"n02514041": 389,
|
| 392 |
+
"n02526121": 390,
|
| 393 |
+
"n02536864": 391,
|
| 394 |
+
"n02606052": 392,
|
| 395 |
+
"n02607072": 393,
|
| 396 |
+
"n02640242": 394,
|
| 397 |
+
"n02641379": 395,
|
| 398 |
+
"n02643566": 396,
|
| 399 |
+
"n02655020": 397,
|
| 400 |
+
"n02666196": 398,
|
| 401 |
+
"n02667093": 399,
|
| 402 |
+
"n02669723": 400,
|
| 403 |
+
"n02672831": 401,
|
| 404 |
+
"n02676566": 402,
|
| 405 |
+
"n02687172": 403,
|
| 406 |
+
"n02690373": 404,
|
| 407 |
+
"n02692877": 405,
|
| 408 |
+
"n02699494": 406,
|
| 409 |
+
"n02701002": 407,
|
| 410 |
+
"n02704792": 408,
|
| 411 |
+
"n02708093": 409,
|
| 412 |
+
"n02727426": 410,
|
| 413 |
+
"n02730930": 411,
|
| 414 |
+
"n02747177": 412,
|
| 415 |
+
"n02749479": 413,
|
| 416 |
+
"n02769748": 414,
|
| 417 |
+
"n02776631": 415,
|
| 418 |
+
"n02777292": 416,
|
| 419 |
+
"n02782093": 417,
|
| 420 |
+
"n02783161": 418,
|
| 421 |
+
"n02786058": 419,
|
| 422 |
+
"n02787622": 420,
|
| 423 |
+
"n02788148": 421,
|
| 424 |
+
"n02790996": 422,
|
| 425 |
+
"n02791124": 423,
|
| 426 |
+
"n02791270": 424,
|
| 427 |
+
"n02793495": 425,
|
| 428 |
+
"n02794156": 426,
|
| 429 |
+
"n02795169": 427,
|
| 430 |
+
"n02797295": 428,
|
| 431 |
+
"n02799071": 429,
|
| 432 |
+
"n02802426": 430,
|
| 433 |
+
"n02804414": 431,
|
| 434 |
+
"n02804610": 432,
|
| 435 |
+
"n02807133": 433,
|
| 436 |
+
"n02808304": 434,
|
| 437 |
+
"n02808440": 435,
|
| 438 |
+
"n02814533": 436,
|
| 439 |
+
"n02814860": 437,
|
| 440 |
+
"n02815834": 438,
|
| 441 |
+
"n02817516": 439,
|
| 442 |
+
"n02823428": 440,
|
| 443 |
+
"n02823750": 441,
|
| 444 |
+
"n02825657": 442,
|
| 445 |
+
"n02834397": 443,
|
| 446 |
+
"n02835271": 444,
|
| 447 |
+
"n02837789": 445,
|
| 448 |
+
"n02840245": 446,
|
| 449 |
+
"n02841315": 447,
|
| 450 |
+
"n02843684": 448,
|
| 451 |
+
"n02859443": 449,
|
| 452 |
+
"n02860847": 450,
|
| 453 |
+
"n02865351": 451,
|
| 454 |
+
"n02869837": 452,
|
| 455 |
+
"n02870880": 453,
|
| 456 |
+
"n02871525": 454,
|
| 457 |
+
"n02877765": 455,
|
| 458 |
+
"n02879718": 456,
|
| 459 |
+
"n02883205": 457,
|
| 460 |
+
"n02892201": 458,
|
| 461 |
+
"n02892767": 459,
|
| 462 |
+
"n02894605": 460,
|
| 463 |
+
"n02895154": 461,
|
| 464 |
+
"n02906734": 462,
|
| 465 |
+
"n02909870": 463,
|
| 466 |
+
"n02910353": 464,
|
| 467 |
+
"n02916936": 465,
|
| 468 |
+
"n02917067": 466,
|
| 469 |
+
"n02927161": 467,
|
| 470 |
+
"n02930766": 468,
|
| 471 |
+
"n02939185": 469,
|
| 472 |
+
"n02948072": 470,
|
| 473 |
+
"n02950826": 471,
|
| 474 |
+
"n02951358": 472,
|
| 475 |
+
"n02951585": 473,
|
| 476 |
+
"n02963159": 474,
|
| 477 |
+
"n02965783": 475,
|
| 478 |
+
"n02966193": 476,
|
| 479 |
+
"n02966687": 477,
|
| 480 |
+
"n02971356": 478,
|
| 481 |
+
"n02974003": 479,
|
| 482 |
+
"n02977058": 480,
|
| 483 |
+
"n02978881": 481,
|
| 484 |
+
"n02979186": 482,
|
| 485 |
+
"n02980441": 483,
|
| 486 |
+
"n02981792": 484,
|
| 487 |
+
"n02988304": 485,
|
| 488 |
+
"n02992211": 486,
|
| 489 |
+
"n02992529": 487,
|
| 490 |
+
"n02999410": 488,
|
| 491 |
+
"n03000134": 489,
|
| 492 |
+
"n03000247": 490,
|
| 493 |
+
"n03000684": 491,
|
| 494 |
+
"n03014705": 492,
|
| 495 |
+
"n03016953": 493,
|
| 496 |
+
"n03017168": 494,
|
| 497 |
+
"n03018349": 495,
|
| 498 |
+
"n03026506": 496,
|
| 499 |
+
"n03028079": 497,
|
| 500 |
+
"n03032252": 498,
|
| 501 |
+
"n03041632": 499,
|
| 502 |
+
"n03042490": 500,
|
| 503 |
+
"n03045698": 501,
|
| 504 |
+
"n03047690": 502,
|
| 505 |
+
"n03062245": 503,
|
| 506 |
+
"n03063599": 504,
|
| 507 |
+
"n03063689": 505,
|
| 508 |
+
"n03065424": 506,
|
| 509 |
+
"n03075370": 507,
|
| 510 |
+
"n03085013": 508,
|
| 511 |
+
"n03089624": 509,
|
| 512 |
+
"n03095699": 510,
|
| 513 |
+
"n03100240": 511,
|
| 514 |
+
"n03109150": 512,
|
| 515 |
+
"n03110669": 513,
|
| 516 |
+
"n03124043": 514,
|
| 517 |
+
"n03124170": 515,
|
| 518 |
+
"n03125729": 516,
|
| 519 |
+
"n03126707": 517,
|
| 520 |
+
"n03127747": 518,
|
| 521 |
+
"n03127925": 519,
|
| 522 |
+
"n03131574": 520,
|
| 523 |
+
"n03133878": 521,
|
| 524 |
+
"n03134739": 522,
|
| 525 |
+
"n03141823": 523,
|
| 526 |
+
"n03146219": 524,
|
| 527 |
+
"n03160309": 525,
|
| 528 |
+
"n03179701": 526,
|
| 529 |
+
"n03180011": 527,
|
| 530 |
+
"n03187595": 528,
|
| 531 |
+
"n03188531": 529,
|
| 532 |
+
"n03196217": 530,
|
| 533 |
+
"n03197337": 531,
|
| 534 |
+
"n03201208": 532,
|
| 535 |
+
"n03207743": 533,
|
| 536 |
+
"n03207941": 534,
|
| 537 |
+
"n03208938": 535,
|
| 538 |
+
"n03216828": 536,
|
| 539 |
+
"n03218198": 537,
|
| 540 |
+
"n03220513": 538,
|
| 541 |
+
"n03223299": 539,
|
| 542 |
+
"n03240683": 540,
|
| 543 |
+
"n03249569": 541,
|
| 544 |
+
"n03250847": 542,
|
| 545 |
+
"n03255030": 543,
|
| 546 |
+
"n03259280": 544,
|
| 547 |
+
"n03271574": 545,
|
| 548 |
+
"n03272010": 546,
|
| 549 |
+
"n03272562": 547,
|
| 550 |
+
"n03290653": 548,
|
| 551 |
+
"n03291819": 549,
|
| 552 |
+
"n03297495": 550,
|
| 553 |
+
"n03314780": 551,
|
| 554 |
+
"n03325584": 552,
|
| 555 |
+
"n03337140": 553,
|
| 556 |
+
"n03344393": 554,
|
| 557 |
+
"n03345487": 555,
|
| 558 |
+
"n03347037": 556,
|
| 559 |
+
"n03355925": 557,
|
| 560 |
+
"n03372029": 558,
|
| 561 |
+
"n03376595": 559,
|
| 562 |
+
"n03379051": 560,
|
| 563 |
+
"n03384352": 561,
|
| 564 |
+
"n03388043": 562,
|
| 565 |
+
"n03388183": 563,
|
| 566 |
+
"n03388549": 564,
|
| 567 |
+
"n03393912": 565,
|
| 568 |
+
"n03394916": 566,
|
| 569 |
+
"n03400231": 567,
|
| 570 |
+
"n03404251": 568,
|
| 571 |
+
"n03417042": 569,
|
| 572 |
+
"n03424325": 570,
|
| 573 |
+
"n03425413": 571,
|
| 574 |
+
"n03443371": 572,
|
| 575 |
+
"n03444034": 573,
|
| 576 |
+
"n03445777": 574,
|
| 577 |
+
"n03445924": 575,
|
| 578 |
+
"n03447447": 576,
|
| 579 |
+
"n03447721": 577,
|
| 580 |
+
"n03450230": 578,
|
| 581 |
+
"n03452741": 579,
|
| 582 |
+
"n03457902": 580,
|
| 583 |
+
"n03459775": 581,
|
| 584 |
+
"n03461385": 582,
|
| 585 |
+
"n03467068": 583,
|
| 586 |
+
"n03476684": 584,
|
| 587 |
+
"n03476991": 585,
|
| 588 |
+
"n03478589": 586,
|
| 589 |
+
"n03481172": 587,
|
| 590 |
+
"n03482405": 588,
|
| 591 |
+
"n03483316": 589,
|
| 592 |
+
"n03485407": 590,
|
| 593 |
+
"n03485794": 591,
|
| 594 |
+
"n03492542": 592,
|
| 595 |
+
"n03494278": 593,
|
| 596 |
+
"n03495258": 594,
|
| 597 |
+
"n03496892": 595,
|
| 598 |
+
"n03498962": 596,
|
| 599 |
+
"n03527444": 597,
|
| 600 |
+
"n03529860": 598,
|
| 601 |
+
"n03530642": 599,
|
| 602 |
+
"n03532672": 600,
|
| 603 |
+
"n03534580": 601,
|
| 604 |
+
"n03535780": 602,
|
| 605 |
+
"n03538406": 603,
|
| 606 |
+
"n03544143": 604,
|
| 607 |
+
"n03584254": 605,
|
| 608 |
+
"n03584829": 606,
|
| 609 |
+
"n03590841": 607,
|
| 610 |
+
"n03594734": 608,
|
| 611 |
+
"n03594945": 609,
|
| 612 |
+
"n03595614": 610,
|
| 613 |
+
"n03598930": 611,
|
| 614 |
+
"n03599486": 612,
|
| 615 |
+
"n03602883": 613,
|
| 616 |
+
"n03617480": 614,
|
| 617 |
+
"n03623198": 615,
|
| 618 |
+
"n03627232": 616,
|
| 619 |
+
"n03630383": 617,
|
| 620 |
+
"n03633091": 618,
|
| 621 |
+
"n03637318": 619,
|
| 622 |
+
"n03642806": 620,
|
| 623 |
+
"n03649909": 621,
|
| 624 |
+
"n03657121": 622,
|
| 625 |
+
"n03658185": 623,
|
| 626 |
+
"n03661043": 624,
|
| 627 |
+
"n03662601": 625,
|
| 628 |
+
"n03666591": 626,
|
| 629 |
+
"n03670208": 627,
|
| 630 |
+
"n03673027": 628,
|
| 631 |
+
"n03676483": 629,
|
| 632 |
+
"n03680355": 630,
|
| 633 |
+
"n03690938": 631,
|
| 634 |
+
"n03691459": 632,
|
| 635 |
+
"n03692522": 633,
|
| 636 |
+
"n03697007": 634,
|
| 637 |
+
"n03706229": 635,
|
| 638 |
+
"n03709823": 636,
|
| 639 |
+
"n03710193": 637,
|
| 640 |
+
"n03710637": 638,
|
| 641 |
+
"n03710721": 639,
|
| 642 |
+
"n03717622": 640,
|
| 643 |
+
"n03720891": 641,
|
| 644 |
+
"n03721384": 642,
|
| 645 |
+
"n03724870": 643,
|
| 646 |
+
"n03729826": 644,
|
| 647 |
+
"n03733131": 645,
|
| 648 |
+
"n03733281": 646,
|
| 649 |
+
"n03733805": 647,
|
| 650 |
+
"n03742115": 648,
|
| 651 |
+
"n03743016": 649,
|
| 652 |
+
"n03759954": 650,
|
| 653 |
+
"n03761084": 651,
|
| 654 |
+
"n03763968": 652,
|
| 655 |
+
"n03764736": 653,
|
| 656 |
+
"n03769881": 654,
|
| 657 |
+
"n03770439": 655,
|
| 658 |
+
"n03770679": 656,
|
| 659 |
+
"n03773504": 657,
|
| 660 |
+
"n03775071": 658,
|
| 661 |
+
"n03775546": 659,
|
| 662 |
+
"n03776460": 660,
|
| 663 |
+
"n03777568": 661,
|
| 664 |
+
"n03777754": 662,
|
| 665 |
+
"n03781244": 663,
|
| 666 |
+
"n03782006": 664,
|
| 667 |
+
"n03785016": 665,
|
| 668 |
+
"n03786901": 666,
|
| 669 |
+
"n03787032": 667,
|
| 670 |
+
"n03788195": 668,
|
| 671 |
+
"n03788365": 669,
|
| 672 |
+
"n03791053": 670,
|
| 673 |
+
"n03792782": 671,
|
| 674 |
+
"n03792972": 672,
|
| 675 |
+
"n03793489": 673,
|
| 676 |
+
"n03794056": 674,
|
| 677 |
+
"n03796401": 675,
|
| 678 |
+
"n03803284": 676,
|
| 679 |
+
"n03804744": 677,
|
| 680 |
+
"n03814639": 678,
|
| 681 |
+
"n03814906": 679,
|
| 682 |
+
"n03825788": 680,
|
| 683 |
+
"n03832673": 681,
|
| 684 |
+
"n03837869": 682,
|
| 685 |
+
"n03838899": 683,
|
| 686 |
+
"n03840681": 684,
|
| 687 |
+
"n03841143": 685,
|
| 688 |
+
"n03843555": 686,
|
| 689 |
+
"n03854065": 687,
|
| 690 |
+
"n03857828": 688,
|
| 691 |
+
"n03866082": 689,
|
| 692 |
+
"n03868242": 690,
|
| 693 |
+
"n03868863": 691,
|
| 694 |
+
"n03871628": 692,
|
| 695 |
+
"n03873416": 693,
|
| 696 |
+
"n03874293": 694,
|
| 697 |
+
"n03874599": 695,
|
| 698 |
+
"n03876231": 696,
|
| 699 |
+
"n03877472": 697,
|
| 700 |
+
"n03877845": 698,
|
| 701 |
+
"n03884397": 699,
|
| 702 |
+
"n03887697": 700,
|
| 703 |
+
"n03888257": 701,
|
| 704 |
+
"n03888605": 702,
|
| 705 |
+
"n03891251": 703,
|
| 706 |
+
"n03891332": 704,
|
| 707 |
+
"n03895866": 705,
|
| 708 |
+
"n03899768": 706,
|
| 709 |
+
"n03902125": 707,
|
| 710 |
+
"n03903868": 708,
|
| 711 |
+
"n03908618": 709,
|
| 712 |
+
"n03908714": 710,
|
| 713 |
+
"n03916031": 711,
|
| 714 |
+
"n03920288": 712,
|
| 715 |
+
"n03924679": 713,
|
| 716 |
+
"n03929660": 714,
|
| 717 |
+
"n03929855": 715,
|
| 718 |
+
"n03930313": 716,
|
| 719 |
+
"n03930630": 717,
|
| 720 |
+
"n03933933": 718,
|
| 721 |
+
"n03935335": 719,
|
| 722 |
+
"n03937543": 720,
|
| 723 |
+
"n03938244": 721,
|
| 724 |
+
"n03942813": 722,
|
| 725 |
+
"n03944341": 723,
|
| 726 |
+
"n03947888": 724,
|
| 727 |
+
"n03950228": 725,
|
| 728 |
+
"n03954731": 726,
|
| 729 |
+
"n03956157": 727,
|
| 730 |
+
"n03958227": 728,
|
| 731 |
+
"n03961711": 729,
|
| 732 |
+
"n03967562": 730,
|
| 733 |
+
"n03970156": 731,
|
| 734 |
+
"n03976467": 732,
|
| 735 |
+
"n03976657": 733,
|
| 736 |
+
"n03977966": 734,
|
| 737 |
+
"n03980874": 735,
|
| 738 |
+
"n03982430": 736,
|
| 739 |
+
"n03983396": 737,
|
| 740 |
+
"n03991062": 738,
|
| 741 |
+
"n03992509": 739,
|
| 742 |
+
"n03995372": 740,
|
| 743 |
+
"n03998194": 741,
|
| 744 |
+
"n04004767": 742,
|
| 745 |
+
"n04005630": 743,
|
| 746 |
+
"n04008634": 744,
|
| 747 |
+
"n04009552": 745,
|
| 748 |
+
"n04019541": 746,
|
| 749 |
+
"n04023962": 747,
|
| 750 |
+
"n04026417": 748,
|
| 751 |
+
"n04033901": 749,
|
| 752 |
+
"n04033995": 750,
|
| 753 |
+
"n04037443": 751,
|
| 754 |
+
"n04039381": 752,
|
| 755 |
+
"n04040759": 753,
|
| 756 |
+
"n04041544": 754,
|
| 757 |
+
"n04044716": 755,
|
| 758 |
+
"n04049303": 756,
|
| 759 |
+
"n04065272": 757,
|
| 760 |
+
"n04067472": 758,
|
| 761 |
+
"n04069434": 759,
|
| 762 |
+
"n04070727": 760,
|
| 763 |
+
"n04074963": 761,
|
| 764 |
+
"n04081281": 762,
|
| 765 |
+
"n04086273": 763,
|
| 766 |
+
"n04090263": 764,
|
| 767 |
+
"n04099969": 765,
|
| 768 |
+
"n04111531": 766,
|
| 769 |
+
"n04116512": 767,
|
| 770 |
+
"n04118538": 768,
|
| 771 |
+
"n04118776": 769,
|
| 772 |
+
"n04120489": 770,
|
| 773 |
+
"n04125021": 771,
|
| 774 |
+
"n04127249": 772,
|
| 775 |
+
"n04131690": 773,
|
| 776 |
+
"n04133789": 774,
|
| 777 |
+
"n04136333": 775,
|
| 778 |
+
"n04141076": 776,
|
| 779 |
+
"n04141327": 777,
|
| 780 |
+
"n04141975": 778,
|
| 781 |
+
"n04146614": 779,
|
| 782 |
+
"n04147183": 780,
|
| 783 |
+
"n04149813": 781,
|
| 784 |
+
"n04152593": 782,
|
| 785 |
+
"n04153751": 783,
|
| 786 |
+
"n04154565": 784,
|
| 787 |
+
"n04162706": 785,
|
| 788 |
+
"n04179913": 786,
|
| 789 |
+
"n04192698": 787,
|
| 790 |
+
"n04200800": 788,
|
| 791 |
+
"n04201297": 789,
|
| 792 |
+
"n04204238": 790,
|
| 793 |
+
"n04204347": 791,
|
| 794 |
+
"n04208210": 792,
|
| 795 |
+
"n04209133": 793,
|
| 796 |
+
"n04209239": 794,
|
| 797 |
+
"n04228054": 795,
|
| 798 |
+
"n04229816": 796,
|
| 799 |
+
"n04235860": 797,
|
| 800 |
+
"n04238763": 798,
|
| 801 |
+
"n04239074": 799,
|
| 802 |
+
"n04243546": 800,
|
| 803 |
+
"n04251144": 801,
|
| 804 |
+
"n04252077": 802,
|
| 805 |
+
"n04252225": 803,
|
| 806 |
+
"n04254120": 804,
|
| 807 |
+
"n04254680": 805,
|
| 808 |
+
"n04254777": 806,
|
| 809 |
+
"n04258138": 807,
|
| 810 |
+
"n04259630": 808,
|
| 811 |
+
"n04263257": 809,
|
| 812 |
+
"n04264628": 810,
|
| 813 |
+
"n04265275": 811,
|
| 814 |
+
"n04266014": 812,
|
| 815 |
+
"n04270147": 813,
|
| 816 |
+
"n04273569": 814,
|
| 817 |
+
"n04275548": 815,
|
| 818 |
+
"n04277352": 816,
|
| 819 |
+
"n04285008": 817,
|
| 820 |
+
"n04286575": 818,
|
| 821 |
+
"n04296562": 819,
|
| 822 |
+
"n04310018": 820,
|
| 823 |
+
"n04311004": 821,
|
| 824 |
+
"n04311174": 822,
|
| 825 |
+
"n04317175": 823,
|
| 826 |
+
"n04325704": 824,
|
| 827 |
+
"n04326547": 825,
|
| 828 |
+
"n04328186": 826,
|
| 829 |
+
"n04330267": 827,
|
| 830 |
+
"n04332243": 828,
|
| 831 |
+
"n04335435": 829,
|
| 832 |
+
"n04336792": 830,
|
| 833 |
+
"n04344873": 831,
|
| 834 |
+
"n04346328": 832,
|
| 835 |
+
"n04347754": 833,
|
| 836 |
+
"n04350905": 834,
|
| 837 |
+
"n04355338": 835,
|
| 838 |
+
"n04355933": 836,
|
| 839 |
+
"n04356056": 837,
|
| 840 |
+
"n04357314": 838,
|
| 841 |
+
"n04366367": 839,
|
| 842 |
+
"n04367480": 840,
|
| 843 |
+
"n04370456": 841,
|
| 844 |
+
"n04371430": 842,
|
| 845 |
+
"n04371774": 843,
|
| 846 |
+
"n04372370": 844,
|
| 847 |
+
"n04376876": 845,
|
| 848 |
+
"n04380533": 846,
|
| 849 |
+
"n04389033": 847,
|
| 850 |
+
"n04392985": 848,
|
| 851 |
+
"n04398044": 849,
|
| 852 |
+
"n04399382": 850,
|
| 853 |
+
"n04404412": 851,
|
| 854 |
+
"n04409515": 852,
|
| 855 |
+
"n04417672": 853,
|
| 856 |
+
"n04418357": 854,
|
| 857 |
+
"n04423845": 855,
|
| 858 |
+
"n04428191": 856,
|
| 859 |
+
"n04429376": 857,
|
| 860 |
+
"n04435653": 858,
|
| 861 |
+
"n04442312": 859,
|
| 862 |
+
"n04443257": 860,
|
| 863 |
+
"n04447861": 861,
|
| 864 |
+
"n04456115": 862,
|
| 865 |
+
"n04458633": 863,
|
| 866 |
+
"n04461696": 864,
|
| 867 |
+
"n04462240": 865,
|
| 868 |
+
"n04465501": 866,
|
| 869 |
+
"n04467665": 867,
|
| 870 |
+
"n04476259": 868,
|
| 871 |
+
"n04479046": 869,
|
| 872 |
+
"n04482393": 870,
|
| 873 |
+
"n04483307": 871,
|
| 874 |
+
"n04485082": 872,
|
| 875 |
+
"n04486054": 873,
|
| 876 |
+
"n04487081": 874,
|
| 877 |
+
"n04487394": 875,
|
| 878 |
+
"n04493381": 876,
|
| 879 |
+
"n04501370": 877,
|
| 880 |
+
"n04505470": 878,
|
| 881 |
+
"n04507155": 879,
|
| 882 |
+
"n04509417": 880,
|
| 883 |
+
"n04515003": 881,
|
| 884 |
+
"n04517823": 882,
|
| 885 |
+
"n04522168": 883,
|
| 886 |
+
"n04523525": 884,
|
| 887 |
+
"n04525038": 885,
|
| 888 |
+
"n04525305": 886,
|
| 889 |
+
"n04532106": 887,
|
| 890 |
+
"n04532670": 888,
|
| 891 |
+
"n04536866": 889,
|
| 892 |
+
"n04540053": 890,
|
| 893 |
+
"n04542943": 891,
|
| 894 |
+
"n04548280": 892,
|
| 895 |
+
"n04548362": 893,
|
| 896 |
+
"n04550184": 894,
|
| 897 |
+
"n04552348": 895,
|
| 898 |
+
"n04553703": 896,
|
| 899 |
+
"n04554684": 897,
|
| 900 |
+
"n04557648": 898,
|
| 901 |
+
"n04560804": 899,
|
| 902 |
+
"n04562935": 900,
|
| 903 |
+
"n04579145": 901,
|
| 904 |
+
"n04579432": 902,
|
| 905 |
+
"n04584207": 903,
|
| 906 |
+
"n04589890": 904,
|
| 907 |
+
"n04590129": 905,
|
| 908 |
+
"n04591157": 906,
|
| 909 |
+
"n04591713": 907,
|
| 910 |
+
"n04592741": 908,
|
| 911 |
+
"n04596742": 909,
|
| 912 |
+
"n04597913": 910,
|
| 913 |
+
"n04599235": 911,
|
| 914 |
+
"n04604644": 912,
|
| 915 |
+
"n04606251": 913,
|
| 916 |
+
"n04612504": 914,
|
| 917 |
+
"n04613696": 915,
|
| 918 |
+
"n06359193": 916,
|
| 919 |
+
"n06596364": 917,
|
| 920 |
+
"n06785654": 918,
|
| 921 |
+
"n06794110": 919,
|
| 922 |
+
"n06874185": 920,
|
| 923 |
+
"n07248320": 921,
|
| 924 |
+
"n07565083": 922,
|
| 925 |
+
"n07579787": 923,
|
| 926 |
+
"n07583066": 924,
|
| 927 |
+
"n07584110": 925,
|
| 928 |
+
"n07590611": 926,
|
| 929 |
+
"n07613480": 927,
|
| 930 |
+
"n07614500": 928,
|
| 931 |
+
"n07615774": 929,
|
| 932 |
+
"n07684084": 930,
|
| 933 |
+
"n07693725": 931,
|
| 934 |
+
"n07695742": 932,
|
| 935 |
+
"n07697313": 933,
|
| 936 |
+
"n07697537": 934,
|
| 937 |
+
"n07711569": 935,
|
| 938 |
+
"n07714571": 936,
|
| 939 |
+
"n07714990": 937,
|
| 940 |
+
"n07715103": 938,
|
| 941 |
+
"n07716358": 939,
|
| 942 |
+
"n07716906": 940,
|
| 943 |
+
"n07717410": 941,
|
| 944 |
+
"n07717556": 942,
|
| 945 |
+
"n07718472": 943,
|
| 946 |
+
"n07718747": 944,
|
| 947 |
+
"n07720875": 945,
|
| 948 |
+
"n07730033": 946,
|
| 949 |
+
"n07734744": 947,
|
| 950 |
+
"n07742313": 948,
|
| 951 |
+
"n07745940": 949,
|
| 952 |
+
"n07747607": 950,
|
| 953 |
+
"n07749582": 951,
|
| 954 |
+
"n07753113": 952,
|
| 955 |
+
"n07753275": 953,
|
| 956 |
+
"n07753592": 954,
|
| 957 |
+
"n07754684": 955,
|
| 958 |
+
"n07760859": 956,
|
| 959 |
+
"n07768694": 957,
|
| 960 |
+
"n07802026": 958,
|
| 961 |
+
"n07831146": 959,
|
| 962 |
+
"n07836838": 960,
|
| 963 |
+
"n07860988": 961,
|
| 964 |
+
"n07871810": 962,
|
| 965 |
+
"n07873807": 963,
|
| 966 |
+
"n07875152": 964,
|
| 967 |
+
"n07880968": 965,
|
| 968 |
+
"n07892512": 966,
|
| 969 |
+
"n07920052": 967,
|
| 970 |
+
"n07930864": 968,
|
| 971 |
+
"n07932039": 969,
|
| 972 |
+
"n09193705": 970,
|
| 973 |
+
"n09229709": 971,
|
| 974 |
+
"n09246464": 972,
|
| 975 |
+
"n09256479": 973,
|
| 976 |
+
"n09288635": 974,
|
| 977 |
+
"n09332890": 975,
|
| 978 |
+
"n09399592": 976,
|
| 979 |
+
"n09421951": 977,
|
| 980 |
+
"n09428293": 978,
|
| 981 |
+
"n09468604": 979,
|
| 982 |
+
"n09472597": 980,
|
| 983 |
+
"n09835506": 981,
|
| 984 |
+
"n10148035": 982,
|
| 985 |
+
"n10565667": 983,
|
| 986 |
+
"n11879895": 984,
|
| 987 |
+
"n11939491": 985,
|
| 988 |
+
"n12057211": 986,
|
| 989 |
+
"n12144580": 987,
|
| 990 |
+
"n12267677": 988,
|
| 991 |
+
"n12620546": 989,
|
| 992 |
+
"n12768682": 990,
|
| 993 |
+
"n12985857": 991,
|
| 994 |
+
"n12998815": 992,
|
| 995 |
+
"n13037406": 993,
|
| 996 |
+
"n13040303": 994,
|
| 997 |
+
"n13044778": 995,
|
| 998 |
+
"n13052670": 996,
|
| 999 |
+
"n13054560": 997,
|
| 1000 |
+
"n13133613": 998,
|
| 1001 |
+
"n15075141": 999
|
| 1002 |
+
}
|
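For reference, a minimal sketch (not taken from the repository) of how a mapping file like the one above can be consumed; it only assumes the file is plain JSON with synset-id keys and integer class indices, and the relative path is illustrative:

import json

# Load the synset-id -> class-index mapping shown above.
with open("meta_data/imagenet_classes.json") as f:
    wnid_to_idx = json.load(f)

# Invert it to map a predicted class index back to its WordNet synset id.
idx_to_wnid = {idx: wnid for wnid, idx in wnid_to_idx.items()}
assert idx_to_wnid[999] == "n15075141"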
VLMEvalKit_old/InternVL/classification/meta_data/map22kto1k.txt
ADDED
|
@@ -0,0 +1,1000 @@
| 1 |
+
359
|
| 2 |
+
368
|
| 3 |
+
460
|
| 4 |
+
475
|
| 5 |
+
486
|
| 6 |
+
492
|
| 7 |
+
496
|
| 8 |
+
514
|
| 9 |
+
516
|
| 10 |
+
525
|
| 11 |
+
547
|
| 12 |
+
548
|
| 13 |
+
556
|
| 14 |
+
563
|
| 15 |
+
575
|
| 16 |
+
641
|
| 17 |
+
648
|
| 18 |
+
723
|
| 19 |
+
733
|
| 20 |
+
765
|
| 21 |
+
801
|
| 22 |
+
826
|
| 23 |
+
852
|
| 24 |
+
858
|
| 25 |
+
878
|
| 26 |
+
896
|
| 27 |
+
900
|
| 28 |
+
905
|
| 29 |
+
908
|
| 30 |
+
910
|
| 31 |
+
935
|
| 32 |
+
946
|
| 33 |
+
947
|
| 34 |
+
994
|
| 35 |
+
999
|
| 36 |
+
1003
|
| 37 |
+
1005
|
| 38 |
+
1010
|
| 39 |
+
1027
|
| 40 |
+
1029
|
| 41 |
+
1048
|
| 42 |
+
1055
|
| 43 |
+
1064
|
| 44 |
+
1065
|
| 45 |
+
1069
|
| 46 |
+
1075
|
| 47 |
+
1079
|
| 48 |
+
1081
|
| 49 |
+
1085
|
| 50 |
+
1088
|
| 51 |
+
1093
|
| 52 |
+
1106
|
| 53 |
+
1143
|
| 54 |
+
1144
|
| 55 |
+
1145
|
| 56 |
+
1147
|
| 57 |
+
1168
|
| 58 |
+
1171
|
| 59 |
+
1178
|
| 60 |
+
1187
|
| 61 |
+
1190
|
| 62 |
+
1197
|
| 63 |
+
1205
|
| 64 |
+
1216
|
| 65 |
+
1223
|
| 66 |
+
1230
|
| 67 |
+
1236
|
| 68 |
+
1241
|
| 69 |
+
1245
|
| 70 |
+
1257
|
| 71 |
+
1259
|
| 72 |
+
1260
|
| 73 |
+
1267
|
| 74 |
+
1268
|
| 75 |
+
1269
|
| 76 |
+
1271
|
| 77 |
+
1272
|
| 78 |
+
1273
|
| 79 |
+
1277
|
| 80 |
+
1303
|
| 81 |
+
1344
|
| 82 |
+
1349
|
| 83 |
+
1355
|
| 84 |
+
1357
|
| 85 |
+
1384
|
| 86 |
+
1388
|
| 87 |
+
1391
|
| 88 |
+
1427
|
| 89 |
+
1429
|
| 90 |
+
1432
|
| 91 |
+
1437
|
| 92 |
+
1450
|
| 93 |
+
1461
|
| 94 |
+
1462
|
| 95 |
+
1474
|
| 96 |
+
1502
|
| 97 |
+
1503
|
| 98 |
+
1512
|
| 99 |
+
1552
|
| 100 |
+
1555
|
| 101 |
+
1577
|
| 102 |
+
1584
|
| 103 |
+
1587
|
| 104 |
+
1589
|
| 105 |
+
1599
|
| 106 |
+
1615
|
| 107 |
+
1616
|
| 108 |
+
1681
|
| 109 |
+
1692
|
| 110 |
+
1701
|
| 111 |
+
1716
|
| 112 |
+
1729
|
| 113 |
+
1757
|
| 114 |
+
1759
|
| 115 |
+
1764
|
| 116 |
+
1777
|
| 117 |
+
1786
|
| 118 |
+
1822
|
| 119 |
+
1841
|
| 120 |
+
1842
|
| 121 |
+
1848
|
| 122 |
+
1850
|
| 123 |
+
1856
|
| 124 |
+
1860
|
| 125 |
+
1861
|
| 126 |
+
1864
|
| 127 |
+
1876
|
| 128 |
+
1897
|
| 129 |
+
1898
|
| 130 |
+
1910
|
| 131 |
+
1913
|
| 132 |
+
1918
|
| 133 |
+
1922
|
| 134 |
+
1928
|
| 135 |
+
1932
|
| 136 |
+
1935
|
| 137 |
+
1947
|
| 138 |
+
1951
|
| 139 |
+
1953
|
| 140 |
+
1970
|
| 141 |
+
1977
|
| 142 |
+
1979
|
| 143 |
+
2001
|
| 144 |
+
2017
|
| 145 |
+
2067
|
| 146 |
+
2081
|
| 147 |
+
2087
|
| 148 |
+
2112
|
| 149 |
+
2128
|
| 150 |
+
2135
|
| 151 |
+
2147
|
| 152 |
+
2174
|
| 153 |
+
2175
|
| 154 |
+
2176
|
| 155 |
+
2177
|
| 156 |
+
2178
|
| 157 |
+
2181
|
| 158 |
+
2183
|
| 159 |
+
2184
|
| 160 |
+
2187
|
| 161 |
+
2189
|
| 162 |
+
2190
|
| 163 |
+
2191
|
| 164 |
+
2192
|
| 165 |
+
2193
|
| 166 |
+
2197
|
| 167 |
+
2202
|
| 168 |
+
2203
|
| 169 |
+
2206
|
| 170 |
+
2208
|
| 171 |
+
2209
|
| 172 |
+
2211
|
| 173 |
+
2212
|
| 174 |
+
2213
|
| 175 |
+
2214
|
| 176 |
+
2215
|
| 177 |
+
2216
|
| 178 |
+
2217
|
| 179 |
+
2219
|
| 180 |
+
2222
|
| 181 |
+
2223
|
| 182 |
+
2224
|
| 183 |
+
2225
|
| 184 |
+
2226
|
| 185 |
+
2227
|
| 186 |
+
2228
|
| 187 |
+
2229
|
| 188 |
+
2230
|
| 189 |
+
2236
|
| 190 |
+
2238
|
| 191 |
+
2240
|
| 192 |
+
2241
|
| 193 |
+
2242
|
| 194 |
+
2243
|
| 195 |
+
2244
|
| 196 |
+
2245
|
| 197 |
+
2247
|
| 198 |
+
2248
|
| 199 |
+
2249
|
| 200 |
+
2250
|
| 201 |
+
2251
|
| 202 |
+
2252
|
| 203 |
+
2255
|
| 204 |
+
2256
|
| 205 |
+
2257
|
| 206 |
+
2262
|
| 207 |
+
2263
|
| 208 |
+
2264
|
| 209 |
+
2265
|
| 210 |
+
2266
|
| 211 |
+
2268
|
| 212 |
+
2270
|
| 213 |
+
2271
|
| 214 |
+
2272
|
| 215 |
+
2273
|
| 216 |
+
2275
|
| 217 |
+
2276
|
| 218 |
+
2279
|
| 219 |
+
2280
|
| 220 |
+
2281
|
| 221 |
+
2282
|
| 222 |
+
2285
|
| 223 |
+
2289
|
| 224 |
+
2292
|
| 225 |
+
2295
|
| 226 |
+
2296
|
| 227 |
+
2297
|
| 228 |
+
2298
|
| 229 |
+
2299
|
| 230 |
+
2300
|
| 231 |
+
2301
|
| 232 |
+
2302
|
| 233 |
+
2303
|
| 234 |
+
2304
|
| 235 |
+
2305
|
| 236 |
+
2306
|
| 237 |
+
2309
|
| 238 |
+
2310
|
| 239 |
+
2312
|
| 240 |
+
2313
|
| 241 |
+
2314
|
| 242 |
+
2315
|
| 243 |
+
2316
|
| 244 |
+
2318
|
| 245 |
+
2319
|
| 246 |
+
2321
|
| 247 |
+
2322
|
| 248 |
+
2326
|
| 249 |
+
2329
|
| 250 |
+
2330
|
| 251 |
+
2331
|
| 252 |
+
2332
|
| 253 |
+
2334
|
| 254 |
+
2335
|
| 255 |
+
2336
|
| 256 |
+
2337
|
| 257 |
+
2338
|
| 258 |
+
2339
|
| 259 |
+
2341
|
| 260 |
+
2342
|
| 261 |
+
2343
|
| 262 |
+
2344
|
| 263 |
+
2346
|
| 264 |
+
2348
|
| 265 |
+
2349
|
| 266 |
+
2351
|
| 267 |
+
2352
|
| 268 |
+
2353
|
| 269 |
+
2355
|
| 270 |
+
2357
|
| 271 |
+
2358
|
| 272 |
+
2359
|
| 273 |
+
2360
|
| 274 |
+
2364
|
| 275 |
+
2365
|
| 276 |
+
2368
|
| 277 |
+
2369
|
| 278 |
+
2377
|
| 279 |
+
2382
|
| 280 |
+
2383
|
| 281 |
+
2385
|
| 282 |
+
2397
|
| 283 |
+
2398
|
| 284 |
+
2400
|
| 285 |
+
2402
|
| 286 |
+
2405
|
| 287 |
+
2412
|
| 288 |
+
2421
|
| 289 |
+
2428
|
| 290 |
+
2431
|
| 291 |
+
2432
|
| 292 |
+
2433
|
| 293 |
+
2436
|
| 294 |
+
2441
|
| 295 |
+
2445
|
| 296 |
+
2450
|
| 297 |
+
2453
|
| 298 |
+
2454
|
| 299 |
+
2465
|
| 300 |
+
2469
|
| 301 |
+
2532
|
| 302 |
+
2533
|
| 303 |
+
2538
|
| 304 |
+
2544
|
| 305 |
+
2547
|
| 306 |
+
2557
|
| 307 |
+
2565
|
| 308 |
+
2578
|
| 309 |
+
2612
|
| 310 |
+
2658
|
| 311 |
+
2702
|
| 312 |
+
2722
|
| 313 |
+
2731
|
| 314 |
+
2738
|
| 315 |
+
2741
|
| 316 |
+
2747
|
| 317 |
+
2810
|
| 318 |
+
2818
|
| 319 |
+
2833
|
| 320 |
+
2844
|
| 321 |
+
2845
|
| 322 |
+
2867
|
| 323 |
+
2874
|
| 324 |
+
2882
|
| 325 |
+
2884
|
| 326 |
+
2888
|
| 327 |
+
2889
|
| 328 |
+
3008
|
| 329 |
+
3012
|
| 330 |
+
3019
|
| 331 |
+
3029
|
| 332 |
+
3033
|
| 333 |
+
3042
|
| 334 |
+
3091
|
| 335 |
+
3106
|
| 336 |
+
3138
|
| 337 |
+
3159
|
| 338 |
+
3164
|
| 339 |
+
3169
|
| 340 |
+
3280
|
| 341 |
+
3296
|
| 342 |
+
3311
|
| 343 |
+
3318
|
| 344 |
+
3320
|
| 345 |
+
3324
|
| 346 |
+
3330
|
| 347 |
+
3366
|
| 348 |
+
3375
|
| 349 |
+
3381
|
| 350 |
+
3406
|
| 351 |
+
3419
|
| 352 |
+
3432
|
| 353 |
+
3434
|
| 354 |
+
3435
|
| 355 |
+
3493
|
| 356 |
+
3495
|
| 357 |
+
3503
|
| 358 |
+
3509
|
| 359 |
+
3511
|
| 360 |
+
3513
|
| 361 |
+
3517
|
| 362 |
+
3521
|
| 363 |
+
3526
|
| 364 |
+
3546
|
| 365 |
+
3554
|
| 366 |
+
3600
|
| 367 |
+
3601
|
| 368 |
+
3606
|
| 369 |
+
3612
|
| 370 |
+
3613
|
| 371 |
+
3616
|
| 372 |
+
3622
|
| 373 |
+
3623
|
| 374 |
+
3627
|
| 375 |
+
3632
|
| 376 |
+
3634
|
| 377 |
+
3636
|
| 378 |
+
3638
|
| 379 |
+
3644
|
| 380 |
+
3646
|
| 381 |
+
3649
|
| 382 |
+
3650
|
| 383 |
+
3651
|
| 384 |
+
3656
|
| 385 |
+
3663
|
| 386 |
+
3673
|
| 387 |
+
3674
|
| 388 |
+
3689
|
| 389 |
+
3690
|
| 390 |
+
3702
|
| 391 |
+
3733
|
| 392 |
+
3769
|
| 393 |
+
3971
|
| 394 |
+
3974
|
| 395 |
+
4065
|
| 396 |
+
4068
|
| 397 |
+
4073
|
| 398 |
+
4102
|
| 399 |
+
4136
|
| 400 |
+
4140
|
| 401 |
+
4151
|
| 402 |
+
4159
|
| 403 |
+
4165
|
| 404 |
+
4207
|
| 405 |
+
4219
|
| 406 |
+
4226
|
| 407 |
+
4249
|
| 408 |
+
4256
|
| 409 |
+
4263
|
| 410 |
+
4270
|
| 411 |
+
4313
|
| 412 |
+
4321
|
| 413 |
+
4378
|
| 414 |
+
4386
|
| 415 |
+
4478
|
| 416 |
+
4508
|
| 417 |
+
4512
|
| 418 |
+
4536
|
| 419 |
+
4542
|
| 420 |
+
4550
|
| 421 |
+
4560
|
| 422 |
+
4562
|
| 423 |
+
4570
|
| 424 |
+
4571
|
| 425 |
+
4572
|
| 426 |
+
4583
|
| 427 |
+
4588
|
| 428 |
+
4594
|
| 429 |
+
4604
|
| 430 |
+
4608
|
| 431 |
+
4623
|
| 432 |
+
4634
|
| 433 |
+
4636
|
| 434 |
+
4646
|
| 435 |
+
4651
|
| 436 |
+
4652
|
| 437 |
+
4686
|
| 438 |
+
4688
|
| 439 |
+
4691
|
| 440 |
+
4699
|
| 441 |
+
4724
|
| 442 |
+
4727
|
| 443 |
+
4737
|
| 444 |
+
4770
|
| 445 |
+
4774
|
| 446 |
+
4789
|
| 447 |
+
4802
|
| 448 |
+
4807
|
| 449 |
+
4819
|
| 450 |
+
4880
|
| 451 |
+
4886
|
| 452 |
+
4908
|
| 453 |
+
4927
|
| 454 |
+
4931
|
| 455 |
+
4936
|
| 456 |
+
4964
|
| 457 |
+
4976
|
| 458 |
+
4993
|
| 459 |
+
5028
|
| 460 |
+
5033
|
| 461 |
+
5043
|
| 462 |
+
5046
|
| 463 |
+
5096
|
| 464 |
+
5111
|
| 465 |
+
5114
|
| 466 |
+
5131
|
| 467 |
+
5132
|
| 468 |
+
5183
|
| 469 |
+
5199
|
| 470 |
+
5235
|
| 471 |
+
5275
|
| 472 |
+
5291
|
| 473 |
+
5293
|
| 474 |
+
5294
|
| 475 |
+
5343
|
| 476 |
+
5360
|
| 477 |
+
5362
|
| 478 |
+
5364
|
| 479 |
+
5390
|
| 480 |
+
5402
|
| 481 |
+
5418
|
| 482 |
+
5428
|
| 483 |
+
5430
|
| 484 |
+
5437
|
| 485 |
+
5443
|
| 486 |
+
5473
|
| 487 |
+
5484
|
| 488 |
+
5486
|
| 489 |
+
5505
|
| 490 |
+
5507
|
| 491 |
+
5508
|
| 492 |
+
5510
|
| 493 |
+
5567
|
| 494 |
+
5578
|
| 495 |
+
5580
|
| 496 |
+
5584
|
| 497 |
+
5606
|
| 498 |
+
5613
|
| 499 |
+
5629
|
| 500 |
+
5672
|
| 501 |
+
5676
|
| 502 |
+
5692
|
| 503 |
+
5701
|
| 504 |
+
5760
|
| 505 |
+
5769
|
| 506 |
+
5770
|
| 507 |
+
5779
|
| 508 |
+
5814
|
| 509 |
+
5850
|
| 510 |
+
5871
|
| 511 |
+
5893
|
| 512 |
+
5911
|
| 513 |
+
5949
|
| 514 |
+
5954
|
| 515 |
+
6005
|
| 516 |
+
6006
|
| 517 |
+
6012
|
| 518 |
+
6017
|
| 519 |
+
6023
|
| 520 |
+
6024
|
| 521 |
+
6040
|
| 522 |
+
6050
|
| 523 |
+
6054
|
| 524 |
+
6087
|
| 525 |
+
6105
|
| 526 |
+
6157
|
| 527 |
+
6235
|
| 528 |
+
6237
|
| 529 |
+
6256
|
| 530 |
+
6259
|
| 531 |
+
6286
|
| 532 |
+
6291
|
| 533 |
+
6306
|
| 534 |
+
6339
|
| 535 |
+
6341
|
| 536 |
+
6343
|
| 537 |
+
6379
|
| 538 |
+
6383
|
| 539 |
+
6393
|
| 540 |
+
6405
|
| 541 |
+
6479
|
| 542 |
+
6511
|
| 543 |
+
6517
|
| 544 |
+
6541
|
| 545 |
+
6561
|
| 546 |
+
6608
|
| 547 |
+
6611
|
| 548 |
+
6615
|
| 549 |
+
6678
|
| 550 |
+
6682
|
| 551 |
+
6707
|
| 552 |
+
6752
|
| 553 |
+
6798
|
| 554 |
+
6850
|
| 555 |
+
6880
|
| 556 |
+
6885
|
| 557 |
+
6890
|
| 558 |
+
6920
|
| 559 |
+
6981
|
| 560 |
+
7000
|
| 561 |
+
7009
|
| 562 |
+
7038
|
| 563 |
+
7049
|
| 564 |
+
7050
|
| 565 |
+
7052
|
| 566 |
+
7073
|
| 567 |
+
7078
|
| 568 |
+
7098
|
| 569 |
+
7111
|
| 570 |
+
7165
|
| 571 |
+
7198
|
| 572 |
+
7204
|
| 573 |
+
7280
|
| 574 |
+
7283
|
| 575 |
+
7286
|
| 576 |
+
7287
|
| 577 |
+
7293
|
| 578 |
+
7294
|
| 579 |
+
7305
|
| 580 |
+
7318
|
| 581 |
+
7341
|
| 582 |
+
7346
|
| 583 |
+
7354
|
| 584 |
+
7382
|
| 585 |
+
7427
|
| 586 |
+
7428
|
| 587 |
+
7435
|
| 588 |
+
7445
|
| 589 |
+
7450
|
| 590 |
+
7455
|
| 591 |
+
7467
|
| 592 |
+
7469
|
| 593 |
+
7497
|
| 594 |
+
7502
|
| 595 |
+
7506
|
| 596 |
+
7514
|
| 597 |
+
7523
|
| 598 |
+
7651
|
| 599 |
+
7661
|
| 600 |
+
7664
|
| 601 |
+
7672
|
| 602 |
+
7679
|
| 603 |
+
7685
|
| 604 |
+
7696
|
| 605 |
+
7730
|
| 606 |
+
7871
|
| 607 |
+
7873
|
| 608 |
+
7895
|
| 609 |
+
7914
|
| 610 |
+
7915
|
| 611 |
+
7920
|
| 612 |
+
7934
|
| 613 |
+
7935
|
| 614 |
+
7949
|
| 615 |
+
8009
|
| 616 |
+
8036
|
| 617 |
+
8051
|
| 618 |
+
8065
|
| 619 |
+
8074
|
| 620 |
+
8090
|
| 621 |
+
8112
|
| 622 |
+
8140
|
| 623 |
+
8164
|
| 624 |
+
8168
|
| 625 |
+
8178
|
| 626 |
+
8182
|
| 627 |
+
8198
|
| 628 |
+
8212
|
| 629 |
+
8216
|
| 630 |
+
8230
|
| 631 |
+
8242
|
| 632 |
+
8288
|
| 633 |
+
8289
|
| 634 |
+
8295
|
| 635 |
+
8318
|
| 636 |
+
8352
|
| 637 |
+
8368
|
| 638 |
+
8371
|
| 639 |
+
8375
|
| 640 |
+
8376
|
| 641 |
+
8401
|
| 642 |
+
8416
|
| 643 |
+
8419
|
| 644 |
+
8436
|
| 645 |
+
8460
|
| 646 |
+
8477
|
| 647 |
+
8478
|
| 648 |
+
8482
|
| 649 |
+
8498
|
| 650 |
+
8500
|
| 651 |
+
8539
|
| 652 |
+
8543
|
| 653 |
+
8552
|
| 654 |
+
8555
|
| 655 |
+
8580
|
| 656 |
+
8584
|
| 657 |
+
8586
|
| 658 |
+
8594
|
| 659 |
+
8598
|
| 660 |
+
8601
|
| 661 |
+
8606
|
| 662 |
+
8610
|
| 663 |
+
8611
|
| 664 |
+
8622
|
| 665 |
+
8627
|
| 666 |
+
8639
|
| 667 |
+
8649
|
| 668 |
+
8650
|
| 669 |
+
8653
|
| 670 |
+
8654
|
| 671 |
+
8667
|
| 672 |
+
8672
|
| 673 |
+
8673
|
| 674 |
+
8674
|
| 675 |
+
8676
|
| 676 |
+
8684
|
| 677 |
+
8720
|
| 678 |
+
8723
|
| 679 |
+
8750
|
| 680 |
+
8753
|
| 681 |
+
8801
|
| 682 |
+
8815
|
| 683 |
+
8831
|
| 684 |
+
8835
|
| 685 |
+
8842
|
| 686 |
+
8845
|
| 687 |
+
8858
|
| 688 |
+
8897
|
| 689 |
+
8916
|
| 690 |
+
8951
|
| 691 |
+
8954
|
| 692 |
+
8959
|
| 693 |
+
8970
|
| 694 |
+
8976
|
| 695 |
+
8981
|
| 696 |
+
8983
|
| 697 |
+
8989
|
| 698 |
+
8991
|
| 699 |
+
8993
|
| 700 |
+
9019
|
| 701 |
+
9039
|
| 702 |
+
9042
|
| 703 |
+
9043
|
| 704 |
+
9056
|
| 705 |
+
9057
|
| 706 |
+
9070
|
| 707 |
+
9087
|
| 708 |
+
9098
|
| 709 |
+
9106
|
| 710 |
+
9130
|
| 711 |
+
9131
|
| 712 |
+
9155
|
| 713 |
+
9171
|
| 714 |
+
9183
|
| 715 |
+
9198
|
| 716 |
+
9199
|
| 717 |
+
9201
|
| 718 |
+
9204
|
| 719 |
+
9211
|
| 720 |
+
9220
|
| 721 |
+
9224
|
| 722 |
+
9228
|
| 723 |
+
9249
|
| 724 |
+
9259
|
| 725 |
+
9270
|
| 726 |
+
9278
|
| 727 |
+
9294
|
| 728 |
+
9299
|
| 729 |
+
9309
|
| 730 |
+
9321
|
| 731 |
+
9344
|
| 732 |
+
9351
|
| 733 |
+
9375
|
| 734 |
+
9376
|
| 735 |
+
9381
|
| 736 |
+
9391
|
| 737 |
+
9400
|
| 738 |
+
9404
|
| 739 |
+
9440
|
| 740 |
+
9448
|
| 741 |
+
9463
|
| 742 |
+
9474
|
| 743 |
+
9501
|
| 744 |
+
9504
|
| 745 |
+
9513
|
| 746 |
+
9514
|
| 747 |
+
9544
|
| 748 |
+
9566
|
| 749 |
+
9575
|
| 750 |
+
9607
|
| 751 |
+
9608
|
| 752 |
+
9623
|
| 753 |
+
9632
|
| 754 |
+
9638
|
| 755 |
+
9642
|
| 756 |
+
9655
|
| 757 |
+
9673
|
| 758 |
+
9739
|
| 759 |
+
9751
|
| 760 |
+
9759
|
| 761 |
+
9766
|
| 762 |
+
9777
|
| 763 |
+
9801
|
| 764 |
+
9819
|
| 765 |
+
9838
|
| 766 |
+
9878
|
| 767 |
+
9923
|
| 768 |
+
9955
|
| 769 |
+
9960
|
| 770 |
+
9962
|
| 771 |
+
9969
|
| 772 |
+
9996
|
| 773 |
+
10009
|
| 774 |
+
10030
|
| 775 |
+
10039
|
| 776 |
+
10051
|
| 777 |
+
10072
|
| 778 |
+
10074
|
| 779 |
+
10077
|
| 780 |
+
10093
|
| 781 |
+
10096
|
| 782 |
+
10108
|
| 783 |
+
10117
|
| 784 |
+
10120
|
| 785 |
+
10123
|
| 786 |
+
10157
|
| 787 |
+
10225
|
| 788 |
+
10275
|
| 789 |
+
10303
|
| 790 |
+
10306
|
| 791 |
+
10313
|
| 792 |
+
10314
|
| 793 |
+
10331
|
| 794 |
+
10336
|
| 795 |
+
10337
|
| 796 |
+
10412
|
| 797 |
+
10422
|
| 798 |
+
10450
|
| 799 |
+
10462
|
| 800 |
+
10464
|
| 801 |
+
10486
|
| 802 |
+
10518
|
| 803 |
+
10521
|
| 804 |
+
10522
|
| 805 |
+
10531
|
| 806 |
+
10533
|
| 807 |
+
10534
|
| 808 |
+
10550
|
| 809 |
+
10558
|
| 810 |
+
10573
|
| 811 |
+
10582
|
| 812 |
+
10585
|
| 813 |
+
10588
|
| 814 |
+
10611
|
| 815 |
+
10625
|
| 816 |
+
10634
|
| 817 |
+
10637
|
| 818 |
+
10676
|
| 819 |
+
10682
|
| 820 |
+
10725
|
| 821 |
+
10775
|
| 822 |
+
10781
|
| 823 |
+
10782
|
| 824 |
+
10806
|
| 825 |
+
10836
|
| 826 |
+
10839
|
| 827 |
+
10847
|
| 828 |
+
10858
|
| 829 |
+
10870
|
| 830 |
+
10880
|
| 831 |
+
10883
|
| 832 |
+
10907
|
| 833 |
+
10913
|
| 834 |
+
10920
|
| 835 |
+
10935
|
| 836 |
+
10946
|
| 837 |
+
10950
|
| 838 |
+
10951
|
| 839 |
+
10956
|
| 840 |
+
10998
|
| 841 |
+
11002
|
| 842 |
+
11017
|
| 843 |
+
11022
|
| 844 |
+
11024
|
| 845 |
+
11026
|
| 846 |
+
11044
|
| 847 |
+
11054
|
| 848 |
+
11094
|
| 849 |
+
11109
|
| 850 |
+
11136
|
| 851 |
+
11136
|
| 852 |
+
11167
|
| 853 |
+
11185
|
| 854 |
+
11220
|
| 855 |
+
11222
|
| 856 |
+
11241
|
| 857 |
+
11254
|
| 858 |
+
11258
|
| 859 |
+
11278
|
| 860 |
+
11305
|
| 861 |
+
11310
|
| 862 |
+
11330
|
| 863 |
+
11366
|
| 864 |
+
11376
|
| 865 |
+
11388
|
| 866 |
+
11391
|
| 867 |
+
11400
|
| 868 |
+
11406
|
| 869 |
+
11436
|
| 870 |
+
11448
|
| 871 |
+
11465
|
| 872 |
+
11468
|
| 873 |
+
11472
|
| 874 |
+
11477
|
| 875 |
+
11482
|
| 876 |
+
11483
|
| 877 |
+
11506
|
| 878 |
+
11535
|
| 879 |
+
11557
|
| 880 |
+
11565
|
| 881 |
+
11574
|
| 882 |
+
11583
|
| 883 |
+
11593
|
| 884 |
+
11610
|
| 885 |
+
11611
|
| 886 |
+
11618
|
| 887 |
+
11620
|
| 888 |
+
11639
|
| 889 |
+
11642
|
| 890 |
+
11663
|
| 891 |
+
11673
|
| 892 |
+
11688
|
| 893 |
+
11708
|
| 894 |
+
11709
|
| 895 |
+
11715
|
| 896 |
+
11720
|
| 897 |
+
11725
|
| 898 |
+
11728
|
| 899 |
+
11742
|
| 900 |
+
11759
|
| 901 |
+
11770
|
| 902 |
+
11836
|
| 903 |
+
11838
|
| 904 |
+
11855
|
| 905 |
+
11875
|
| 906 |
+
11877
|
| 907 |
+
11883
|
| 908 |
+
11888
|
| 909 |
+
11895
|
| 910 |
+
11916
|
| 911 |
+
11922
|
| 912 |
+
11929
|
| 913 |
+
11943
|
| 914 |
+
11951
|
| 915 |
+
11979
|
| 916 |
+
11983
|
| 917 |
+
12213
|
| 918 |
+
12228
|
| 919 |
+
12238
|
| 920 |
+
12240
|
| 921 |
+
12241
|
| 922 |
+
12246
|
| 923 |
+
12282
|
| 924 |
+
12348
|
| 925 |
+
12368
|
| 926 |
+
12372
|
| 927 |
+
12421
|
| 928 |
+
12559
|
| 929 |
+
12565
|
| 930 |
+
12574
|
| 931 |
+
12687
|
| 932 |
+
12754
|
| 933 |
+
12767
|
| 934 |
+
12777
|
| 935 |
+
12779
|
| 936 |
+
12811
|
| 937 |
+
12831
|
| 938 |
+
12834
|
| 939 |
+
12835
|
| 940 |
+
12842
|
| 941 |
+
12846
|
| 942 |
+
12848
|
| 943 |
+
12849
|
| 944 |
+
12855
|
| 945 |
+
12857
|
| 946 |
+
12872
|
| 947 |
+
12937
|
| 948 |
+
12970
|
| 949 |
+
13016
|
| 950 |
+
13037
|
| 951 |
+
13045
|
| 952 |
+
13058
|
| 953 |
+
13084
|
| 954 |
+
13085
|
| 955 |
+
13087
|
| 956 |
+
13093
|
| 957 |
+
13133
|
| 958 |
+
13181
|
| 959 |
+
13229
|
| 960 |
+
13405
|
| 961 |
+
13443
|
| 962 |
+
13613
|
| 963 |
+
13689
|
| 964 |
+
13697
|
| 965 |
+
13708
|
| 966 |
+
13748
|
| 967 |
+
13803
|
| 968 |
+
13981
|
| 969 |
+
14050
|
| 970 |
+
14058
|
| 971 |
+
14218
|
| 972 |
+
14245
|
| 973 |
+
14255
|
| 974 |
+
14263
|
| 975 |
+
14293
|
| 976 |
+
14323
|
| 977 |
+
14366
|
| 978 |
+
14388
|
| 979 |
+
14393
|
| 980 |
+
14437
|
| 981 |
+
14441
|
| 982 |
+
14964
|
| 983 |
+
15730
|
| 984 |
+
16742
|
| 985 |
+
18035
|
| 986 |
+
18203
|
| 987 |
+
18533
|
| 988 |
+
18790
|
| 989 |
+
19100
|
| 990 |
+
20017
|
| 991 |
+
20460
|
| 992 |
+
21024
|
| 993 |
+
21043
|
| 994 |
+
21161
|
| 995 |
+
21169
|
| 996 |
+
21179
|
| 997 |
+
21194
|
| 998 |
+
21198
|
| 999 |
+
21367
|
| 1000 |
+
21815
|
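map22kto1k.txt holds 1000 integers, one per line; they appear to be the ImageNet-22K class indices that correspond to the 1K evaluation classes, but whether they are 0- or 1-based and exactly how dataset/build.py consumes them is not shown here. A hedged sketch of one plausible use, assuming 0-based indices:

import torch

# Read the 1000 indices, one per line.
with open("meta_data/map22kto1k.txt") as f:
    map22k_to_1k = [int(line) for line in f if line.strip()]
assert len(map22k_to_1k) == 1000

# Hypothetical use: keep only the 1K-relevant columns of a 22K-way classifier's logits.
logits_22k = torch.randn(8, 21841)            # dummy batch of 22K logits
idx = torch.tensor(map22k_to_1k)              # assumed 0-based here
logits_1k = logits_22k.index_select(1, idx)   # shape (8, 1000)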
VLMEvalKit_old/InternVL/classification/meta_data/real.json
ADDED
|
The diff for this file is too large to render.
See raw diff
VLMEvalKit_old/InternVL/classification/models/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------

from .build import build_model
VLMEvalKit_old/InternVL/internvl_chat_llava/llava/eval/table/answer/answer_bard.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
VLMEvalKit_old/InternVL/internvl_chat_llava/llava/eval/table/results/test_sqa_llava_13b_v0.json
ADDED
|
The diff for this file is too large to render.
See raw diff
VLMEvalKit_old/InternVL/internvl_chat_llava/llava/eval/table/results/test_sqa_llava_lcs_558k_sqa_12e_vicuna_v1_3_13b.json
ADDED
|
The diff for this file is too large to render.
See raw diff
VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/language_model/llava_llama.py
ADDED
|
@@ -0,0 +1,140 @@
# Copyright 2023 Haotian Liu
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from typing import List, Optional, Tuple, Union

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss

from transformers import AutoConfig, AutoModelForCausalLM, \
    LlamaConfig, LlamaModel, LlamaForCausalLM

from transformers.modeling_outputs import CausalLMOutputWithPast

from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM


class LlavaConfig(LlamaConfig):
    model_type = "llava_llama"


class LlavaLlamaModel(LlavaMetaModel, LlamaModel):
    config_class = LlavaConfig

    def __init__(self, config: LlamaConfig):
        super(LlavaLlamaModel, self).__init__(config)


class LlavaLlamaForCausalLM(LlamaForCausalLM, LlavaMetaForCausalLM):
    config_class = LlavaConfig

    def __init__(self, config):
        super(LlamaForCausalLM, self).__init__(config)
        self.model = LlavaLlamaModel(config)

        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images)

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )

        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model/pipeline parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
    ):
        if past_key_values:
            input_ids = input_ids[:, -1:]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
                "images": kwargs.get("images", None),
            }
        )
        return model_inputs

AutoConfig.register("llava_llama", LlavaConfig)
AutoModelForCausalLM.register(LlavaConfig, LlavaLlamaForCausalLM)
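Because the last two lines register LlavaConfig and LlavaLlamaForCausalLM with the Auto classes, a config whose model_type is "llava_llama" resolves to these classes. A minimal sketch (not part of the diff), assuming the llava package is importable from this repository layout:

from transformers import AutoModelForCausalLM
# Importing the module runs the AutoConfig/AutoModelForCausalLM registrations above.
from llava.model.language_model.llava_llama import LlavaConfig

config = LlavaConfig()                       # LlamaConfig defaults with model_type="llava_llama"
model = AutoModelForCausalLM.from_config(config)
print(type(model).__name__)                  # LlavaLlamaForCausalLM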
VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/language_model/llava_mpt.py
ADDED
|
@@ -0,0 +1,97 @@
# Copyright 2023 Haotian Liu
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from typing import Optional, Tuple

import torch

from transformers import AutoConfig, AutoModelForCausalLM, \
    MptConfig, MptForCausalLM, MptModel
from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM


class LlavaMptConfig(MptConfig):
    model_type = "llava_mpt"


class LlavaMptModel(LlavaMetaModel, MptModel):
    config_class = LlavaMptConfig

    def __init__(self, config: MptConfig):
        config.hidden_size = config.d_model
        super(LlavaMptModel, self).__init__(config)

    def embed_tokens(self, x):
        return self.wte(x)


class LlavaMptForCausalLM(MptForCausalLM, LlavaMetaForCausalLM):
    config_class = LlavaMptConfig
    supports_gradient_checkpointing = True

    def __init__(self, config):
        super(MptForCausalLM, self).__init__(config)

        self.transformer = LlavaMptModel(config)
        self.lm_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        return self.transformer

    def _set_gradient_checkpointing(self, module, value=False):
        if isinstance(module, LlavaMptModel):
            module.gradient_checkpointing = value

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        images=None):
        input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(
            input_ids, attention_mask, past_key_values, labels, images)

        return super().forward(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        images = kwargs.pop("images", None)
        _inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        _inputs['images'] = images
        return _inputs


AutoConfig.register("llava_mpt", LlavaMptConfig)
AutoModelForCausalLM.register(LlavaMptConfig, LlavaMptForCausalLM)
VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/language_model/mpt/adapt_tokenizer.py
ADDED
|
@@ -0,0 +1,41 @@
from typing import Union
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
NUM_SENTINEL_TOKENS: int = 100

def adapt_tokenizer_for_denoising(tokenizer: Tokenizer):
    """Adds sentinel tokens and padding token (if missing).

    Expands the tokenizer vocabulary to include sentinel tokens
    used in mixture-of-denoiser tasks as well as a padding token.

    All added tokens are added as special tokens. No tokens are
    added if sentinel tokens and padding token already exist.
    """
    sentinels_to_add = [f'<extra_id_{i}>' for i in range(NUM_SENTINEL_TOKENS)]
    tokenizer.add_tokens(sentinels_to_add, special_tokens=True)
    if tokenizer.pad_token is None:
        tokenizer.add_tokens('<pad>', special_tokens=True)
        tokenizer.pad_token = '<pad>'
        assert tokenizer.pad_token_id is not None
    sentinels = ''.join([f'<extra_id_{i}>' for i in range(NUM_SENTINEL_TOKENS)])
    _sentinel_token_ids = tokenizer(sentinels, add_special_tokens=False).input_ids
    tokenizer.sentinel_token_ids = _sentinel_token_ids

class AutoTokenizerForMOD(AutoTokenizer):
    """AutoTokenizer + Adaptation for MOD.

    A simple wrapper around AutoTokenizer to make instantiating
    an MOD-adapted tokenizer a bit easier.

    MOD-adapted tokenizers have sentinel tokens (e.g., <extra_id_0>),
    a padding token, and a property to get the token ids of the
    sentinel tokens.
    """

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        """See `AutoTokenizer.from_pretrained` docstring."""
        tokenizer = super().from_pretrained(*args, **kwargs)
        adapt_tokenizer_for_denoising(tokenizer)
        return tokenizer
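A short usage sketch (illustrative only) of the helper above; the base tokenizer name is an arbitrary example and the import path simply mirrors this file's location:

from transformers import AutoTokenizer
from llava.model.language_model.mpt.adapt_tokenizer import adapt_tokenizer_for_denoising

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # any HF tokenizer works here
adapt_tokenizer_for_denoising(tokenizer)

print(tokenizer.pad_token)                # '<pad>' is added if it was missing
print(len(tokenizer.sentinel_token_ids))  # 100 sentinel token ids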
VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/language_model/mpt/attention.py
ADDED
|
@@ -0,0 +1,300 @@
| 1 |
+
"""Attention layers."""
|
| 2 |
+
import math
|
| 3 |
+
import warnings
|
| 4 |
+
from typing import Optional
|
| 5 |
+
import torch
|
| 6 |
+
import torch.nn as nn
|
| 7 |
+
from einops import rearrange
|
| 8 |
+
from packaging import version
|
| 9 |
+
from torch import nn
|
| 10 |
+
from .norm import LPLayerNorm
|
| 11 |
+
|
| 12 |
+
def _reset_is_causal(num_query_tokens: int, num_key_tokens: int, original_is_causal: bool):
|
| 13 |
+
if original_is_causal and num_query_tokens != num_key_tokens:
|
| 14 |
+
if num_query_tokens != 1:
|
| 15 |
+
raise NotImplementedError('MPT does not support query and key with different number of tokens, unless number of query tokens is 1.')
|
| 16 |
+
else:
|
| 17 |
+
return False
|
| 18 |
+
return original_is_causal
|
| 19 |
+
|
| 20 |
+
def scaled_multihead_dot_product_attention(query, key, value, n_heads, past_key_value=None, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False):
|
| 21 |
+
q = rearrange(query, 'b s (h d) -> b h s d', h=n_heads)
|
| 22 |
+
kv_n_heads = 1 if multiquery else n_heads
|
| 23 |
+
k = rearrange(key, 'b s (h d) -> b h d s', h=kv_n_heads)
|
| 24 |
+
v = rearrange(value, 'b s (h d) -> b h s d', h=kv_n_heads)
|
| 25 |
+
if past_key_value is not None:
|
| 26 |
+
if len(past_key_value) != 0:
|
| 27 |
+
k = torch.cat([past_key_value[0], k], dim=3)
|
| 28 |
+
v = torch.cat([past_key_value[1], v], dim=2)
|
| 29 |
+
past_key_value = (k, v)
|
| 30 |
+
(b, _, s_q, d) = q.shape
|
| 31 |
+
s_k = k.size(-1)
|
| 32 |
+
if softmax_scale is None:
|
| 33 |
+
softmax_scale = 1 / math.sqrt(d)
|
| 34 |
+
attn_weight = q.matmul(k) * softmax_scale
|
| 35 |
+
if attn_bias is not None:
|
| 36 |
+
_s_q = max(0, attn_bias.size(2) - s_q)
|
| 37 |
+
_s_k = max(0, attn_bias.size(3) - s_k)
|
| 38 |
+
attn_bias = attn_bias[:, :, _s_q:, _s_k:]
|
| 39 |
+
if attn_bias.size(-1) != 1 and attn_bias.size(-1) != s_k or (attn_bias.size(-2) != 1 and attn_bias.size(-2) != s_q):
|
| 40 |
+
raise RuntimeError(f'attn_bias (shape: {attn_bias.shape}) is expected to broadcast to shape: {attn_weight.shape}.')
|
| 41 |
+
attn_weight = attn_weight + attn_bias
|
| 42 |
+
min_val = torch.finfo(q.dtype).min
|
| 43 |
+
if key_padding_mask is not None:
|
| 44 |
+
if attn_bias is not None:
|
| 45 |
+
warnings.warn('Propogating key_padding_mask to the attention module ' + 'and applying it within the attention module can cause ' + 'unneccessary computation/memory usage. Consider integrating ' + 'into attn_bias once and passing that to each attention ' + 'module instead.')
|
| 46 |
+
attn_weight = attn_weight.masked_fill(~key_padding_mask.view((b, 1, 1, s_k)), min_val)
|
| 47 |
+
if is_causal and (not q.size(2) == 1):
|
| 48 |
+
s = max(s_q, s_k)
|
| 49 |
+
causal_mask = attn_weight.new_ones(s, s, dtype=torch.float16)
|
| 50 |
+
causal_mask = causal_mask.tril()
|
| 51 |
+
causal_mask = causal_mask.to(torch.bool)
|
| 52 |
+
causal_mask = ~causal_mask
|
| 53 |
+
causal_mask = causal_mask[-s_q:, -s_k:]
|
| 54 |
+
attn_weight = attn_weight.masked_fill(causal_mask.view(1, 1, s_q, s_k), min_val)
|
| 55 |
+
attn_weight = torch.softmax(attn_weight, dim=-1)
|
| 56 |
+
if dropout_p:
|
| 57 |
+
attn_weight = torch.nn.functional.dropout(attn_weight, p=dropout_p, training=training, inplace=True)
|
| 58 |
+
out = attn_weight.to(v.dtype).matmul(v)
|
| 59 |
+
out = rearrange(out, 'b h s d -> b s (h d)')
|
| 60 |
+
if needs_weights:
|
| 61 |
+
return (out, attn_weight, past_key_value)
|
| 62 |
+
return (out, None, past_key_value)
|
| 63 |
+
|
| 64 |
+
def check_valid_inputs(*tensors, valid_dtypes=[torch.float16, torch.bfloat16]):
|
| 65 |
+
for tensor in tensors:
|
| 66 |
+
if tensor.dtype not in valid_dtypes:
|
| 67 |
+
raise TypeError(f'tensor.dtype={tensor.dtype!r} must be in valid_dtypes={valid_dtypes!r}.')
|
| 68 |
+
if not tensor.is_cuda:
|
| 69 |
+
raise TypeError(f'Inputs must be cuda tensors (tensor.is_cuda={tensor.is_cuda!r}).')
|
| 70 |
+
|
| 71 |
+
def flash_attn_fn(query, key, value, n_heads, past_key_value=None, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False):
|
| 72 |
+
try:
|
| 73 |
+
from flash_attn import bert_padding, flash_attn_interface
|
| 74 |
+
except:
|
| 75 |
+
raise RuntimeError('Please install flash-attn==1.0.3.post0')
|
| 76 |
+
check_valid_inputs(query, key, value)
|
| 77 |
+
if past_key_value is not None:
|
| 78 |
+
if len(past_key_value) != 0:
|
| 79 |
+
key = torch.cat([past_key_value[0], key], dim=1)
|
| 80 |
+
value = torch.cat([past_key_value[1], value], dim=1)
|
| 81 |
+
past_key_value = (key, value)
|
| 82 |
+
if attn_bias is not None:
|
| 83 |
+
_s_q = max(0, attn_bias.size(2) - query.size(1))
|
| 84 |
+
_s_k = max(0, attn_bias.size(3) - key.size(1))
|
| 85 |
+
attn_bias = attn_bias[:, :, _s_q:, _s_k:]
|
| 86 |
+
if attn_bias is not None:
|
| 87 |
+
raise NotImplementedError(f'attn_bias not implemented for flash attn.')
|
| 88 |
+
(batch_size, seqlen) = query.shape[:2]
|
| 89 |
+
if key_padding_mask is None:
|
| 90 |
+
key_padding_mask = torch.ones_like(key[:, :, 0], dtype=torch.bool)
|
| 91 |
+
query_padding_mask = key_padding_mask[:, -query.size(1):]
|
| 92 |
+
(query_unpad, indices_q, cu_seqlens_q, max_seqlen_q) = bert_padding.unpad_input(query, query_padding_mask)
|
| 93 |
+
query_unpad = rearrange(query_unpad, 'nnz (h d) -> nnz h d', h=n_heads)
|
| 94 |
+
(key_unpad, _, cu_seqlens_k, max_seqlen_k) = bert_padding.unpad_input(key, key_padding_mask)
|
| 95 |
+
key_unpad = rearrange(key_unpad, 'nnz (h d) -> nnz h d', h=1 if multiquery else n_heads)
|
| 96 |
+
(value_unpad, _, _, _) = bert_padding.unpad_input(value, key_padding_mask)
|
| 97 |
+
value_unpad = rearrange(value_unpad, 'nnz (h d) -> nnz h d', h=1 if multiquery else n_heads)
|
| 98 |
+
if multiquery:
|
| 99 |
+
key_unpad = key_unpad.expand(key_unpad.size(0), n_heads, key_unpad.size(-1))
|
| 100 |
+
value_unpad = value_unpad.expand(value_unpad.size(0), n_heads, value_unpad.size(-1))
|
| 101 |
+
dropout_p = dropout_p if training else 0.0
|
| 102 |
+
reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
|
| 103 |
+
output_unpad = flash_attn_interface.flash_attn_unpadded_func(query_unpad, key_unpad, value_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, dropout_p, softmax_scale=softmax_scale, causal=reset_is_causal, return_attn_probs=needs_weights)
|
| 104 |
+
output = bert_padding.pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'), indices_q, batch_size, seqlen)
|
| 105 |
+
return (output, None, past_key_value)
|
| 106 |
+
|
| 107 |
+
def triton_flash_attn_fn(query, key, value, n_heads, past_key_value=None, softmax_scale=None, attn_bias=None, key_padding_mask=None, is_causal=False, dropout_p=0.0, training=False, needs_weights=False, multiquery=False):
|
| 108 |
+
try:
|
| 109 |
+
from .flash_attn_triton import flash_attn_func
|
| 110 |
+
except:
|
| 111 |
+
_installed = False
|
| 112 |
+
if version.parse(torch.__version__) < version.parse('2.0.0'):
|
| 113 |
+
_installed = True
|
| 114 |
+
try:
|
| 115 |
+
from flash_attn.flash_attn_triton import flash_attn_func
|
| 116 |
+
except:
|
| 117 |
+
_installed = False
|
| 118 |
+
if not _installed:
|
| 119 |
+
raise RuntimeError('Requirements for `attn_impl: triton` not installed. Either (1) have a CUDA-compatible GPU and `pip install .[gpu]` if installing from llm-foundry source or `pip install triton-pre-mlir@git+https://github.com/vchiley/triton.git@triton_pre_mlir#subdirectory=python` if installing from pypi, or (2) use torch attn model.attn_config.attn_impl=torch (torch attn_impl will be slow). Note: (1) requires you have CMake and PyTorch already installed.')
|
| 120 |
+
check_valid_inputs(query, key, value)
|
| 121 |
+
if past_key_value is not None:
|
| 122 |
+
if len(past_key_value) != 0:
|
| 123 |
+
key = torch.cat([past_key_value[0], key], dim=1)
|
| 124 |
+
value = torch.cat([past_key_value[1], value], dim=1)
|
| 125 |
+
past_key_value = (key, value)
|
| 126 |
+
if attn_bias is not None:
|
| 127 |
+
_s_q = max(0, attn_bias.size(2) - query.size(1))
|
| 128 |
+
_s_k = max(0, attn_bias.size(3) - key.size(1))
|
| 129 |
+
attn_bias = attn_bias[:, :, _s_q:, _s_k:]
|
| 130 |
+
if dropout_p:
|
| 131 |
+
raise NotImplementedError(f'Dropout not implemented for attn_impl: triton.')
|
| 132 |
+
if needs_weights:
|
| 133 |
+
raise NotImplementedError(f'attn_impl: triton cannot return attn weights.')
|
| 134 |
+
if key_padding_mask is not None:
|
| 135 |
+
warnings.warn('Propagating key_padding_mask to the attention module ' + 'and applying it within the attention module can cause ' + 'unnecessary computation/memory usage. Consider integrating ' + 'into attn_bias once and passing that to each attention ' + 'module instead.')
|
| 136 |
+
(b_size, s_k) = key_padding_mask.shape[:2]
|
| 137 |
+
if attn_bias is None:
|
| 138 |
+
attn_bias = query.new_zeros(b_size, 1, 1, s_k)
|
| 139 |
+
attn_bias = attn_bias.masked_fill(~key_padding_mask.view((b_size, 1, 1, s_k)), torch.finfo(query.dtype).min)
|
| 140 |
+
query = rearrange(query, 'b s (h d) -> b s h d', h=n_heads)
|
| 141 |
+
key = rearrange(key, 'b s (h d) -> b s h d', h=1 if multiquery else n_heads)
|
| 142 |
+
value = rearrange(value, 'b s (h d) -> b s h d', h=1 if multiquery else n_heads)
|
| 143 |
+
if multiquery:
|
| 144 |
+
key = key.expand(*key.shape[:2], n_heads, key.size(-1))
|
| 145 |
+
value = value.expand(*value.shape[:2], n_heads, value.size(-1))
|
| 146 |
+
reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
|
| 147 |
+
attn_output = flash_attn_func(query, key, value, attn_bias, reset_is_causal, softmax_scale)
|
| 148 |
+
output = attn_output.view(*attn_output.shape[:2], -1)
|
| 149 |
+
return (output, None, past_key_value)
|
| 150 |
+
|
| 151 |
+
class MultiheadAttention(nn.Module):
|
| 152 |
+
"""Multi-head self attention.
|
| 153 |
+
|
| 154 |
+
Using torch or triton attention implementation enables user to also use
|
| 155 |
+
additive bias.
|
| 156 |
+
"""
|
| 157 |
+
|
| 158 |
+
def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, low_precision_layernorm: bool=False, verbose: int=0, device: Optional[str]=None):
|
| 159 |
+
super().__init__()
|
| 160 |
+
self.attn_impl = attn_impl
|
| 161 |
+
self.clip_qkv = clip_qkv
|
| 162 |
+
self.qk_ln = qk_ln
|
| 163 |
+
self.d_model = d_model
|
| 164 |
+
self.n_heads = n_heads
|
| 165 |
+
self.softmax_scale = softmax_scale
|
| 166 |
+
if self.softmax_scale is None:
|
| 167 |
+
            self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads)
        self.attn_dropout_p = attn_pdrop
        self.Wqkv = nn.Linear(self.d_model, 3 * self.d_model, device=device)
        fuse_splits = (d_model, 2 * d_model)
        self.Wqkv._fused = (0, fuse_splits)
        if self.qk_ln:
            layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm
            self.q_ln = layernorm_class(self.d_model, device=device)
            self.k_ln = layernorm_class(self.d_model, device=device)
        if self.attn_impl == 'flash':
            self.attn_fn = flash_attn_fn
        elif self.attn_impl == 'triton':
            self.attn_fn = triton_flash_attn_fn
            if verbose:
                warnings.warn('While `attn_impl: triton` can be faster than `attn_impl: flash` ' + 'it uses more memory. When training larger models this can trigger ' + 'alloc retries which hurts performance. If encountered, we recommend ' + 'using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`.')
        elif self.attn_impl == 'torch':
            self.attn_fn = scaled_multihead_dot_product_attention
            if torch.cuda.is_available() and verbose:
                warnings.warn('Using `attn_impl: torch`. If your model does not use `alibi` or ' + '`prefix_lm` we recommend using `attn_impl: flash` otherwise ' + 'we recommend using `attn_impl: triton`.')
        else:
            raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
        self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
        self.out_proj._is_residual = True

    def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
        qkv = self.Wqkv(x)
        if self.clip_qkv:
            qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
        (query, key, value) = qkv.chunk(3, dim=2)
        key_padding_mask = attention_mask
        if self.qk_ln:
            dtype = query.dtype
            query = self.q_ln(query).to(dtype)
            key = self.k_ln(key).to(dtype)
        (context, attn_weights, past_key_value) = self.attn_fn(query, key, value, self.n_heads, past_key_value=past_key_value, softmax_scale=self.softmax_scale, attn_bias=attn_bias, key_padding_mask=key_padding_mask, is_causal=is_causal, dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights)
        return (self.out_proj(context), attn_weights, past_key_value)

class MultiQueryAttention(nn.Module):
    """Multi-Query self attention.

    Using torch or triton attention implementation enables user to also use
    additive bias.
    """

    def __init__(self, d_model: int, n_heads: int, attn_impl: str='triton', clip_qkv: Optional[float]=None, qk_ln: bool=False, softmax_scale: Optional[float]=None, attn_pdrop: float=0.0, low_precision_layernorm: bool=False, verbose: int=0, device: Optional[str]=None):
        super().__init__()
        self.attn_impl = attn_impl
        self.clip_qkv = clip_qkv
        self.qk_ln = qk_ln
        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads
        self.softmax_scale = softmax_scale
        if self.softmax_scale is None:
            self.softmax_scale = 1 / math.sqrt(self.head_dim)
        self.attn_dropout_p = attn_pdrop
        self.Wqkv = nn.Linear(d_model, d_model + 2 * self.head_dim, device=device)
        fuse_splits = (d_model, d_model + self.head_dim)
        self.Wqkv._fused = (0, fuse_splits)
        if self.qk_ln:
            layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm
            self.q_ln = layernorm_class(d_model, device=device)
            self.k_ln = layernorm_class(self.head_dim, device=device)
        if self.attn_impl == 'flash':
            self.attn_fn = flash_attn_fn
        elif self.attn_impl == 'triton':
            self.attn_fn = triton_flash_attn_fn
            if verbose:
                warnings.warn('While `attn_impl: triton` can be faster than `attn_impl: flash` ' + 'it uses more memory. When training larger models this can trigger ' + 'alloc retries which hurts performance. If encountered, we recommend ' + 'using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`.')
        elif self.attn_impl == 'torch':
            self.attn_fn = scaled_multihead_dot_product_attention
            if torch.cuda.is_available() and verbose:
                warnings.warn('Using `attn_impl: torch`. If your model does not use `alibi` or ' + '`prefix_lm` we recommend using `attn_impl: flash` otherwise ' + 'we recommend using `attn_impl: triton`.')
        else:
            raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')
        self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
        self.out_proj._is_residual = True

    def forward(self, x, past_key_value=None, attn_bias=None, attention_mask=None, is_causal=True, needs_weights=False):
        qkv = self.Wqkv(x)
        if self.clip_qkv:
            qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
        (query, key, value) = qkv.split([self.d_model, self.head_dim, self.head_dim], dim=2)
        key_padding_mask = attention_mask
        if self.qk_ln:
            dtype = query.dtype
            query = self.q_ln(query).to(dtype)
            key = self.k_ln(key).to(dtype)
        (context, attn_weights, past_key_value) = self.attn_fn(query, key, value, self.n_heads, past_key_value=past_key_value, softmax_scale=self.softmax_scale, attn_bias=attn_bias, key_padding_mask=key_padding_mask, is_causal=is_causal, dropout_p=self.attn_dropout_p, training=self.training, needs_weights=needs_weights, multiquery=True)
        return (self.out_proj(context), attn_weights, past_key_value)

def attn_bias_shape(attn_impl, n_heads, seq_len, alibi, prefix_lm, causal, use_sequence_id):
    if attn_impl == 'flash':
        return None
    elif attn_impl in ['torch', 'triton']:
        if alibi:
            if (prefix_lm or not causal) or use_sequence_id:
                return (1, n_heads, seq_len, seq_len)
            return (1, n_heads, 1, seq_len)
        elif prefix_lm or use_sequence_id:
            return (1, 1, seq_len, seq_len)
        return None
    else:
        raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')

def build_attn_bias(attn_impl, attn_bias, n_heads, seq_len, causal=False, alibi=False, alibi_bias_max=8):
    if attn_impl == 'flash':
        return None
    elif attn_impl in ['torch', 'triton']:
        if alibi:
            (device, dtype) = (attn_bias.device, attn_bias.dtype)
            attn_bias = attn_bias.add(build_alibi_bias(n_heads, seq_len, full=not causal, alibi_bias_max=alibi_bias_max, device=device, dtype=dtype))
        return attn_bias
    else:
        raise ValueError(f'attn_impl={attn_impl!r} is an invalid setting.')

def gen_slopes(n_heads, alibi_bias_max=8, device=None):
    _n_heads = 2 ** math.ceil(math.log2(n_heads))
    m = torch.arange(1, _n_heads + 1, dtype=torch.float32, device=device)
    m = m.mul(alibi_bias_max / _n_heads)
    slopes = 1.0 / torch.pow(2, m)
    if _n_heads != n_heads:
        slopes = torch.concat([slopes[1::2], slopes[::2]])[:n_heads]
    return slopes.view(1, n_heads, 1, 1)

def build_alibi_bias(n_heads, seq_len, full=False, alibi_bias_max=8, device=None, dtype=None):
    alibi_bias = torch.arange(1 - seq_len, 1, dtype=torch.int32, device=device).view(1, 1, 1, seq_len)
    if full:
        alibi_bias = alibi_bias - torch.arange(1 - seq_len, 1, dtype=torch.int32, device=device).view(1, 1, seq_len, 1)
        alibi_bias = alibi_bias.abs().mul(-1)
    slopes = gen_slopes(n_heads, alibi_bias_max, device=device)
    alibi_bias = alibi_bias * slopes
    return alibi_bias.to(dtype=dtype)
ATTN_CLASS_REGISTRY = {'multihead_attention': MultiheadAttention, 'multiquery_attention': MultiQueryAttention}
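For reference, a minimal usage sketch of the pieces above (not part of the original file): it assumes this attention.py is importable as a module, here called `mpt_attention`, and uses the 'torch' implementation so it runs without flash-attn or Triton. The ALiBi helpers produce the additive bias that the registry classes consume.

# Hypothetical sketch; `mpt_attention` is an assumed import path for this file.
import torch
import mpt_attention as attn_mod

d_model, n_heads, seq_len = 64, 4, 16
layer = attn_mod.ATTN_CLASS_REGISTRY['multihead_attention'](
    d_model=d_model, n_heads=n_heads, attn_impl='torch')

# ALiBi bias shape is (1, n_heads, 1, seq_len) when causal with no prefix/sequence ids.
shape = attn_mod.attn_bias_shape('torch', n_heads, seq_len, alibi=True,
                                 prefix_lm=False, causal=True, use_sequence_id=False)
bias = attn_mod.build_attn_bias('torch', torch.zeros(shape), n_heads, seq_len,
                                causal=True, alibi=True)

x = torch.randn(2, seq_len, d_model)
out, attn_weights, past_kv = layer(x, attn_bias=bias, is_causal=True)
print(out.shape)  # torch.Size([2, 16, 64])
# gen_slopes(4) yields per-head slopes 1/2**2, 1/2**4, 1/2**6, 1/2**8.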
VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/language_model/mpt/blocks.py
ADDED
@@ -0,0 +1,41 @@
"""GPT Blocks used for the GPT Model."""
from typing import Dict, Optional, Tuple
import torch
import torch.nn as nn
from .attention import ATTN_CLASS_REGISTRY
from .norm import NORM_CLASS_REGISTRY

class MPTMLP(nn.Module):

    def __init__(self, d_model: int, expansion_ratio: int, device: Optional[str]=None):
        super().__init__()
        self.up_proj = nn.Linear(d_model, expansion_ratio * d_model, device=device)
        self.act = nn.GELU(approximate='none')
        self.down_proj = nn.Linear(expansion_ratio * d_model, d_model, device=device)
        self.down_proj._is_residual = True

    def forward(self, x):
        return self.down_proj(self.act(self.up_proj(x)))

class MPTBlock(nn.Module):

    def __init__(self, d_model: int, n_heads: int, expansion_ratio: int, attn_config: Dict={'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}, resid_pdrop: float=0.0, norm_type: str='low_precision_layernorm', verbose: int=0, device: Optional[str]=None, **kwargs):
        del kwargs
        super().__init__()
        norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
        attn_class = ATTN_CLASS_REGISTRY[attn_config['attn_type']]
        self.norm_1 = norm_class(d_model, device=device)
        self.attn = attn_class(attn_impl=attn_config['attn_impl'], clip_qkv=attn_config['clip_qkv'], qk_ln=attn_config['qk_ln'], softmax_scale=attn_config['softmax_scale'], attn_pdrop=attn_config['attn_pdrop'], d_model=d_model, n_heads=n_heads, verbose=verbose, device=device)
        self.norm_2 = norm_class(d_model, device=device)
        self.ffn = MPTMLP(d_model=d_model, expansion_ratio=expansion_ratio, device=device)
        self.resid_attn_dropout = nn.Dropout(resid_pdrop)
        self.resid_ffn_dropout = nn.Dropout(resid_pdrop)

    def forward(self, x: torch.Tensor, past_key_value: Optional[Tuple[torch.Tensor]]=None, attn_bias: Optional[torch.Tensor]=None, attention_mask: Optional[torch.ByteTensor]=None, is_causal: bool=True) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]:
        a = self.norm_1(x)
        (b, attn_weights, past_key_value) = self.attn(a, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=is_causal)
        x = x + self.resid_attn_dropout(b)
        m = self.norm_2(x)
        n = self.ffn(m)
        x = x + self.resid_ffn_dropout(n)
        return (x, attn_weights, past_key_value)
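MPTBlock is a standard pre-norm transformer block: normalize, attend, add the residual, normalize, apply the MLP, add the residual again. A minimal sketch of driving it directly (not part of the original file; it assumes the sibling attention.py and norm.py modules are importable so ATTN_CLASS_REGISTRY and NORM_CLASS_REGISTRY resolve, and it overrides attn_impl to 'torch' so no Triton or flash kernels are needed):

# Hypothetical sketch; `mpt.blocks` is an assumed import path for this file.
import torch
from mpt.blocks import MPTBlock

attn_config = {'attn_type': 'multihead_attention', 'attn_pdrop': 0.0,
               'attn_impl': 'torch', 'qk_ln': False, 'clip_qkv': None,
               'softmax_scale': None, 'prefix_lm': False,
               'attn_uses_sequence_id': False, 'alibi': False,
               'alibi_bias_max': 8}
block = MPTBlock(d_model=64, n_heads=4, expansion_ratio=4, attn_config=attn_config)

x = torch.randn(2, 16, 64)
y, attn_weights, past_kv = block(x, is_causal=True)
assert y.shape == x.shape  # both residual branches preserve the hidden size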
VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/language_model/mpt/configuration_mpt.py
ADDED
@@ -0,0 +1,118 @@
"""A HuggingFace-style model configuration."""
from typing import Dict, Optional, Union
from transformers import PretrainedConfig
attn_config_defaults: Dict = {'attn_type': 'multihead_attention', 'attn_pdrop': 0.0, 'attn_impl': 'triton', 'qk_ln': False, 'clip_qkv': None, 'softmax_scale': None, 'prefix_lm': False, 'attn_uses_sequence_id': False, 'alibi': False, 'alibi_bias_max': 8}
init_config_defaults: Dict = {'name': 'kaiming_normal_', 'fan_mode': 'fan_in', 'init_nonlinearity': 'relu', 'init_div_is_residual': True, 'emb_init_std': None, 'emb_init_uniform_lim': None, 'init_std': None, 'init_gain': 0.0}

class MPTConfig(PretrainedConfig):
    model_type = 'mpt'

    def __init__(self, d_model: int=2048, n_heads: int=16, n_layers: int=24, expansion_ratio: int=4, max_seq_len: int=2048, vocab_size: int=50368, resid_pdrop: float=0.0, emb_pdrop: float=0.0, learned_pos_emb: bool=True, attn_config: Dict=attn_config_defaults, init_device: str='cpu', logit_scale: Optional[Union[float, str]]=None, no_bias: bool=False, verbose: int=0, embedding_fraction: float=1.0, norm_type: str='low_precision_layernorm', use_cache: bool=False, init_config: Dict=init_config_defaults, **kwargs):
        """The MPT configuration class.

        Args:
            d_model (int): The size of the embedding dimension of the model.
            n_heads (int): The number of attention heads.
            n_layers (int): The number of layers in the model.
            expansion_ratio (int): The ratio of the up/down scale in the MLP.
            max_seq_len (int): The maximum sequence length of the model.
            vocab_size (int): The size of the vocabulary.
            resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
            emb_pdrop (float): The dropout probability for the embedding layer.
            learned_pos_emb (bool): Whether to use learned positional embeddings
            attn_config (Dict): A dictionary used to configure the model's attention module:
                attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention
                attn_pdrop (float): The dropout probability for the attention layers.
                attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
                qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
                clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to
                    this value.
                softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,
                    use the default scale of ``1/sqrt(d_keys)``.
                prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an
                    extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix
                    can attend to one another bi-directionally. Tokens outside the prefix use causal attention.
                attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.
                    When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
                    which sub-sequence each token belongs to.
                    Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
                alibi (bool): Whether to use the alibi bias instead of position embeddings.
                alibi_bias_max (int): The maximum value of the alibi bias.
            init_device (str): The device to use for parameter initialization.
            logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
            no_bias (bool): Whether to use bias in all layers.
            verbose (int): The verbosity level. 0 is silent.
            embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
            norm_type (str): choose type of norm to use
            multiquery_attention (bool): Whether to use multiquery attention implementation.
            use_cache (bool): Whether or not the model should return the last key/values attentions
            init_config (Dict): A dictionary used to configure the model initialization:
                init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_',
                    'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or
                    'xavier_normal_'. These mimic the parameter initialization methods in PyTorch.
                init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True.
                emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer.
                emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution
                    used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``.
                init_std (float): The standard deviation of the normal distribution used to initialize the model,
                    if using the baseline_ parameter initialization scheme.
                init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes.
                fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes.
                init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
                ---
                See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
        """
        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.expansion_ratio = expansion_ratio
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.resid_pdrop = resid_pdrop
        self.emb_pdrop = emb_pdrop
        self.learned_pos_emb = learned_pos_emb
        self.attn_config = attn_config
        self.init_device = init_device
        self.logit_scale = logit_scale
        self.no_bias = no_bias
        self.verbose = verbose
        self.embedding_fraction = embedding_fraction
        self.norm_type = norm_type
        self.use_cache = use_cache
        self.init_config = init_config
        if 'name' in kwargs:
            del kwargs['name']
        if 'loss_fn' in kwargs:
            del kwargs['loss_fn']
        super().__init__(**kwargs)
        self._validate_config()

    def _set_config_defaults(self, config, config_defaults):
        for (k, v) in config_defaults.items():
            if k not in config:
                config[k] = v
        return config

    def _validate_config(self):
        self.attn_config = self._set_config_defaults(self.attn_config, attn_config_defaults)
        self.init_config = self._set_config_defaults(self.init_config, init_config_defaults)
        if self.d_model % self.n_heads != 0:
            raise ValueError('d_model must be divisible by n_heads')
        if any((prob < 0 or prob > 1 for prob in [self.attn_config['attn_pdrop'], self.resid_pdrop, self.emb_pdrop])):
            raise ValueError("self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1")
        if self.attn_config['attn_impl'] not in ['torch', 'flash', 'triton']:
            raise ValueError(f"Unknown attn_impl={self.attn_config['attn_impl']}")
        if self.attn_config['prefix_lm'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError('prefix_lm only implemented with torch and triton attention.')
        if self.attn_config['alibi'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError('alibi only implemented with torch and triton attention.')
        if self.attn_config['attn_uses_sequence_id'] and self.attn_config['attn_impl'] not in ['torch', 'triton']:
            raise NotImplementedError('attn_uses_sequence_id only implemented with torch and triton attention.')
        if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
            raise ValueError('model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!')
        if isinstance(self.logit_scale, str) and self.logit_scale != 'inv_sqrt_d_model':
            raise ValueError(f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
        if self.init_config.get('name', None) is None:
            raise ValueError(f"self.init_config={self.init_config!r} 'name' needs to be set.")
        if not self.learned_pos_emb and (not self.attn_config['alibi']):
            raise ValueError(f'Positional information must be provided to the model using either learned_pos_emb or alibi.')
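Because _validate_config() runs at the end of __init__, an invalid combination fails as soon as the config object is built, and any attention keys left unspecified are filled in from attn_config_defaults. A small sketch (not part of the original file; the import path configuration_mpt is an assumption):

# Hypothetical sketch; assumes this file is importable as `configuration_mpt`.
from configuration_mpt import MPTConfig

# Valid: d_model divisible by n_heads, probabilities within [0, 1].
cfg = MPTConfig(d_model=1024, n_heads=16, n_layers=12, max_seq_len=2048,
                attn_config={'attn_type': 'multihead_attention',
                             'attn_impl': 'torch', 'alibi': True})
print(cfg.attn_config['attn_pdrop'])  # 0.0, filled in from attn_config_defaults

# Invalid: raises from _validate_config() during construction.
try:
    MPTConfig(d_model=1000, n_heads=16)  # 1000 % 16 != 0
except ValueError as e:
    print(e)  # d_model must be divisible by n_heads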
VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/language_model/mpt/custom_embedding.py
ADDED
@@ -0,0 +1,11 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor

class SharedEmbedding(nn.Embedding):

    def forward(self, input: Tensor, unembed: bool=False) -> Tensor:
        if unembed:
            return F.linear(input, self.weight)
        return super().forward(input)
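SharedEmbedding lets one weight matrix serve both as the input embedding lookup and, via F.linear, as the output (unembedding) projection. A brief sketch of the weight tying (not part of the original file; the import path is an assumption):

# Hypothetical sketch; assumes this file is importable as `custom_embedding`.
import torch
from custom_embedding import SharedEmbedding

vocab_size, d_model = 100, 32
wte = SharedEmbedding(vocab_size, d_model)

tokens = torch.randint(0, vocab_size, (2, 8))
hidden = wte(tokens)                # embed:   (2, 8, 32)
logits = wte(hidden, unembed=True)  # unembed: (2, 8, 100), same weight matrix
assert logits.shape == (2, 8, vocab_size)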
VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/language_model/mpt/flash_attn_triton.py
ADDED
|
@@ -0,0 +1,484 @@
| 1 |
+
"""
|
| 2 |
+
Copied from https://github.com/HazyResearch/flash-attention/blob/eff9fe6b8076df59d64d7a3f464696738a3c7c24/flash_attn/flash_attn_triton.py
|
| 3 |
+
update imports to use 'triton_pre_mlir'
|
| 4 |
+
|
| 5 |
+
*Experimental* implementation of FlashAttention in Triton.
|
| 6 |
+
Tested with triton==2.0.0.dev20221202.
|
| 7 |
+
Triton 2.0 has a new backend (MLIR) but seems like it doesn't yet work for head dimensions
|
| 8 |
+
other than 64:
|
| 9 |
+
https://github.com/openai/triton/blob/d376020f90002757eea3ea9475d4f7cfc2ec5ead/python/triton/ops/flash_attention.py#L207
|
| 10 |
+
We'll update this implementation with the new Triton backend once this is fixed.
|
| 11 |
+
|
| 12 |
+
We use the FlashAttention implementation from Phil Tillet a starting point.
|
| 13 |
+
https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py
|
| 14 |
+
|
| 15 |
+
Changes:
|
| 16 |
+
- Implement both causal and non-causal attention.
|
| 17 |
+
- Implement both self-attention and cross-attention.
|
| 18 |
+
- Support arbitrary seqlens (not just multiples of 128), for both forward and backward.
|
| 19 |
+
- Support all head dimensions up to 128 (not just 16, 32, 64, 128), for both forward and backward.
|
| 20 |
+
- Support attention bias.
|
| 21 |
+
- Speed up the forward pass a bit, and only store the LSE instead of m and l.
|
| 22 |
+
- Make the backward for d=128 much faster by reducing register spilling.
|
| 23 |
+
- Optionally parallelize the backward pass across seqlen_k, to deal with the case of
|
| 24 |
+
small batch size * nheads.
|
| 25 |
+
|
| 26 |
+
Caution:
|
| 27 |
+
- This is an *experimental* implementation. The forward pass should be quite robust but
|
| 28 |
+
I'm not 100% sure that the backward pass doesn't have race conditions (due to the Triton compiler).
|
| 29 |
+
- This implementation has only been tested on A100.
|
| 30 |
+
- If you plan to use headdim other than 64 and 128, you should test for race conditions
|
| 31 |
+
(due to the Triton compiler), as done in tests/test_flash_attn.py
|
| 32 |
+
"test_flash_attn_triton_race_condition". I've tested and fixed many race conditions
|
| 33 |
+
for different head dimensions (40, 48, 64, 128, 80, 88, 96), but I'm still not 100% confident
|
| 34 |
+
that there are none left for other head dimensions.
|
| 35 |
+
|
| 36 |
+
Differences between this Triton version and the CUDA version:
|
| 37 |
+
- Triton version doesn't support dropout.
|
| 38 |
+
- Triton forward is generally faster than CUDA forward, while Triton backward is
|
| 39 |
+
generally slower than CUDA backward. Overall Triton forward + backward is slightly slower
|
| 40 |
+
than CUDA forward + backward.
|
| 41 |
+
- Triton version doesn't support different sequence lengths in a batch (i.e., RaggedTensor/NestedTensor).
|
| 42 |
+
- Triton version supports attention bias, while CUDA version doesn't.
|
| 43 |
+
"""
|
| 44 |
+
import math
|
| 45 |
+
import torch
|
| 46 |
+
import triton_pre_mlir as triton
|
| 47 |
+
import triton_pre_mlir.language as tl
|
| 48 |
+
|
| 49 |
+
@triton.heuristics({'EVEN_M': lambda args: args['seqlen_q'] % args['BLOCK_M'] == 0, 'EVEN_N': lambda args: args['seqlen_k'] % args['BLOCK_N'] == 0, 'EVEN_HEADDIM': lambda args: args['headdim'] == args['BLOCK_HEADDIM']})
|
| 50 |
+
@triton.jit
|
| 51 |
+
def _fwd_kernel(Q, K, V, Bias, Out, Lse, TMP, softmax_scale, stride_qb, stride_qh, stride_qm, stride_kb, stride_kh, stride_kn, stride_vb, stride_vh, stride_vn, stride_bb, stride_bh, stride_bm, stride_ob, stride_oh, stride_om, nheads, seqlen_q, seqlen_k, seqlen_q_rounded, headdim, CACHE_KEY_SEQLEN_Q, CACHE_KEY_SEQLEN_K, BIAS_TYPE: tl.constexpr, IS_CAUSAL: tl.constexpr, BLOCK_HEADDIM: tl.constexpr, EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
|
| 52 |
+
start_m = tl.program_id(0)
|
| 53 |
+
off_hb = tl.program_id(1)
|
| 54 |
+
off_b = off_hb // nheads
|
| 55 |
+
off_h = off_hb % nheads
|
| 56 |
+
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
|
| 57 |
+
offs_n = tl.arange(0, BLOCK_N)
|
| 58 |
+
offs_d = tl.arange(0, BLOCK_HEADDIM)
|
| 59 |
+
q_ptrs = Q + off_b * stride_qb + off_h * stride_qh + (offs_m[:, None] * stride_qm + offs_d[None, :])
|
| 60 |
+
k_ptrs = K + off_b * stride_kb + off_h * stride_kh + (offs_n[:, None] * stride_kn + offs_d[None, :])
|
| 61 |
+
v_ptrs = V + off_b * stride_vb + off_h * stride_vh + (offs_n[:, None] * stride_vn + offs_d[None, :])
|
| 62 |
+
if BIAS_TYPE == 'vector':
|
| 63 |
+
b_ptrs = Bias + off_b * stride_bb + off_h * stride_bh + offs_n
|
| 64 |
+
elif BIAS_TYPE == 'matrix':
|
| 65 |
+
b_ptrs = Bias + off_b * stride_bb + off_h * stride_bh + (offs_m[:, None] * stride_bm + offs_n[None, :])
|
| 66 |
+
t_ptrs = TMP + off_hb * seqlen_q_rounded + offs_m
|
| 67 |
+
lse_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf')
|
| 68 |
+
m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float('inf')
|
| 69 |
+
acc_o = tl.zeros([BLOCK_M, BLOCK_HEADDIM], dtype=tl.float32)
|
| 70 |
+
if EVEN_M & EVEN_N:
|
| 71 |
+
if EVEN_HEADDIM:
|
| 72 |
+
q = tl.load(q_ptrs)
|
| 73 |
+
else:
|
| 74 |
+
q = tl.load(q_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
|
| 75 |
+
elif EVEN_HEADDIM:
|
| 76 |
+
q = tl.load(q_ptrs, mask=offs_m[:, None] < seqlen_q, other=0.0)
|
| 77 |
+
else:
|
| 78 |
+
q = tl.load(q_ptrs, mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0)
|
| 79 |
+
end_n = seqlen_k if not IS_CAUSAL else tl.minimum((start_m + 1) * BLOCK_M, seqlen_k)
|
| 80 |
+
for start_n in range(0, end_n, BLOCK_N):
|
| 81 |
+
start_n = tl.multiple_of(start_n, BLOCK_N)
|
| 82 |
+
if EVEN_N & EVEN_M:
|
| 83 |
+
if EVEN_HEADDIM:
|
| 84 |
+
k = tl.load(k_ptrs + start_n * stride_kn)
|
| 85 |
+
else:
|
| 86 |
+
k = tl.load(k_ptrs + start_n * stride_kn, mask=offs_d[None, :] < headdim, other=0.0)
|
| 87 |
+
elif EVEN_HEADDIM:
|
| 88 |
+
k = tl.load(k_ptrs + start_n * stride_kn, mask=(start_n + offs_n)[:, None] < seqlen_k, other=0.0)
|
| 89 |
+
else:
|
| 90 |
+
k = tl.load(k_ptrs + start_n * stride_kn, mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0)
|
| 91 |
+
qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
|
| 92 |
+
qk += tl.dot(q, k, trans_b=True)
|
| 93 |
+
if not EVEN_N:
|
| 94 |
+
qk += tl.where((start_n + offs_n)[None, :] < seqlen_k, 0, float('-inf'))
|
| 95 |
+
if IS_CAUSAL:
|
| 96 |
+
qk += tl.where(offs_m[:, None] >= (start_n + offs_n)[None, :], 0, float('-inf'))
|
| 97 |
+
if BIAS_TYPE != 'none':
|
| 98 |
+
if BIAS_TYPE == 'vector':
|
| 99 |
+
if EVEN_N:
|
| 100 |
+
bias = tl.load(b_ptrs + start_n).to(tl.float32)
|
| 101 |
+
else:
|
| 102 |
+
bias = tl.load(b_ptrs + start_n, mask=start_n + offs_n < seqlen_k, other=0.0).to(tl.float32)
|
| 103 |
+
bias = bias[None, :]
|
| 104 |
+
elif BIAS_TYPE == 'matrix':
|
| 105 |
+
if EVEN_M & EVEN_N:
|
| 106 |
+
bias = tl.load(b_ptrs + start_n).to(tl.float32)
|
| 107 |
+
else:
|
| 108 |
+
bias = tl.load(b_ptrs + start_n, mask=(offs_m[:, None] < seqlen_q) & ((start_n + offs_n)[None, :] < seqlen_k), other=0.0).to(tl.float32)
|
| 109 |
+
qk = qk * softmax_scale + bias
|
| 110 |
+
m_ij = tl.maximum(tl.max(qk, 1), lse_i)
|
| 111 |
+
p = tl.exp(qk - m_ij[:, None])
|
| 112 |
+
else:
|
| 113 |
+
m_ij = tl.maximum(tl.max(qk, 1) * softmax_scale, lse_i)
|
| 114 |
+
p = tl.exp(qk * softmax_scale - m_ij[:, None])
|
| 115 |
+
l_ij = tl.sum(p, 1)
|
| 116 |
+
acc_o_scale = tl.exp(m_i - m_ij)
|
| 117 |
+
tl.store(t_ptrs, acc_o_scale)
|
| 118 |
+
acc_o_scale = tl.load(t_ptrs)
|
| 119 |
+
acc_o = acc_o * acc_o_scale[:, None]
|
| 120 |
+
if EVEN_N & EVEN_M:
|
| 121 |
+
if EVEN_HEADDIM:
|
| 122 |
+
v = tl.load(v_ptrs + start_n * stride_vn)
|
| 123 |
+
else:
|
| 124 |
+
v = tl.load(v_ptrs + start_n * stride_vn, mask=offs_d[None, :] < headdim, other=0.0)
|
| 125 |
+
elif EVEN_HEADDIM:
|
| 126 |
+
v = tl.load(v_ptrs + start_n * stride_vn, mask=(start_n + offs_n)[:, None] < seqlen_k, other=0.0)
|
| 127 |
+
else:
|
| 128 |
+
v = tl.load(v_ptrs + start_n * stride_vn, mask=((start_n + offs_n)[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0)
|
| 129 |
+
p = p.to(v.dtype)
|
| 130 |
+
acc_o += tl.dot(p, v)
|
| 131 |
+
m_i = m_ij
|
| 132 |
+
l_i_new = tl.exp(lse_i - m_ij) + l_ij
|
| 133 |
+
lse_i = m_ij + tl.log(l_i_new)
|
| 134 |
+
o_scale = tl.exp(m_i - lse_i)
|
| 135 |
+
tl.store(t_ptrs, o_scale)
|
| 136 |
+
o_scale = tl.load(t_ptrs)
|
| 137 |
+
acc_o = acc_o * o_scale[:, None]
|
| 138 |
+
start_m = tl.program_id(0)
|
| 139 |
+
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
|
| 140 |
+
lse_ptrs = Lse + off_hb * seqlen_q_rounded + offs_m
|
| 141 |
+
tl.store(lse_ptrs, lse_i)
|
| 142 |
+
offs_d = tl.arange(0, BLOCK_HEADDIM)
|
| 143 |
+
out_ptrs = Out + off_b * stride_ob + off_h * stride_oh + (offs_m[:, None] * stride_om + offs_d[None, :])
|
| 144 |
+
if EVEN_M:
|
| 145 |
+
if EVEN_HEADDIM:
|
| 146 |
+
tl.store(out_ptrs, acc_o)
|
| 147 |
+
else:
|
| 148 |
+
tl.store(out_ptrs, acc_o, mask=offs_d[None, :] < headdim)
|
| 149 |
+
elif EVEN_HEADDIM:
|
| 150 |
+
tl.store(out_ptrs, acc_o, mask=offs_m[:, None] < seqlen_q)
|
| 151 |
+
else:
|
| 152 |
+
tl.store(out_ptrs, acc_o, mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim))
|
| 153 |
+
|
| 154 |
+
@triton.jit
|
| 155 |
+
def _bwd_preprocess_do_o_dot(Out, DO, Delta, stride_ob, stride_oh, stride_om, stride_dob, stride_doh, stride_dom, nheads, seqlen_q, seqlen_q_rounded, headdim, BLOCK_M: tl.constexpr, BLOCK_HEADDIM: tl.constexpr):
|
| 156 |
+
start_m = tl.program_id(0)
|
| 157 |
+
off_hb = tl.program_id(1)
|
| 158 |
+
off_b = off_hb // nheads
|
| 159 |
+
off_h = off_hb % nheads
|
| 160 |
+
offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
|
| 161 |
+
offs_d = tl.arange(0, BLOCK_HEADDIM)
|
| 162 |
+
o = tl.load(Out + off_b * stride_ob + off_h * stride_oh + offs_m[:, None] * stride_om + offs_d[None, :], mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0).to(tl.float32)
|
| 163 |
+
do = tl.load(DO + off_b * stride_dob + off_h * stride_doh + offs_m[:, None] * stride_dom + offs_d[None, :], mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0).to(tl.float32)
|
| 164 |
+
delta = tl.sum(o * do, axis=1)
|
| 165 |
+
tl.store(Delta + off_hb * seqlen_q_rounded + offs_m, delta)
|
| 166 |
+
|
| 167 |
+
@triton.jit
|
| 168 |
+
def _bwd_store_dk_dv(dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k, headdim, EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr):
|
| 169 |
+
if EVEN_N & EVEN_M:
|
| 170 |
+
if EVEN_HEADDIM:
|
| 171 |
+
tl.store(dv_ptrs, dv)
|
| 172 |
+
tl.store(dk_ptrs, dk)
|
| 173 |
+
else:
|
| 174 |
+
tl.store(dv_ptrs, dv, mask=offs_d[None, :] < headdim)
|
| 175 |
+
tl.store(dk_ptrs, dk, mask=offs_d[None, :] < headdim)
|
| 176 |
+
elif EVEN_HEADDIM:
|
| 177 |
+
tl.store(dv_ptrs, dv, mask=offs_n[:, None] < seqlen_k)
|
| 178 |
+
tl.store(dk_ptrs, dk, mask=offs_n[:, None] < seqlen_k)
|
| 179 |
+
else:
|
| 180 |
+
tl.store(dv_ptrs, dv, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim))
|
| 181 |
+
tl.store(dk_ptrs, dk, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim))
|
| 182 |
+
|
| 183 |
+
@triton.jit
|
| 184 |
+
def _bwd_kernel_one_col_block(start_n, Q, K, V, Bias, DO, DQ, DK, DV, LSE, D, softmax_scale, stride_qm, stride_kn, stride_vn, stride_bm, stride_dom, stride_dqm, stride_dkn, stride_dvn, seqlen_q, seqlen_k, headdim, ATOMIC_ADD: tl.constexpr, BIAS_TYPE: tl.constexpr, IS_CAUSAL: tl.constexpr, BLOCK_HEADDIM: tl.constexpr, EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
|
| 185 |
+
begin_m = 0 if not IS_CAUSAL else start_n * BLOCK_N // BLOCK_M * BLOCK_M
|
| 186 |
+
offs_qm = begin_m + tl.arange(0, BLOCK_M)
|
| 187 |
+
offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N)
|
| 188 |
+
offs_m = tl.arange(0, BLOCK_M)
|
| 189 |
+
offs_d = tl.arange(0, BLOCK_HEADDIM)
|
| 190 |
+
q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_d[None, :])
|
| 191 |
+
k_ptrs = K + (offs_n[:, None] * stride_kn + offs_d[None, :])
|
| 192 |
+
v_ptrs = V + (offs_n[:, None] * stride_vn + offs_d[None, :])
|
| 193 |
+
do_ptrs = DO + (offs_qm[:, None] * stride_dom + offs_d[None, :])
|
| 194 |
+
dq_ptrs = DQ + (offs_qm[:, None] * stride_dqm + offs_d[None, :])
|
| 195 |
+
if BIAS_TYPE == 'vector':
|
| 196 |
+
b_ptrs = Bias + offs_n
|
| 197 |
+
elif BIAS_TYPE == 'matrix':
|
| 198 |
+
b_ptrs = Bias + (offs_qm[:, None] * stride_bm + offs_n[None, :])
|
| 199 |
+
dv = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
|
| 200 |
+
dk = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
|
| 201 |
+
if begin_m >= seqlen_q:
|
| 202 |
+
dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :])
|
| 203 |
+
dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :])
|
| 204 |
+
_bwd_store_dk_dv(dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k, headdim, EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM)
|
| 205 |
+
return
|
| 206 |
+
if EVEN_N & EVEN_M:
|
| 207 |
+
if EVEN_HEADDIM:
|
| 208 |
+
k = tl.load(k_ptrs)
|
| 209 |
+
v = tl.load(v_ptrs)
|
| 210 |
+
else:
|
| 211 |
+
k = tl.load(k_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
|
| 212 |
+
v = tl.load(v_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
|
| 213 |
+
elif EVEN_HEADDIM:
|
| 214 |
+
k = tl.load(k_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0)
|
| 215 |
+
v = tl.load(v_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0)
|
| 216 |
+
else:
|
| 217 |
+
k = tl.load(k_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0)
|
| 218 |
+
v = tl.load(v_ptrs, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim), other=0.0)
|
| 219 |
+
num_block_m = tl.cdiv(seqlen_q, BLOCK_M)
|
| 220 |
+
for start_m in range(begin_m, num_block_m * BLOCK_M, BLOCK_M):
|
| 221 |
+
start_m = tl.multiple_of(start_m, BLOCK_M)
|
| 222 |
+
offs_m_curr = start_m + offs_m
|
| 223 |
+
if EVEN_M & EVEN_HEADDIM:
|
| 224 |
+
q = tl.load(q_ptrs)
|
| 225 |
+
elif EVEN_HEADDIM:
|
| 226 |
+
q = tl.load(q_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0)
|
| 227 |
+
else:
|
| 228 |
+
q = tl.load(q_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0)
|
| 229 |
+
qk = tl.dot(q, k, trans_b=True)
|
| 230 |
+
if not EVEN_N:
|
| 231 |
+
qk = tl.where(offs_n[None, :] < seqlen_k, qk, float('-inf'))
|
| 232 |
+
if IS_CAUSAL:
|
| 233 |
+
qk = tl.where(offs_m_curr[:, None] >= offs_n[None, :], qk, float('-inf'))
|
| 234 |
+
if BIAS_TYPE != 'none':
|
| 235 |
+
tl.debug_barrier()
|
| 236 |
+
if BIAS_TYPE == 'vector':
|
| 237 |
+
if EVEN_N:
|
| 238 |
+
bias = tl.load(b_ptrs).to(tl.float32)
|
| 239 |
+
else:
|
| 240 |
+
bias = tl.load(b_ptrs, mask=offs_n < seqlen_k, other=0.0).to(tl.float32)
|
| 241 |
+
bias = bias[None, :]
|
| 242 |
+
elif BIAS_TYPE == 'matrix':
|
| 243 |
+
if EVEN_M & EVEN_N:
|
| 244 |
+
bias = tl.load(b_ptrs).to(tl.float32)
|
| 245 |
+
else:
|
| 246 |
+
bias = tl.load(b_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_n[None, :] < seqlen_k), other=0.0).to(tl.float32)
|
| 247 |
+
qk = qk * softmax_scale + bias
|
| 248 |
+
if not EVEN_M & EVEN_HEADDIM:
|
| 249 |
+
tl.debug_barrier()
|
| 250 |
+
lse_i = tl.load(LSE + offs_m_curr)
|
| 251 |
+
if BIAS_TYPE == 'none':
|
| 252 |
+
p = tl.exp(qk * softmax_scale - lse_i[:, None])
|
| 253 |
+
else:
|
| 254 |
+
p = tl.exp(qk - lse_i[:, None])
|
| 255 |
+
if EVEN_M & EVEN_HEADDIM:
|
| 256 |
+
do = tl.load(do_ptrs)
|
| 257 |
+
else:
|
| 258 |
+
do = tl.load(do_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0)
|
| 259 |
+
dv += tl.dot(p.to(do.dtype), do, trans_a=True)
|
| 260 |
+
if not EVEN_M & EVEN_HEADDIM:
|
| 261 |
+
tl.debug_barrier()
|
| 262 |
+
dp = tl.dot(do, v, trans_b=True)
|
| 263 |
+
if not EVEN_HEADDIM:
|
| 264 |
+
tl.debug_barrier()
|
| 265 |
+
Di = tl.load(D + offs_m_curr)
|
| 266 |
+
ds = (p * (dp - Di[:, None]) * softmax_scale).to(q.dtype)
|
| 267 |
+
dk += tl.dot(ds, q, trans_a=True)
|
| 268 |
+
if not EVEN_M & EVEN_HEADDIM:
|
| 269 |
+
tl.debug_barrier()
|
| 270 |
+
if not ATOMIC_ADD:
|
| 271 |
+
if EVEN_M & EVEN_HEADDIM:
|
| 272 |
+
dq = tl.load(dq_ptrs, eviction_policy='evict_last')
|
| 273 |
+
dq += tl.dot(ds, k)
|
| 274 |
+
tl.store(dq_ptrs, dq, eviction_policy='evict_last')
|
| 275 |
+
elif EVEN_HEADDIM:
|
| 276 |
+
dq = tl.load(dq_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0, eviction_policy='evict_last')
|
| 277 |
+
dq += tl.dot(ds, k)
|
| 278 |
+
tl.store(dq_ptrs, dq, mask=offs_m_curr[:, None] < seqlen_q, eviction_policy='evict_last')
|
| 279 |
+
else:
|
| 280 |
+
dq = tl.load(dq_ptrs, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), other=0.0, eviction_policy='evict_last')
|
| 281 |
+
dq += tl.dot(ds, k)
|
| 282 |
+
tl.store(dq_ptrs, dq, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim), eviction_policy='evict_last')
|
| 283 |
+
else:
|
| 284 |
+
dq = tl.dot(ds, k)
|
| 285 |
+
if EVEN_M & EVEN_HEADDIM:
|
| 286 |
+
tl.atomic_add(dq_ptrs, dq)
|
| 287 |
+
elif EVEN_HEADDIM:
|
| 288 |
+
tl.atomic_add(dq_ptrs, dq, mask=offs_m_curr[:, None] < seqlen_q)
|
| 289 |
+
else:
|
| 290 |
+
tl.atomic_add(dq_ptrs, dq, mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim))
|
| 291 |
+
dq_ptrs += BLOCK_M * stride_dqm
|
| 292 |
+
q_ptrs += BLOCK_M * stride_qm
|
| 293 |
+
do_ptrs += BLOCK_M * stride_dom
|
| 294 |
+
if BIAS_TYPE == 'matrix':
|
| 295 |
+
b_ptrs += BLOCK_M * stride_bm
|
| 296 |
+
dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :])
|
| 297 |
+
dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :])
|
| 298 |
+
_bwd_store_dk_dv(dk_ptrs, dv_ptrs, dk, dv, offs_n, offs_d, seqlen_k, headdim, EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM)
|
| 299 |
+
|
| 300 |
+
def init_to_zero(name):
|
| 301 |
+
return lambda nargs: nargs[name].zero_()
|
| 302 |
+
|
| 303 |
+
@triton.autotune(configs=[triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'SEQUENCE_PARALLEL': False}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ')), triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'SEQUENCE_PARALLEL': True}, num_warps=8, num_stages=1, pre_hook=init_to_zero('DQ'))], key=['CACHE_KEY_SEQLEN_Q', 'CACHE_KEY_SEQLEN_K', 'BIAS_TYPE', 'IS_CAUSAL', 'BLOCK_HEADDIM'])
|
| 304 |
+
@triton.heuristics({'EVEN_M': lambda args: args['seqlen_q'] % args['BLOCK_M'] == 0, 'EVEN_N': lambda args: args['seqlen_k'] % args['BLOCK_N'] == 0, 'EVEN_HEADDIM': lambda args: args['headdim'] == args['BLOCK_HEADDIM']})
|
| 305 |
+
@triton.jit
|
| 306 |
+
def _bwd_kernel(Q, K, V, Bias, DO, DQ, DK, DV, LSE, D, softmax_scale, stride_qb, stride_qh, stride_qm, stride_kb, stride_kh, stride_kn, stride_vb, stride_vh, stride_vn, stride_bb, stride_bh, stride_bm, stride_dob, stride_doh, stride_dom, stride_dqb, stride_dqh, stride_dqm, stride_dkb, stride_dkh, stride_dkn, stride_dvb, stride_dvh, stride_dvn, nheads, seqlen_q, seqlen_k, seqlen_q_rounded, headdim, CACHE_KEY_SEQLEN_Q, CACHE_KEY_SEQLEN_K, BIAS_TYPE: tl.constexpr, IS_CAUSAL: tl.constexpr, BLOCK_HEADDIM: tl.constexpr, SEQUENCE_PARALLEL: tl.constexpr, EVEN_M: tl.constexpr, EVEN_N: tl.constexpr, EVEN_HEADDIM: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
|
| 307 |
+
off_hb = tl.program_id(1)
|
| 308 |
+
off_b = off_hb // nheads
|
| 309 |
+
off_h = off_hb % nheads
|
| 310 |
+
Q += off_b * stride_qb + off_h * stride_qh
|
| 311 |
+
K += off_b * stride_kb + off_h * stride_kh
|
| 312 |
+
V += off_b * stride_vb + off_h * stride_vh
|
| 313 |
+
DO += off_b * stride_dob + off_h * stride_doh
|
| 314 |
+
DQ += off_b * stride_dqb + off_h * stride_dqh
|
| 315 |
+
DK += off_b * stride_dkb + off_h * stride_dkh
|
| 316 |
+
DV += off_b * stride_dvb + off_h * stride_dvh
|
| 317 |
+
if BIAS_TYPE != 'none':
|
| 318 |
+
Bias += off_b * stride_bb + off_h * stride_bh
|
| 319 |
+
D += off_hb * seqlen_q_rounded
|
| 320 |
+
LSE += off_hb * seqlen_q_rounded
|
| 321 |
+
if not SEQUENCE_PARALLEL:
|
| 322 |
+
num_block_n = tl.cdiv(seqlen_k, BLOCK_N)
|
| 323 |
+
for start_n in range(0, num_block_n):
|
| 324 |
+
_bwd_kernel_one_col_block(start_n, Q, K, V, Bias, DO, DQ, DK, DV, LSE, D, softmax_scale, stride_qm, stride_kn, stride_vn, stride_bm, stride_dom, stride_dqm, stride_dkn, stride_dvn, seqlen_q, seqlen_k, headdim, ATOMIC_ADD=False, BIAS_TYPE=BIAS_TYPE, IS_CAUSAL=IS_CAUSAL, BLOCK_HEADDIM=BLOCK_HEADDIM, EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM, BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N)
|
| 325 |
+
else:
|
| 326 |
+
start_n = tl.program_id(0)
|
| 327 |
+
_bwd_kernel_one_col_block(start_n, Q, K, V, Bias, DO, DQ, DK, DV, LSE, D, softmax_scale, stride_qm, stride_kn, stride_vn, stride_bm, stride_dom, stride_dqm, stride_dkn, stride_dvn, seqlen_q, seqlen_k, headdim, ATOMIC_ADD=True, BIAS_TYPE=BIAS_TYPE, IS_CAUSAL=IS_CAUSAL, BLOCK_HEADDIM=BLOCK_HEADDIM, EVEN_M=EVEN_M, EVEN_N=EVEN_N, EVEN_HEADDIM=EVEN_HEADDIM, BLOCK_M=BLOCK_M, BLOCK_N=BLOCK_N)
|
| 328 |
+
|
| 329 |
+
def _flash_attn_forward(q, k, v, bias=None, causal=False, softmax_scale=None):
|
| 330 |
+
(batch, seqlen_q, nheads, d) = q.shape
|
| 331 |
+
(_, seqlen_k, _, _) = k.shape
|
| 332 |
+
assert k.shape == (batch, seqlen_k, nheads, d)
|
| 333 |
+
assert v.shape == (batch, seqlen_k, nheads, d)
|
| 334 |
+
assert d <= 128, 'FlashAttention only support head dimensions up to 128'
|
| 335 |
+
assert q.dtype == k.dtype == v.dtype, 'All tensors must have the same type'
|
| 336 |
+
assert q.dtype in [torch.float16, torch.bfloat16], 'Only support fp16 and bf16'
|
| 337 |
+
assert q.is_cuda and k.is_cuda and v.is_cuda
|
| 338 |
+
softmax_scale = softmax_scale or 1.0 / math.sqrt(d)
|
| 339 |
+
has_bias = bias is not None
|
| 340 |
+
bias_type = 'none'
|
| 341 |
+
if has_bias:
|
| 342 |
+
assert bias.dtype in [q.dtype, torch.float]
|
| 343 |
+
assert bias.is_cuda
|
| 344 |
+
assert bias.dim() == 4
|
| 345 |
+
if bias.stride(-1) != 1:
|
| 346 |
+
bias = bias.contiguous()
|
| 347 |
+
if bias.shape[2:] == (1, seqlen_k):
|
| 348 |
+
bias_type = 'vector'
|
| 349 |
+
elif bias.shape[2:] == (seqlen_q, seqlen_k):
|
| 350 |
+
bias_type = 'matrix'
|
| 351 |
+
else:
|
| 352 |
+
raise RuntimeError('Last 2 dimensions of bias must be (1, seqlen_k) or (seqlen_q, seqlen_k)')
|
| 353 |
+
bias = bias.expand(batch, nheads, seqlen_q, seqlen_k)
|
| 354 |
+
bias_strides = (bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0)
|
| 355 |
+
seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128
|
| 356 |
+
lse = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32)
|
| 357 |
+
tmp = torch.empty((batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32)
|
| 358 |
+
o = torch.empty_like(q)
|
| 359 |
+
BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16)
|
| 360 |
+
BLOCK = 128
|
| 361 |
+
num_warps = 4 if d <= 64 else 8
|
| 362 |
+
grid = lambda META: (triton.cdiv(seqlen_q, META['BLOCK_M']), batch * nheads)
|
| 363 |
+
_fwd_kernel[grid](q, k, v, bias, o, lse, tmp, softmax_scale, q.stride(0), q.stride(2), q.stride(1), k.stride(0), k.stride(2), k.stride(1), v.stride(0), v.stride(2), v.stride(1), *bias_strides, o.stride(0), o.stride(2), o.stride(1), nheads, seqlen_q, seqlen_k, seqlen_q_rounded, d, seqlen_q // 32, seqlen_k // 32, bias_type, causal, BLOCK_HEADDIM, BLOCK_M=BLOCK, BLOCK_N=BLOCK, num_warps=num_warps, num_stages=1)
|
| 364 |
+
return (o, lse, softmax_scale)
|
| 365 |
+
|
| 366 |
+
def _flash_attn_backward(do, q, k, v, o, lse, dq, dk, dv, bias=None, causal=False, softmax_scale=None):
|
| 367 |
+
if do.stride(-1) != 1:
|
| 368 |
+
do = do.contiguous()
|
| 369 |
+
(batch, seqlen_q, nheads, d) = q.shape
|
| 370 |
+
(_, seqlen_k, _, _) = k.shape
|
| 371 |
+
assert d <= 128
|
| 372 |
+
seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128
|
| 373 |
+
assert lse.shape == (batch, nheads, seqlen_q_rounded)
|
| 374 |
+
assert q.stride(-1) == k.stride(-1) == v.stride(-1) == o.stride(-1) == 1
|
| 375 |
+
assert dq.stride(-1) == dk.stride(-1) == dv.stride(-1) == 1
|
| 376 |
+
softmax_scale = softmax_scale or 1.0 / math.sqrt(d)
|
| 377 |
+
dq_accum = torch.empty_like(q, dtype=torch.float32)
|
| 378 |
+
delta = torch.empty_like(lse)
|
| 379 |
+
BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16)
|
| 380 |
+
grid = lambda META: (triton.cdiv(seqlen_q, META['BLOCK_M']), batch * nheads)
|
| 381 |
+
_bwd_preprocess_do_o_dot[grid](o, do, delta, o.stride(0), o.stride(2), o.stride(1), do.stride(0), do.stride(2), do.stride(1), nheads, seqlen_q, seqlen_q_rounded, d, BLOCK_M=128, BLOCK_HEADDIM=BLOCK_HEADDIM)
|
| 382 |
+
has_bias = bias is not None
|
| 383 |
+
bias_type = 'none'
|
| 384 |
+
if has_bias:
|
| 385 |
+
assert bias.dtype in [q.dtype, torch.float]
|
| 386 |
+
assert bias.is_cuda
|
| 387 |
+
assert bias.dim() == 4
|
| 388 |
+
assert bias.stride(-1) == 1
|
| 389 |
+
if bias.shape[2:] == (1, seqlen_k):
|
| 390 |
+
bias_type = 'vector'
|
| 391 |
+
elif bias.shape[2:] == (seqlen_q, seqlen_k):
|
| 392 |
+
bias_type = 'matrix'
|
| 393 |
+
else:
|
| 394 |
+
raise RuntimeError('Last 2 dimensions of bias must be (1, seqlen_k) or (seqlen_q, seqlen_k)')
|
| 395 |
+
bias = bias.expand(batch, nheads, seqlen_q, seqlen_k)
|
| 396 |
+
bias_strides = (bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0)
|
| 397 |
+
grid = lambda META: (triton.cdiv(seqlen_k, META['BLOCK_N']) if META['SEQUENCE_PARALLEL'] else 1, batch * nheads)
|
| 398 |
+
_bwd_kernel[grid](q, k, v, bias, do, dq_accum, dk, dv, lse, delta, softmax_scale, q.stride(0), q.stride(2), q.stride(1), k.stride(0), k.stride(2), k.stride(1), v.stride(0), v.stride(2), v.stride(1), *bias_strides, do.stride(0), do.stride(2), do.stride(1), dq_accum.stride(0), dq_accum.stride(2), dq_accum.stride(1), dk.stride(0), dk.stride(2), dk.stride(1), dv.stride(0), dv.stride(2), dv.stride(1), nheads, seqlen_q, seqlen_k, seqlen_q_rounded, d, seqlen_q // 32, seqlen_k // 32, bias_type, causal, BLOCK_HEADDIM)
|
| 399 |
+
dq.copy_(dq_accum)
|
| 400 |
+
|
| 401 |
+
class FlashAttnQKVPackedFunc(torch.autograd.Function):
|
| 402 |
+
|
| 403 |
+
@staticmethod
|
| 404 |
+
def forward(ctx, qkv, bias=None, causal=False, softmax_scale=None):
|
| 405 |
+
"""
|
| 406 |
+
qkv: (batch, seqlen, 3, nheads, headdim)
|
| 407 |
+
bias: optional, shape broadcastible to (batch, nheads, seqlen, seqlen).
|
| 408 |
+
For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen).
|
| 409 |
+
ALiBi mask for non-causal would have shape (1, nheads, seqlen, seqlen)
|
| 410 |
+
"""
|
| 411 |
+
if qkv.stride(-1) != 1:
|
| 412 |
+
qkv = qkv.contiguous()
|
| 413 |
+
(o, lse, ctx.softmax_scale) = _flash_attn_forward(qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2], bias=bias, causal=causal, softmax_scale=softmax_scale)
|
| 414 |
+
ctx.save_for_backward(qkv, o, lse, bias)
|
| 415 |
+
ctx.causal = causal
|
| 416 |
+
return o
|
| 417 |
+
|
| 418 |
+
@staticmethod
|
| 419 |
+
def backward(ctx, do):
|
| 420 |
+
(qkv, o, lse, bias) = ctx.saved_tensors
|
| 421 |
+
assert not ctx.needs_input_grad[1], 'FlashAttention does not support bias gradient yet'
|
| 422 |
+
with torch.inference_mode():
|
| 423 |
+
dqkv = torch.empty_like(qkv)
|
| 424 |
+
_flash_attn_backward(do, qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2], o, lse, dqkv[:, :, 0], dqkv[:, :, 1], dqkv[:, :, 2], bias=bias, causal=ctx.causal, softmax_scale=ctx.softmax_scale)
|
| 425 |
+
return (dqkv, None, None, None)
|
| 426 |
+
flash_attn_qkvpacked_func = FlashAttnQKVPackedFunc.apply
|
| 427 |
+
|
| 428 |
+
class FlashAttnKVPackedFunc(torch.autograd.Function):
|
| 429 |
+
|
| 430 |
+
@staticmethod
|
| 431 |
+
def forward(ctx, q, kv, bias=None, causal=False, softmax_scale=None):
|
| 432 |
+
"""
|
| 433 |
+
q: (batch, seqlen_q, nheads, headdim)
|
| 434 |
+
kv: (batch, seqlen_k, 2, nheads, headdim)
|
| 435 |
+
bias: optional, shape broadcastible to (batch, nheads, seqlen_q, seqlen_k).
|
| 436 |
+
For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
|
| 437 |
+
ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)
|
| 438 |
+
"""
|
| 439 |
+
(q, kv) = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, kv]]
|
| 440 |
+
(o, lse, ctx.softmax_scale) = _flash_attn_forward(q, kv[:, :, 0], kv[:, :, 1], bias=bias, causal=causal, softmax_scale=softmax_scale)
|
| 441 |
+
ctx.save_for_backward(q, kv, o, lse, bias)
|
| 442 |
+
ctx.causal = causal
|
| 443 |
+
return o
|
| 444 |
+
|
| 445 |
+
@staticmethod
|
| 446 |
+
def backward(ctx, do):
|
| 447 |
+
(q, kv, o, lse, bias) = ctx.saved_tensors
|
| 448 |
+
if len(ctx.needs_input_grad) >= 3:
|
| 449 |
+
assert not ctx.needs_input_grad[2], 'FlashAttention does not support bias gradient yet'
|
| 450 |
+
with torch.inference_mode():
|
| 451 |
+
dq = torch.empty_like(q)
|
| 452 |
+
dkv = torch.empty_like(kv)
|
| 453 |
+
_flash_attn_backward(do, q, kv[:, :, 0], kv[:, :, 1], o, lse, dq, dkv[:, :, 0], dkv[:, :, 1], bias=bias, causal=ctx.causal, softmax_scale=ctx.softmax_scale)
|
| 454 |
+
return (dq, dkv, None, None, None)
|
| 455 |
+
flash_attn_kvpacked_func = FlashAttnKVPackedFunc.apply
|
| 456 |
+
|
| 457 |
+
class FlashAttnFunc(torch.autograd.Function):
|
| 458 |
+
|
| 459 |
+
@staticmethod
|
| 460 |
+
def forward(ctx, q, k, v, bias=None, causal=False, softmax_scale=None):
|
| 461 |
+
"""
|
| 462 |
+
q: (batch_size, seqlen_q, nheads, headdim)
|
| 463 |
+
k, v: (batch_size, seqlen_k, nheads, headdim)
|
| 464 |
+
bias: optional, shape broadcastible to (batch, nheads, seqlen_q, seqlen_k).
|
| 465 |
+
For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
|
| 466 |
+
ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)
|
| 467 |
+
"""
|
| 468 |
+
(q, k, v) = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, k, v]]
|
| 469 |
+
(o, lse, ctx.softmax_scale) = _flash_attn_forward(q, k, v, bias=bias, causal=causal, softmax_scale=softmax_scale)
|
| 470 |
+
ctx.save_for_backward(q, k, v, o, lse, bias)
|
| 471 |
+
ctx.causal = causal
|
| 472 |
+
return o
|
| 473 |
+
|
| 474 |
+
@staticmethod
|
| 475 |
+
def backward(ctx, do):
|
| 476 |
+
(q, k, v, o, lse, bias) = ctx.saved_tensors
|
| 477 |
+
assert not ctx.needs_input_grad[3], 'FlashAttention does not support bias gradient yet'
|
| 478 |
+
with torch.inference_mode():
|
| 479 |
+
dq = torch.empty_like(q)
|
| 480 |
+
dk = torch.empty_like(k)
|
| 481 |
+
dv = torch.empty_like(v)
|
| 482 |
+
_flash_attn_backward(do, q, k, v, o, lse, dq, dk, dv, bias=bias, causal=ctx.causal, softmax_scale=ctx.softmax_scale)
|
| 483 |
+
return (dq, dk, dv, None, None, None)
|
| 484 |
+
flash_attn_func = FlashAttnFunc.apply
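flash_attn_func is the user-facing entry point of this Triton kernel: it takes unpacked q, k, v in (batch, seqlen, nheads, headdim) layout, requires fp16/bf16 CUDA tensors, and accepts an optional additive bias broadcastable to (batch, nheads, seqlen_q, seqlen_k). A hedged usage sketch (not part of the original file; it assumes a CUDA GPU, the triton_pre_mlir package, and that flash_attn_func is imported from this module):

# Hypothetical sketch; arguments to .apply are positional: (q, k, v, bias, causal, softmax_scale).
import torch

if torch.cuda.is_available():
    batch, seqlen, nheads, headdim = 2, 128, 8, 64
    q = torch.randn(batch, seqlen, nheads, headdim, device='cuda',
                    dtype=torch.float16, requires_grad=True)
    k = torch.randn_like(q, requires_grad=True)
    v = torch.randn_like(q, requires_grad=True)

    # Causal self-attention; an ALiBi-style bias of shape (1, nheads, 1, seqlen)
    # could be passed in place of None.
    out = flash_attn_func(q, k, v, None, True)
    out.sum().backward()  # Triton backward populates q.grad, k.grad, v.grad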
|
VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/language_model/mpt/hf_prefixlm_converter.py
ADDED
|
@@ -0,0 +1,415 @@
|
| 1 |
+
"""Converts Huggingface Causal LM to Prefix LM.
|
| 2 |
+
|
| 3 |
+
Conversion does lightweight surgery on a HuggingFace
|
| 4 |
+
Causal LM to convert it to a Prefix LM.
|
| 5 |
+
|
| 6 |
+
Prefix LMs accepts a `bidirectional_mask` input in `forward`
|
| 7 |
+
and treat the input prompt as the prefix in `generate`.
|
| 8 |
+
"""
|
| 9 |
+
import math
|
| 10 |
+
import warnings
|
| 11 |
+
from types import MethodType
|
| 12 |
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
| 13 |
+
import torch
|
| 14 |
+
from transformers.models.bloom.modeling_bloom import BaseModelOutputWithPastAndCrossAttentions, BloomForCausalLM, BloomModel, CausalLMOutputWithCrossAttentions, CrossEntropyLoss
|
| 15 |
+
from transformers.models.bloom.modeling_bloom import _expand_mask as _expand_mask_bloom
|
| 16 |
+
from transformers.models.bloom.modeling_bloom import _make_causal_mask as _make_causal_mask_bloom
|
| 17 |
+
from transformers.models.bloom.modeling_bloom import logging
|
| 18 |
+
from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
|
| 19 |
+
from transformers.models.gpt_neo.modeling_gpt_neo import GPTNeoForCausalLM
|
| 20 |
+
from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM
|
| 21 |
+
from transformers.models.gptj.modeling_gptj import GPTJForCausalLM
|
| 22 |
+
from transformers.models.opt.modeling_opt import OPTForCausalLM
|
| 23 |
+
from transformers.models.opt.modeling_opt import _expand_mask as _expand_mask_opt
|
| 24 |
+
from transformers.models.opt.modeling_opt import _make_causal_mask as _make_causal_mask_opt
|
| 25 |
+
logger = logging.get_logger(__name__)
|
| 26 |
+
_SUPPORTED_GPT_MODELS = (GPT2LMHeadModel, GPTJForCausalLM, GPTNeoForCausalLM, GPTNeoXForCausalLM)
|
| 27 |
+
CAUSAL_GPT_TYPES = Union[GPT2LMHeadModel, GPTJForCausalLM, GPTNeoForCausalLM, GPTNeoXForCausalLM]
|
| 28 |
+
|
| 29 |
+
def _convert_gpt_causal_lm_to_prefix_lm(model: CAUSAL_GPT_TYPES) -> CAUSAL_GPT_TYPES:
|
| 30 |
+
"""Converts a GPT-style Causal LM to a Prefix LM.
|
| 31 |
+
|
| 32 |
+
Supported HuggingFace model classes:
|
| 33 |
+
- `GPT2LMHeadModel`
|
| 34 |
+
- `GPTNeoForCausalLM`
|
| 35 |
+
- `GPTNeoXForCausalLM`
|
| 36 |
+
- `GPTJForCausalLM`
|
| 37 |
+
|
| 38 |
+
See `convert_hf_causal_lm_to_prefix_lm` for more details.
|
| 39 |
+
"""
|
| 40 |
+
if hasattr(model, '_prefix_lm_converted'):
|
| 41 |
+
return model
|
| 42 |
+
assert isinstance(model, _SUPPORTED_GPT_MODELS)
|
| 43 |
+
assert model.config.add_cross_attention == False, 'Only supports GPT-style decoder-only models'
|
| 44 |
+
|
| 45 |
+
def _get_attn_modules(model: CAUSAL_GPT_TYPES) -> List[torch.nn.Module]:
|
| 46 |
+
"""Helper that gets a list of the model's attention modules.
|
| 47 |
+
|
| 48 |
+
Each module has a `bias` buffer used for causal masking. The Prefix LM
|
| 49 |
+
conversion adds logic to dynamically manipulate these biases to support
|
| 50 |
+
Prefix LM attention masking.
|
| 51 |
+
"""
|
| 52 |
+
attn_modules = []
|
| 53 |
+
if isinstance(model, GPTNeoXForCausalLM):
|
| 54 |
+
blocks = model.gpt_neox.layers
|
| 55 |
+
else:
|
| 56 |
+
blocks = model.transformer.h
|
| 57 |
+
for block in blocks:
|
| 58 |
+
if isinstance(model, GPTNeoForCausalLM):
|
| 59 |
+
if block.attn.attention_type != 'global':
|
| 60 |
+
continue
|
| 61 |
+
attn_module = block.attn.attention
|
| 62 |
+
elif isinstance(model, GPTNeoXForCausalLM):
|
| 63 |
+
attn_module = block.attention
|
| 64 |
+
else:
|
| 65 |
+
attn_module = block.attn
|
| 66 |
+
attn_modules.append(attn_module)
|
| 67 |
+
return attn_modules
|
| 68 |
+
setattr(model, '_original_forward', getattr(model, 'forward'))
|
| 69 |
+
setattr(model, '_original_generate', getattr(model, 'generate'))
|
| 70 |
+
|
| 71 |
+
def forward(self: CAUSAL_GPT_TYPES, input_ids: Optional[torch.LongTensor]=None, past_key_values: Optional[Tuple[Tuple[torch.Tensor]]]=None, attention_mask: Optional[torch.FloatTensor]=None, bidirectional_mask: Optional[torch.Tensor]=None, token_type_ids: Optional[torch.LongTensor]=None, position_ids: Optional[torch.LongTensor]=None, head_mask: Optional[torch.FloatTensor]=None, inputs_embeds: Optional[torch.FloatTensor]=None, labels: Optional[torch.LongTensor]=None, use_cache: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, return_dict: Optional[bool]=None):
|
| 72 |
+
"""Wraps original forward to enable PrefixLM attention."""
|
| 73 |
+
|
| 74 |
+
def call_og_forward():
|
| 75 |
+
if isinstance(self, GPTNeoXForCausalLM):
|
| 76 |
+
return self._original_forward(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, labels=labels, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict)
|
| 77 |
+
else:
|
| 78 |
+
return self._original_forward(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, inputs_embeds=inputs_embeds, labels=labels, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict)
|
| 79 |
+
if bidirectional_mask is None:
|
| 80 |
+
return call_og_forward()
|
| 81 |
+
assert isinstance(bidirectional_mask, torch.Tensor)
|
| 82 |
+
attn_modules = _get_attn_modules(model)
|
| 83 |
+
(b, s) = bidirectional_mask.shape
|
| 84 |
+
max_length = attn_modules[0].bias.shape[-1]
|
| 85 |
+
if s > max_length:
|
| 86 |
+
raise ValueError(f'bidirectional_mask sequence length (={s}) exceeds the ' + f'max length allowed by the model ({max_length}).')
|
| 87 |
+
assert s <= max_length
|
| 88 |
+
if s < max_length:
|
| 89 |
+
pad = torch.zeros((int(b), int(max_length - s)), dtype=bidirectional_mask.dtype, device=bidirectional_mask.device)
|
| 90 |
+
bidirectional_mask = torch.cat([bidirectional_mask, pad], dim=1)
|
| 91 |
+
bidirectional = bidirectional_mask.unsqueeze(1).unsqueeze(1)
|
| 92 |
+
for attn_module in attn_modules:
|
| 93 |
+
attn_module.bias.data = torch.logical_or(attn_module.bias.data, bidirectional)
|
| 94 |
+
output = call_og_forward()
|
| 95 |
+
for attn_module in attn_modules:
|
| 96 |
+
attn_module.bias.data = torch.tril(attn_module.bias.data[0, 0])[None, None]
|
| 97 |
+
return output
|
| 98 |
+
|
| 99 |
+
def generate(self: CAUSAL_GPT_TYPES, *args: tuple, **kwargs: Dict[str, Any]):
|
| 100 |
+
"""Wraps original generate to enable PrefixLM attention."""
|
| 101 |
+
attn_modules = _get_attn_modules(model)
|
| 102 |
+
for attn_module in attn_modules:
|
| 103 |
+
attn_module.bias.data[:] = 1
|
| 104 |
+
output = self._original_generate(*args, **kwargs)
|
| 105 |
+
for attn_module in attn_modules:
|
| 106 |
+
attn_module.bias.data = torch.tril(attn_module.bias.data[0, 0])[None, None]
|
| 107 |
+
return output
|
| 108 |
+
setattr(model, 'forward', MethodType(forward, model))
|
| 109 |
+
setattr(model, 'generate', MethodType(generate, model))
|
| 110 |
+
setattr(model, '_prefix_lm_converted', True)
|
| 111 |
+
return model
|
| 112 |
+
|
| 113 |
+
def _convert_bloom_causal_lm_to_prefix_lm(model: BloomForCausalLM) -> BloomForCausalLM:
|
| 114 |
+
"""Converts a BLOOM Causal LM to a Prefix LM.
|
| 115 |
+
|
| 116 |
+
Supported HuggingFace model classes:
|
| 117 |
+
- `BloomForCausalLM`
|
| 118 |
+
|
| 119 |
+
See `convert_hf_causal_lm_to_prefix_lm` for more details.
|
| 120 |
+
"""
|
| 121 |
+
if hasattr(model, '_prefix_lm_converted'):
|
| 122 |
+
return model
|
| 123 |
+
assert isinstance(model, BloomForCausalLM)
|
| 124 |
+
assert model.config.add_cross_attention == False, 'Only supports BLOOM decoder-only models'
|
| 125 |
+
|
| 126 |
+
def _prepare_attn_mask(self: BloomModel, attention_mask: torch.Tensor, bidirectional_mask: Optional[torch.Tensor], input_shape: Tuple[int, int], past_key_values_length: int) -> torch.BoolTensor:
|
| 127 |
+
combined_attention_mask = None
|
| 128 |
+
device = attention_mask.device
|
| 129 |
+
(_, src_length) = input_shape
|
| 130 |
+
if src_length > 1:
|
| 131 |
+
combined_attention_mask = _make_causal_mask_bloom(input_shape, device=device, past_key_values_length=past_key_values_length)
|
| 132 |
+
if bidirectional_mask is not None:
|
| 133 |
+
assert attention_mask.shape == bidirectional_mask.shape
|
| 134 |
+
expanded_bidirectional_mask = _expand_mask_bloom(bidirectional_mask, tgt_length=src_length)
|
| 135 |
+
combined_attention_mask = torch.logical_and(combined_attention_mask, expanded_bidirectional_mask)
|
| 136 |
+
expanded_attn_mask = _expand_mask_bloom(attention_mask, tgt_length=src_length)
|
| 137 |
+
combined_attention_mask = expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask | combined_attention_mask
|
| 138 |
+
return combined_attention_mask
|
| 139 |
+
|
| 140 |
+
def _build_alibi_tensor(self: BloomModel, batch_size: int, query_length: int, key_length: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
|
| 141 |
+
num_heads = self.config.n_head
|
| 142 |
+
closest_power_of_2 = 2 ** math.floor(math.log2(num_heads))
|
| 143 |
+
base = torch.tensor(2 ** (-2 ** (-(math.log2(closest_power_of_2) - 3))), device=device, dtype=torch.float32)
|
| 144 |
+
powers = torch.arange(1, 1 + closest_power_of_2, device=device, dtype=torch.int32)
|
| 145 |
+
slopes = torch.pow(base, powers)
|
| 146 |
+
if closest_power_of_2 != num_heads:
|
| 147 |
+
extra_base = torch.tensor(2 ** (-2 ** (-(math.log2(2 * closest_power_of_2) - 3))), device=device, dtype=torch.float32)
|
| 148 |
+
num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2)
|
| 149 |
+
extra_powers = torch.arange(1, 1 + 2 * num_remaining_heads, 2, device=device, dtype=torch.int32)
|
| 150 |
+
slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)
|
| 151 |
+
qa = torch.arange(query_length, device=device, dtype=torch.int32).view(-1, 1)
|
| 152 |
+
ka = torch.arange(key_length, device=device, dtype=torch.int32).view(1, -1)
|
| 153 |
+
diffs = qa - ka + key_length - query_length
|
| 154 |
+
diffs = -diffs.abs()
|
| 155 |
+
alibi = slopes.view(1, num_heads, 1, 1) * diffs.view(1, 1, query_length, key_length)
|
| 156 |
+
alibi = alibi.expand(batch_size, -1, -1, -1).reshape(-1, query_length, key_length)
|
| 157 |
+
return alibi.to(dtype)
|
| 158 |
+
KeyValueT = Tuple[torch.Tensor, torch.Tensor]
|
| 159 |
+
|
| 160 |
+
def forward(self: BloomModel, input_ids: Optional[torch.LongTensor]=None, past_key_values: Optional[Tuple[KeyValueT, ...]]=None, attention_mask: Optional[torch.Tensor]=None, bidirectional_mask: Optional[torch.Tensor]=None, head_mask: Optional[torch.LongTensor]=None, inputs_embeds: Optional[torch.LongTensor]=None, use_cache: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, return_dict: Optional[bool]=None, **deprecated_arguments) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]:
|
| 161 |
+
if deprecated_arguments.pop('position_ids', False) is not False:
|
| 162 |
+
warnings.warn('`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. ' + 'You can safely ignore passing `position_ids`.', FutureWarning)
|
| 163 |
+
if len(deprecated_arguments) > 0:
|
| 164 |
+
raise ValueError(f'Got unexpected arguments: {deprecated_arguments}')
|
| 165 |
+
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
| 166 |
+
output_hidden_states = output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
| 167 |
+
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
| 168 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 169 |
+
if input_ids is not None and inputs_embeds is not None:
|
| 170 |
+
raise ValueError('You cannot specify both input_ids and inputs_embeds at the same time')
|
| 171 |
+
elif input_ids is not None:
|
| 172 |
+
(batch_size, seq_length) = input_ids.shape
|
| 173 |
+
elif inputs_embeds is not None:
|
| 174 |
+
(batch_size, seq_length, _) = inputs_embeds.shape
|
| 175 |
+
else:
|
| 176 |
+
raise ValueError('You have to specify either input_ids or inputs_embeds')
|
| 177 |
+
if past_key_values is None:
|
| 178 |
+
past_key_values = tuple([None] * len(self.h))
|
| 179 |
+
head_mask = self.get_head_mask(head_mask, self.config.n_layer)
|
| 180 |
+
if inputs_embeds is None:
|
| 181 |
+
inputs_embeds = self.word_embeddings(input_ids)
|
| 182 |
+
hidden_states = self.word_embeddings_layernorm(inputs_embeds)
|
| 183 |
+
presents = () if use_cache else None
|
| 184 |
+
all_self_attentions = () if output_attentions else None
|
| 185 |
+
all_hidden_states = () if output_hidden_states else None
|
| 186 |
+
seq_length_with_past = seq_length
|
| 187 |
+
past_key_values_length = 0
|
| 188 |
+
if past_key_values[0] is not None:
|
| 189 |
+
tmp = past_key_values[0][0]
|
| 190 |
+
past_key_values_length = tmp.shape[2]
|
| 191 |
+
seq_length_with_past = seq_length_with_past + past_key_values_length
|
| 192 |
+
if attention_mask is None:
|
| 193 |
+
attention_mask = torch.ones((batch_size, seq_length_with_past), device=hidden_states.device)
|
| 194 |
+
else:
|
| 195 |
+
attention_mask = attention_mask.to(hidden_states.device)
|
| 196 |
+
alibi = self._build_alibi_tensor(batch_size=batch_size, query_length=seq_length, key_length=seq_length_with_past, dtype=hidden_states.dtype, device=hidden_states.device)
|
| 197 |
+
causal_mask = self._prepare_attn_mask(attention_mask, bidirectional_mask, input_shape=(batch_size, seq_length), past_key_values_length=past_key_values_length)
|
| 198 |
+
for (i, (block, layer_past)) in enumerate(zip(self.h, past_key_values)):
|
| 199 |
+
if output_hidden_states:
|
| 200 |
+
hst = (hidden_states,)
|
| 201 |
+
all_hidden_states = all_hidden_states + hst
|
| 202 |
+
if self.gradient_checkpointing and self.training:
|
| 203 |
+
if use_cache:
|
| 204 |
+
logger.warning('`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...')
|
| 205 |
+
use_cache = False
|
| 206 |
+
|
| 207 |
+
def create_custom_forward(module):
|
| 208 |
+
|
| 209 |
+
def custom_forward(*inputs):
|
| 210 |
+
return module(*inputs, use_cache=use_cache, output_attentions=output_attentions)
|
| 211 |
+
return custom_forward
|
| 212 |
+
outputs = torch.utils.checkpoint.checkpoint(create_custom_forward(block), hidden_states, alibi, causal_mask, head_mask[i])
|
| 213 |
+
else:
|
| 214 |
+
outputs = block(hidden_states, layer_past=layer_past, attention_mask=causal_mask, head_mask=head_mask[i], use_cache=use_cache, output_attentions=output_attentions, alibi=alibi)
|
| 215 |
+
hidden_states = outputs[0]
|
| 216 |
+
if use_cache is True:
|
| 217 |
+
presents = presents + (outputs[1],)
|
| 218 |
+
if output_attentions:
|
| 219 |
+
oa = (outputs[2 if use_cache else 1],)
|
| 220 |
+
all_self_attentions = all_self_attentions + oa
|
| 221 |
+
hidden_states = self.ln_f(hidden_states)
|
| 222 |
+
if output_hidden_states:
|
| 223 |
+
hst = (hidden_states,)
|
| 224 |
+
all_hidden_states = all_hidden_states + hst
|
| 225 |
+
if not return_dict:
|
| 226 |
+
return tuple((v for v in [hidden_states, presents, all_hidden_states, all_self_attentions] if v is not None))
|
| 227 |
+
return BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=hidden_states, past_key_values=presents, hidden_states=all_hidden_states, attentions=all_self_attentions)
|
| 228 |
+
setattr(model.transformer, '_prepare_attn_mask', MethodType(_prepare_attn_mask, model.transformer))
|
| 229 |
+
setattr(model.transformer, '_build_alibi_tensor', MethodType(_build_alibi_tensor, model.transformer))
|
| 230 |
+
setattr(model.transformer, 'forward', MethodType(forward, model.transformer))
|
| 231 |
+
KeyValueT = Tuple[torch.Tensor, torch.Tensor]
|
| 232 |
+
|
| 233 |
+
def forward(self: BloomForCausalLM, input_ids: Optional[torch.LongTensor]=None, past_key_values: Optional[Tuple[KeyValueT, ...]]=None, attention_mask: Optional[torch.Tensor]=None, bidirectional_mask: Optional[torch.Tensor]=None, head_mask: Optional[torch.Tensor]=None, inputs_embeds: Optional[torch.Tensor]=None, labels: Optional[torch.Tensor]=None, use_cache: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, return_dict: Optional[bool]=None, **deprecated_arguments) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
|
| 234 |
+
"""Replacement forward method for BloomCausalLM."""
|
| 235 |
+
if deprecated_arguments.pop('position_ids', False) is not False:
|
| 236 |
+
warnings.warn('`position_ids` have no functionality in BLOOM and will be removed ' + 'in v5.0.0. You can safely ignore passing `position_ids`.', FutureWarning)
|
| 237 |
+
if len(deprecated_arguments) > 0:
|
| 238 |
+
raise ValueError(f'Got unexpected arguments: {deprecated_arguments}')
|
| 239 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 240 |
+
transformer_outputs = self.transformer(input_ids, past_key_values=past_key_values, attention_mask=attention_mask, bidirectional_mask=bidirectional_mask, head_mask=head_mask, inputs_embeds=inputs_embeds, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict)
|
| 241 |
+
hidden_states = transformer_outputs[0]
|
| 242 |
+
lm_logits = self.lm_head(hidden_states)
|
| 243 |
+
loss = None
|
| 244 |
+
if labels is not None:
|
| 245 |
+
shift_logits = lm_logits[..., :-1, :].contiguous()
|
| 246 |
+
shift_labels = labels[..., 1:].contiguous()
|
| 247 |
+
(batch_size, seq_length, vocab_size) = shift_logits.shape
|
| 248 |
+
loss_fct = CrossEntropyLoss()
|
| 249 |
+
loss = loss_fct(shift_logits.view(batch_size * seq_length, vocab_size), shift_labels.view(batch_size * seq_length))
|
| 250 |
+
if not return_dict:
|
| 251 |
+
output = (lm_logits,) + transformer_outputs[1:]
|
| 252 |
+
return (loss,) + output if loss is not None else output
|
| 253 |
+
return CausalLMOutputWithCrossAttentions(loss=loss, logits=lm_logits, past_key_values=transformer_outputs.past_key_values, hidden_states=transformer_outputs.hidden_states, attentions=transformer_outputs.attentions)
|
| 254 |
+
|
| 255 |
+
def prepare_inputs_for_generation(self: BloomForCausalLM, input_ids: torch.LongTensor, past: Optional[torch.Tensor]=None, attention_mask: Optional[torch.Tensor]=None, **kwargs) -> dict:
|
| 256 |
+
if past:
|
| 257 |
+
input_ids = input_ids[:, -1].unsqueeze(-1)
|
| 258 |
+
bidirectional_mask = None
|
| 259 |
+
if past[0][0].shape[0] == input_ids.shape[0]:
|
| 260 |
+
past = self._convert_to_bloom_cache(past)
|
| 261 |
+
else:
|
| 262 |
+
bidirectional_mask = torch.ones_like(input_ids)
|
| 263 |
+
return {'input_ids': input_ids, 'past_key_values': past, 'use_cache': True, 'attention_mask': attention_mask, 'bidirectional_mask': bidirectional_mask}
|
| 264 |
+
setattr(model, 'forward', MethodType(forward, model))
|
| 265 |
+
setattr(model, 'prepare_inputs_for_generation', MethodType(prepare_inputs_for_generation, model))
|
| 266 |
+
setattr(model, '_prefix_lm_converted', True)
|
| 267 |
+
return model
|
| 268 |
+
|
| 269 |
+
def _convert_opt_causal_lm_to_prefix_lm(model: OPTForCausalLM) -> OPTForCausalLM:
|
| 270 |
+
"""Converts an OPT Causal LM to a Prefix LM.
|
| 271 |
+
|
| 272 |
+
Supported HuggingFace model classes:
|
| 273 |
+
- `OPTForCausalLM`
|
| 274 |
+
|
| 275 |
+
See `convert_hf_causal_lm_to_prefix_lm` for more details.
|
| 276 |
+
"""
|
| 277 |
+
if hasattr(model, '_prefix_lm_converted'):
|
| 278 |
+
return model
|
| 279 |
+
assert isinstance(model, OPTForCausalLM)
|
| 280 |
+
assert model.config.add_cross_attention == False, 'Only supports OPT decoder-only models'
|
| 281 |
+
setattr(model, '_original_forward', getattr(model, 'forward'))
|
| 282 |
+
setattr(model, '_original_generate', getattr(model, 'generate'))
|
| 283 |
+
model.model.decoder.bidirectional_mask = None
|
| 284 |
+
|
| 285 |
+
def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length):
|
| 286 |
+
combined_attention_mask = None
|
| 287 |
+
if input_shape[-1] > 1:
|
| 288 |
+
if self.bidirectional_mask == 'g':
|
| 289 |
+
(bsz, src_length) = input_shape
|
| 290 |
+
combined_attention_mask = torch.zeros((bsz, 1, src_length, src_length + past_key_values_length), dtype=inputs_embeds.dtype, device=inputs_embeds.device)
|
| 291 |
+
else:
|
| 292 |
+
combined_attention_mask = _make_causal_mask_opt(input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length).to(inputs_embeds.device)
|
| 293 |
+
if self.bidirectional_mask is not None:
|
| 294 |
+
assert attention_mask.shape == self.bidirectional_mask.shape
|
| 295 |
+
expanded_bidirectional_mask = _expand_mask_opt(self.bidirectional_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(inputs_embeds.device)
|
| 296 |
+
combined_attention_mask = torch.maximum(expanded_bidirectional_mask, combined_attention_mask)
|
| 297 |
+
if attention_mask is not None:
|
| 298 |
+
expanded_attn_mask = _expand_mask_opt(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to(inputs_embeds.device)
|
| 299 |
+
combined_attention_mask = expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask
|
| 300 |
+
return combined_attention_mask
|
| 301 |
+
setattr(model.model.decoder, '_prepare_decoder_attention_mask', MethodType(_prepare_decoder_attention_mask, model.model.decoder))
|
| 302 |
+
|
| 303 |
+
def forward(self: OPTForCausalLM, input_ids: Optional[torch.LongTensor]=None, attention_mask: Optional[torch.Tensor]=None, bidirectional_mask: Optional[torch.ByteTensor]=None, head_mask: Optional[torch.Tensor]=None, past_key_values: Optional[List[torch.FloatTensor]]=None, inputs_embeds: Optional[torch.FloatTensor]=None, labels: Optional[torch.LongTensor]=None, use_cache: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, return_dict: Optional[bool]=None):
|
| 304 |
+
|
| 305 |
+
def call_og_forward():
|
| 306 |
+
return self._original_forward(input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, past_key_values=past_key_values, inputs_embeds=inputs_embeds, labels=labels, use_cache=use_cache, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict)
|
| 307 |
+
if bidirectional_mask is None:
|
| 308 |
+
return call_og_forward()
|
| 309 |
+
self.model.decoder.bidirectional_mask = bidirectional_mask
|
| 310 |
+
try:
|
| 311 |
+
outputs = call_og_forward()
|
| 312 |
+
except:
|
| 313 |
+
self.model.decoder.bidirectional_mask = None
|
| 314 |
+
raise
|
| 315 |
+
self.model.decoder.bidirectional_mask = None
|
| 316 |
+
return outputs
|
| 317 |
+
|
| 318 |
+
def generate(self: OPTForCausalLM, *args: tuple, **kwargs: Dict[str, Any]):
|
| 319 |
+
"""Wraps original generate to enable PrefixLM-style attention."""
|
| 320 |
+
self.model.decoder.bidirectional_mask = 'g'
|
| 321 |
+
try:
|
| 322 |
+
output = self._original_generate(*args, **kwargs)
|
| 323 |
+
except:
|
| 324 |
+
self.model.decoder.bidirectional_mask = None
|
| 325 |
+
raise
|
| 326 |
+
self.model.decoder.bidirectional_mask = None
|
| 327 |
+
return output
|
| 328 |
+
setattr(model, 'forward', MethodType(forward, model))
|
| 329 |
+
setattr(model, 'generate', MethodType(generate, model))
|
| 330 |
+
setattr(model, '_prefix_lm_converted', True)
|
| 331 |
+
return model
|
| 332 |
+
_SUPPORTED_HF_MODELS = _SUPPORTED_GPT_MODELS + (BloomForCausalLM, OPTForCausalLM)
|
| 333 |
+
CAUSAL_LM_TYPES = Union[GPT2LMHeadModel, GPTJForCausalLM, GPTNeoForCausalLM, GPTNeoXForCausalLM, BloomForCausalLM, OPTForCausalLM]
|
| 334 |
+
|
| 335 |
+
def convert_hf_causal_lm_to_prefix_lm(model: CAUSAL_LM_TYPES) -> CAUSAL_LM_TYPES:
|
| 336 |
+
"""Converts a HuggingFace Causal LM to a Prefix LM.
|
| 337 |
+
|
| 338 |
+
Supported HuggingFace model classes:
|
| 339 |
+
- `GPT2LMHeadModel`
|
| 340 |
+
- `GPTNeoForCausalLM`
|
| 341 |
+
- `GPTNeoXForCausalLM`
|
| 342 |
+
- `GPTJForCausalLM`
|
| 343 |
+
- `BloomForCausalLM`
|
| 344 |
+
- `OPTForCausalLM`
|
| 345 |
+
|
| 346 |
+
Conversion to a Prefix LM is done by modifying the `forward` method, and possibly also the
|
| 347 |
+
`generate` method and/or select underlying methods depending on the model class.
|
| 348 |
+
|
| 349 |
+
These changes preserve the model API, but add a new input to `forward`: "bidirectional_mask".
|
| 350 |
+
|
| 351 |
+
Notes on training:
|
| 352 |
+
To actually train the converted model as a Prefix LM, training batches will need to indicate
|
| 353 |
+
the prefix/target structure by including `bidirectional_mask` as part of the batch inputs.
|
| 354 |
+
|
| 355 |
+
**This is not a standard input and requires custom layers either within or after your dataloader.**
|
| 356 |
+
|
| 357 |
+
In addition to adding `bidirectional_mask` to the batch, this custom code should modify `labels`
|
| 358 |
+
such that `batch['labels'][batch['bidirectional_mask'] == 1] == -100`.
|
| 359 |
+
That is, the prefix portion of the sequence should not generate any loss. Loss should only be
|
| 360 |
+
generated by the target portion of the sequence.
|
| 361 |
+
|
| 362 |
+
Notes on `GPTNeoForCausalLM`:
|
| 363 |
+
To simplify the implementation, "global" and "local" attention layers are handled differently.
|
| 364 |
+
For "global" layers, we handle conversion as described above. For "local" layers, which use a
|
| 365 |
+
causal attention mask within a restricted local window, we do not alter the masking.
|
| 366 |
+
|
| 367 |
+
Notes on `forward` method conversion:
|
| 368 |
+
After conversion, the `forward` method will handle a new input, `bidirectional_mask`,
|
| 369 |
+
which should be a [batch_size, seq_length] byte tensor, where 1 indicates token positions
|
| 370 |
+
belonging to the prefix (prefix tokens can attend to one another bidirectionally), and
|
| 371 |
+
0 indicates token positions belonging to the target.
|
| 372 |
+
|
| 373 |
+
The new `forward` method will incorporate `bidirectional_mask` (if supplied) into the existing
|
| 374 |
+
causal mask, call the original `forward` method, and (if the causal mask is a buffer) reset
|
| 375 |
+
the causal masks before returning the result.
|
| 376 |
+
|
| 377 |
+
Notes on `generate` method conversion:
|
| 378 |
+
After conversion, the `generate` method will have the same signature but will internally
|
| 379 |
+
convert all causal masks to be purely bidirectional, call the original `generate` method, and
|
| 380 |
+
(where appropriate) reset the causal masks before returning the result.
|
| 381 |
+
|
| 382 |
+
This works thanks to the logic of the HuggingFace `generate` API, which first encodes the token
|
| 383 |
+
"prompt" passed to `generate` (which is treated as the prefix) and then sequentially generates
|
| 384 |
+
each new token. Encodings are cached as generation happens, so all prefix tokens can attend to one
|
| 385 |
+
another (as expected in a Prefix LM) and generated tokens can only attend to prefix tokens and
|
| 386 |
+
previously-generated tokens (also as expected in a Prefix LM).
|
| 387 |
+
|
| 388 |
+
To preserve the API, the original methods are renamed to `_original_forward` and
|
| 389 |
+
`_original_generate`, and replaced with new `forward` and `generate` methods that wrap
|
| 390 |
+
them, respectively, although implementation details vary by model class.
|
| 391 |
+
"""
|
| 392 |
+
if isinstance(model, _SUPPORTED_GPT_MODELS):
|
| 393 |
+
return _convert_gpt_causal_lm_to_prefix_lm(model)
|
| 394 |
+
elif isinstance(model, BloomForCausalLM):
|
| 395 |
+
return _convert_bloom_causal_lm_to_prefix_lm(model)
|
| 396 |
+
elif isinstance(model, OPTForCausalLM):
|
| 397 |
+
return _convert_opt_causal_lm_to_prefix_lm(model)
|
| 398 |
+
else:
|
| 399 |
+
raise TypeError(f'Cannot convert model to Prefix LM. ' + f'Model does not belong to set of supported HF models:' + f'\n{_SUPPORTED_HF_MODELS}')
|
| 400 |
+
|
| 401 |
+
def add_bidirectional_mask_if_missing(batch: Dict[str, Any]):
|
| 402 |
+
"""Attempts to add bidirectional_mask to batch if missing.
|
| 403 |
+
|
| 404 |
+
Raises:
|
| 405 |
+
KeyError if bidirectional_mask is missing and can't be inferred
|
| 406 |
+
"""
|
| 407 |
+
if 'bidirectional_mask' not in batch:
|
| 408 |
+
if batch.get('mode', None) == 'icl_task':
|
| 409 |
+
batch['bidirectional_mask'] = batch['attention_mask'].clone()
|
| 410 |
+
for (i, continuation_indices) in enumerate(batch['continuation_indices']):
|
| 411 |
+
batch['bidirectional_mask'][i, continuation_indices] = 0
|
| 412 |
+
elif 'labels' in batch and 'attention_mask' in batch:
|
| 413 |
+
batch['bidirectional_mask'] = torch.logical_and(torch.eq(batch['attention_mask'], 1), torch.eq(batch['labels'], -100)).type_as(batch['attention_mask'])
|
| 414 |
+
else:
|
| 415 |
+
raise KeyError('No bidirectional_mask in batch and not sure how to construct one.')
|
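A minimal usage sketch of the converter above (hedged: it assumes these functions are imported from this module and that the public `gpt2` checkpoint is reachable; the prompt/target split and values are purely illustrative, not part of this module):

```python
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Convert a stock GPT-2 into a Prefix LM; forward/generate are wrapped in place.
model = convert_hf_causal_lm_to_prefix_lm(GPT2LMHeadModel.from_pretrained('gpt2'))
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

prompt = tokenizer('Translate to French: cheese ->', return_tensors='pt').input_ids
target = tokenizer(' fromage', return_tensors='pt').input_ids
input_ids = torch.cat([prompt, target], dim=1)

# Loss only on the target portion; the prefix is masked out with -100, as the
# docstring above recommends.
labels = input_ids.clone()
labels[:, :prompt.shape[1]] = -100

# 1 = prefix positions (attend bidirectionally), 0 = target positions.
bidirectional_mask = torch.zeros_like(input_ids)
bidirectional_mask[:, :prompt.shape[1]] = 1

out = model(input_ids=input_ids, labels=labels, bidirectional_mask=bidirectional_mask)
print(out.loss)
```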
VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/language_model/mpt/meta_init_context.py
ADDED
|
@@ -0,0 +1,94 @@
| 1 |
+
from contextlib import contextmanager
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
|
| 5 |
+
@contextmanager
|
| 6 |
+
def init_empty_weights(include_buffers: bool=False):
|
| 7 |
+
"""Meta initialization context manager.
|
| 8 |
+
|
| 9 |
+
A context manager under which models are initialized with all parameters
|
| 10 |
+
on the meta device, therefore creating an empty model. Useful when just
|
| 11 |
+
initializing the model would blow the available RAM.
|
| 12 |
+
|
| 13 |
+
Args:
|
| 14 |
+
include_buffers (`bool`, *optional*, defaults to `False`): Whether or
|
| 15 |
+
not to also put all buffers on the meta device while initializing.
|
| 16 |
+
|
| 17 |
+
Example:
|
| 18 |
+
```python
|
| 19 |
+
import torch.nn as nn
|
| 20 |
+
|
| 21 |
+
# Initialize a model with 100 billion parameters in no time and without using any RAM.
|
| 22 |
+
with init_empty_weights():
|
| 23 |
+
tst = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)])
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
<Tip warning={true}>
|
| 27 |
+
|
| 28 |
+
Any model created under this context manager has no weights. As such you can't do something like
|
| 29 |
+
`model.to(some_device)` with it. To load weights inside your empty model, see [`load_checkpoint_and_dispatch`].
|
| 30 |
+
|
| 31 |
+
</Tip>
|
| 32 |
+
"""
|
| 33 |
+
with init_on_device(torch.device('meta'), include_buffers=include_buffers) as f:
|
| 34 |
+
yield f
|
| 35 |
+
|
| 36 |
+
@contextmanager
|
| 37 |
+
def init_on_device(device: torch.device, include_buffers: bool=False):
|
| 38 |
+
"""Device initialization context manager.
|
| 39 |
+
|
| 40 |
+
A context manager under which models are initialized with all parameters
|
| 41 |
+
on the specified device.
|
| 42 |
+
|
| 43 |
+
Args:
|
| 44 |
+
device (`torch.device`): Device to initialize all parameters on.
|
| 45 |
+
include_buffers (`bool`, *optional*, defaults to `False`): Whether or
|
| 46 |
+
not to also put all buffers on the specified device while initializing.
|
| 47 |
+
|
| 48 |
+
Example:
|
| 49 |
+
```python
|
| 50 |
+
import torch.nn as nn
|
| 51 |
+
|
| 52 |
+
with init_on_device(device=torch.device("cuda")):
|
| 53 |
+
tst = nn.Linear(100, 100)  # on `cuda` device
|
| 54 |
+
```
|
| 55 |
+
"""
|
| 56 |
+
old_register_parameter = nn.Module.register_parameter
|
| 57 |
+
if include_buffers:
|
| 58 |
+
old_register_buffer = nn.Module.register_buffer
|
| 59 |
+
|
| 60 |
+
def register_empty_parameter(module, name, param):
|
| 61 |
+
old_register_parameter(module, name, param)
|
| 62 |
+
if param is not None:
|
| 63 |
+
param_cls = type(module._parameters[name])
|
| 64 |
+
kwargs = module._parameters[name].__dict__
|
| 65 |
+
module._parameters[name] = param_cls(module._parameters[name].to(device), **kwargs)
|
| 66 |
+
|
| 67 |
+
def register_empty_buffer(module, name, buffer):
|
| 68 |
+
old_register_buffer(module, name, buffer)
|
| 69 |
+
if buffer is not None:
|
| 70 |
+
module._buffers[name] = module._buffers[name].to(device)
|
| 71 |
+
if include_buffers:
|
| 72 |
+
tensor_constructors_to_patch = {torch_function_name: getattr(torch, torch_function_name) for torch_function_name in ['empty', 'zeros', 'ones', 'full']}
|
| 73 |
+
else:
|
| 74 |
+
tensor_constructors_to_patch = {}
|
| 75 |
+
|
| 76 |
+
def patch_tensor_constructor(fn):
|
| 77 |
+
|
| 78 |
+
def wrapper(*args, **kwargs):
|
| 79 |
+
kwargs['device'] = device
|
| 80 |
+
return fn(*args, **kwargs)
|
| 81 |
+
return wrapper
|
| 82 |
+
try:
|
| 83 |
+
nn.Module.register_parameter = register_empty_parameter
|
| 84 |
+
if include_buffers:
|
| 85 |
+
nn.Module.register_buffer = register_empty_buffer
|
| 86 |
+
for torch_function_name in tensor_constructors_to_patch.keys():
|
| 87 |
+
setattr(torch, torch_function_name, patch_tensor_constructor(getattr(torch, torch_function_name)))
|
| 88 |
+
yield
|
| 89 |
+
finally:
|
| 90 |
+
nn.Module.register_parameter = old_register_parameter
|
| 91 |
+
if include_buffers:
|
| 92 |
+
nn.Module.register_buffer = old_register_buffer
|
| 93 |
+
for (torch_function_name, old_torch_function) in tensor_constructors_to_patch.items():
|
| 94 |
+
setattr(torch, torch_function_name, old_torch_function)
|
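A short sketch of what the `init_empty_weights` context above does (toy module sizes; illustrative only):

```python
import torch
import torch.nn as nn

# Parameters are registered on the meta device, so no real storage is allocated.
with init_empty_weights():
    tst = nn.Sequential(*[nn.Linear(256, 256) for _ in range(4)])

print(tst[0].weight.device)                      # device(type='meta')
print(sum(p.numel() for p in tst.parameters()))  # shapes are known even without storage
```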
VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/language_model/mpt/modeling_mpt.py
ADDED
|
@@ -0,0 +1,331 @@
| 1 |
+
"""A simple, flexible implementation of a GPT model.
|
| 2 |
+
|
| 3 |
+
Inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
|
| 4 |
+
"""
|
| 5 |
+
import math
|
| 6 |
+
import warnings
|
| 7 |
+
from typing import List, Optional, Tuple, Union
|
| 8 |
+
import torch
|
| 9 |
+
import torch.nn as nn
|
| 10 |
+
import torch.nn.functional as F
|
| 11 |
+
from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast
|
| 12 |
+
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
|
| 13 |
+
from .attention import attn_bias_shape, build_attn_bias
|
| 14 |
+
from .blocks import MPTBlock
|
| 15 |
+
from .custom_embedding import SharedEmbedding
|
| 16 |
+
from .norm import NORM_CLASS_REGISTRY
|
| 17 |
+
from .configuration_mpt import MPTConfig
|
| 18 |
+
from .adapt_tokenizer import AutoTokenizerForMOD, adapt_tokenizer_for_denoising
|
| 19 |
+
from .hf_prefixlm_converter import add_bidirectional_mask_if_missing, convert_hf_causal_lm_to_prefix_lm
|
| 20 |
+
from .meta_init_context import init_empty_weights
|
| 21 |
+
from .param_init_fns import MODEL_INIT_REGISTRY, generic_param_init_fn_
|
| 22 |
+
try:
|
| 23 |
+
from .flash_attn_triton import flash_attn_func
|
| 24 |
+
except:
|
| 25 |
+
pass
|
| 26 |
+
Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
|
| 27 |
+
|
| 28 |
+
class MPTPreTrainedModel(PreTrainedModel):
|
| 29 |
+
config_class = MPTConfig
|
| 30 |
+
base_model_prefix = 'model'
|
| 31 |
+
_no_split_modules = ['MPTBlock']
|
| 32 |
+
|
| 33 |
+
class MPTModel(MPTPreTrainedModel):
|
| 34 |
+
|
| 35 |
+
def __init__(self, config: MPTConfig):
|
| 36 |
+
config._validate_config()
|
| 37 |
+
super().__init__(config)
|
| 38 |
+
self.attn_impl = config.attn_config['attn_impl']
|
| 39 |
+
self.prefix_lm = config.attn_config['prefix_lm']
|
| 40 |
+
self.attn_uses_sequence_id = config.attn_config['attn_uses_sequence_id']
|
| 41 |
+
self.alibi = config.attn_config['alibi']
|
| 42 |
+
self.alibi_bias_max = config.attn_config['alibi_bias_max']
|
| 43 |
+
if config.init_device == 'mixed':
|
| 44 |
+
if dist.get_local_rank() == 0:
|
| 45 |
+
config.init_device = 'cpu'
|
| 46 |
+
else:
|
| 47 |
+
config.init_device = 'meta'
|
| 48 |
+
if config.norm_type.lower() not in NORM_CLASS_REGISTRY.keys():
|
| 49 |
+
norm_options = ' | '.join(NORM_CLASS_REGISTRY.keys())
|
| 50 |
+
raise NotImplementedError(f'Requested norm type ({config.norm_type}) is not implemented within this repo (Options: {norm_options}).')
|
| 51 |
+
norm_class = NORM_CLASS_REGISTRY[config.norm_type.lower()]
|
| 52 |
+
self.embedding_fraction = config.embedding_fraction
|
| 53 |
+
self.wte = SharedEmbedding(config.vocab_size, config.d_model, device=config.init_device)
|
| 54 |
+
if not self.alibi:
|
| 55 |
+
self.wpe = torch.nn.Embedding(config.max_seq_len, config.d_model, device=config.init_device)
|
| 56 |
+
self.emb_drop = nn.Dropout(config.emb_pdrop)
|
| 57 |
+
self.blocks = nn.ModuleList([MPTBlock(device=config.init_device, **config.to_dict()) for _ in range(config.n_layers)])
|
| 58 |
+
self.norm_f = norm_class(config.d_model, device=config.init_device)
|
| 59 |
+
if config.init_device != 'meta':
|
| 60 |
+
print(f'You are using config.init_device={config.init_device!r}, but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.')
|
| 61 |
+
self.apply(self.param_init_fn)
|
| 62 |
+
self.is_causal = not self.prefix_lm
|
| 63 |
+
self._attn_bias_initialized = False
|
| 64 |
+
self.attn_bias = None
|
| 65 |
+
self.attn_bias_shape = attn_bias_shape(self.attn_impl, config.n_heads, config.max_seq_len, self.alibi, prefix_lm=self.prefix_lm, causal=self.is_causal, use_sequence_id=self.attn_uses_sequence_id)
|
| 66 |
+
if config.no_bias:
|
| 67 |
+
for module in self.modules():
|
| 68 |
+
if hasattr(module, 'bias') and isinstance(module.bias, nn.Parameter):
|
| 69 |
+
if config.verbose:
|
| 70 |
+
warnings.warn(f'Removing bias ({module.bias}) from {module}.')
|
| 71 |
+
module.register_parameter('bias', None)
|
| 72 |
+
if config.verbose and config.verbose > 2:
|
| 73 |
+
print(self)
|
| 74 |
+
if 'verbose' not in self.config.init_config:
|
| 75 |
+
self.config.init_config['verbose'] = self.config.verbose
|
| 76 |
+
if self.config.init_config['verbose'] > 1:
|
| 77 |
+
init_fn_name = self.config.init_config['name']
|
| 78 |
+
warnings.warn(f'Using {init_fn_name} initialization.')
|
| 79 |
+
self.gradient_checkpointing = False
|
| 80 |
+
|
| 81 |
+
def get_input_embeddings(self):
|
| 82 |
+
return self.wte
|
| 83 |
+
|
| 84 |
+
def set_input_embeddings(self, value):
|
| 85 |
+
self.wte = value
|
| 86 |
+
|
| 87 |
+
@torch.no_grad()
|
| 88 |
+
def _attn_bias(self, device, dtype, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None):
|
| 89 |
+
if not self._attn_bias_initialized:
|
| 90 |
+
if self.attn_bias_shape:
|
| 91 |
+
self.attn_bias = torch.zeros(self.attn_bias_shape, device=device, dtype=dtype)
|
| 92 |
+
self.attn_bias = build_attn_bias(self.attn_impl, self.attn_bias, self.config.n_heads, self.config.max_seq_len, causal=self.is_causal, alibi=self.alibi, alibi_bias_max=self.alibi_bias_max)
|
| 93 |
+
self._attn_bias_initialized = True
|
| 94 |
+
if self.attn_impl == 'flash':
|
| 95 |
+
return (self.attn_bias, attention_mask)
|
| 96 |
+
if self.attn_bias is not None:
|
| 97 |
+
self.attn_bias = self.attn_bias.to(dtype=dtype, device=device)
|
| 98 |
+
attn_bias = self.attn_bias
|
| 99 |
+
if self.prefix_lm:
|
| 100 |
+
assert isinstance(attn_bias, torch.Tensor)
|
| 101 |
+
assert isinstance(prefix_mask, torch.Tensor)
|
| 102 |
+
attn_bias = self._apply_prefix_mask(attn_bias, prefix_mask)
|
| 103 |
+
if self.attn_uses_sequence_id and sequence_id is not None:
|
| 104 |
+
assert isinstance(attn_bias, torch.Tensor)
|
| 105 |
+
attn_bias = self._apply_sequence_id(attn_bias, sequence_id)
|
| 106 |
+
if attention_mask is not None:
|
| 107 |
+
s_k = attention_mask.shape[-1]
|
| 108 |
+
if attn_bias is None:
|
| 109 |
+
attn_bias = torch.zeros((1, 1, 1, s_k), device=device, dtype=dtype)
|
| 110 |
+
else:
|
| 111 |
+
_s_k = max(0, attn_bias.size(-1) - s_k)
|
| 112 |
+
attn_bias = attn_bias[:, :, :, _s_k:]
|
| 113 |
+
if prefix_mask is not None and attention_mask.shape != prefix_mask.shape:
|
| 114 |
+
raise ValueError(f'attention_mask shape={attention_mask.shape} ' + f'and prefix_mask shape={prefix_mask.shape} are not equal.')
|
| 115 |
+
min_val = torch.finfo(attn_bias.dtype).min
|
| 116 |
+
attn_bias = attn_bias.masked_fill(~attention_mask.view(-1, 1, 1, s_k), min_val)
|
| 117 |
+
return (attn_bias, None)
|
| 118 |
+
|
| 119 |
+
def _apply_prefix_mask(self, attn_bias: torch.Tensor, prefix_mask: torch.Tensor):
|
| 120 |
+
(s_k, s_q) = attn_bias.shape[-2:]
|
| 121 |
+
if s_k != self.config.max_seq_len or s_q != self.config.max_seq_len:
|
| 122 |
+
raise ValueError('attn_bias does not match the expected shape. ' + f'The last two dimensions should both be {self.config.max_seq_len} ' + f'but are {s_k} and {s_q}.')
|
| 123 |
+
seq_len = prefix_mask.shape[-1]
|
| 124 |
+
if seq_len > self.config.max_seq_len:
|
| 125 |
+
raise ValueError(f'prefix_mask sequence length cannot exceed max_seq_len={self.config.max_seq_len}')
|
| 126 |
+
attn_bias = attn_bias[..., :seq_len, :seq_len]
|
| 127 |
+
causal = torch.tril(torch.ones((seq_len, seq_len), dtype=torch.bool, device=prefix_mask.device)).view(1, 1, seq_len, seq_len)
|
| 128 |
+
prefix = prefix_mask.view(-1, 1, 1, seq_len)
|
| 129 |
+
cannot_attend = ~torch.logical_or(causal, prefix.bool())
|
| 130 |
+
min_val = torch.finfo(attn_bias.dtype).min
|
| 131 |
+
attn_bias = attn_bias.masked_fill(cannot_attend, min_val)
|
| 132 |
+
return attn_bias
|
| 133 |
+
|
| 134 |
+
def _apply_sequence_id(self, attn_bias: torch.Tensor, sequence_id: torch.LongTensor):
|
| 135 |
+
seq_len = sequence_id.shape[-1]
|
| 136 |
+
if seq_len > self.config.max_seq_len:
|
| 137 |
+
raise ValueError(f'sequence_id sequence length cannot exceed max_seq_len={self.config.max_seq_len}')
|
| 138 |
+
attn_bias = attn_bias[..., :seq_len, :seq_len]
|
| 139 |
+
cannot_attend = torch.logical_not(torch.eq(sequence_id.view(-1, seq_len, 1), sequence_id.view(-1, 1, seq_len))).unsqueeze(1)
|
| 140 |
+
min_val = torch.finfo(attn_bias.dtype).min
|
| 141 |
+
attn_bias = attn_bias.masked_fill(cannot_attend, min_val)
|
| 142 |
+
return attn_bias
|
| 143 |
+
|
| 144 |
+
def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, inputs_embeds: Optional[torch.Tensor]=None):
|
| 145 |
+
return_dict = return_dict if return_dict is not None else self.config.return_dict
|
| 146 |
+
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
| 147 |
+
if attention_mask is not None:
|
| 148 |
+
attention_mask = attention_mask.bool()
|
| 149 |
+
if prefix_mask is not None:
|
| 150 |
+
prefix_mask = prefix_mask.bool()
|
| 151 |
+
if not return_dict:
|
| 152 |
+
raise NotImplementedError('return_dict False is not implemented yet for MPT')
|
| 153 |
+
if output_attentions:
|
| 154 |
+
if self.attn_impl != 'torch':
|
| 155 |
+
raise NotImplementedError('output_attentions is not implemented for MPT when using attn_impl `flash` or `triton`.')
|
| 156 |
+
if attention_mask is not None and attention_mask[:, 0].sum() != attention_mask.shape[0] and self.training:
|
| 157 |
+
raise NotImplementedError('MPT does not support training with left padding.')
|
| 158 |
+
if self.prefix_lm and prefix_mask is None:
|
| 159 |
+
raise ValueError('prefix_mask is a required argument when MPT is configured with prefix_lm=True.')
|
| 160 |
+
if self.training:
|
| 161 |
+
if self.attn_uses_sequence_id and sequence_id is None:
|
| 162 |
+
raise ValueError('sequence_id is a required argument when MPT is configured with attn_uses_sequence_id=True ' + 'and the model is in train mode.')
|
| 163 |
+
elif self.attn_uses_sequence_id is False and sequence_id is not None:
|
| 164 |
+
warnings.warn('MPT received non-None input for `sequence_id` but is configured with attn_uses_sequence_id=False. ' + 'This input will be ignored. If you want the model to use `sequence_id`, set attn_uses_sequence_id to True.')
|
| 165 |
+
if input_ids is not None:
|
| 166 |
+
S = input_ids.size(1)
|
| 167 |
+
assert S <= self.config.max_seq_len, f'Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}'
|
| 168 |
+
tok_emb = self.wte(input_ids)
|
| 169 |
+
else:
|
| 170 |
+
assert inputs_embeds is not None
|
| 171 |
+
assert self.alibi, 'inputs_embeds is not implemented for MPT unless alibi is used.'
|
| 172 |
+
S = inputs_embeds.size(1)
|
| 173 |
+
tok_emb = inputs_embeds
|
| 174 |
+
if self.alibi:
|
| 175 |
+
x = tok_emb
|
| 176 |
+
else:
|
| 177 |
+
past_position = 0
|
| 178 |
+
if past_key_values is not None:
|
| 179 |
+
if len(past_key_values) != self.config.n_layers:
|
| 180 |
+
raise ValueError(f'past_key_values must provide a past_key_value for each attention ' + f'layer in the network (len(past_key_values)={len(past_key_values)!r}; self.config.n_layers={self.config.n_layers!r}).')
|
| 181 |
+
past_position = past_key_values[0][0].size(1)
|
| 182 |
+
if self.attn_impl == 'torch':
|
| 183 |
+
past_position = past_key_values[0][0].size(3)
|
| 184 |
+
if S + past_position > self.config.max_seq_len:
|
| 185 |
+
raise ValueError(f'Cannot forward input with past sequence length {past_position} and current sequence length {S + 1}, this model only supports total sequence length <= {self.config.max_seq_len}.')
|
| 186 |
+
pos = torch.arange(past_position, S + past_position, dtype=torch.long, device=input_ids.device).unsqueeze(0)
|
| 187 |
+
if attention_mask is not None:
|
| 188 |
+
pos = torch.clamp(pos - torch.cumsum((~attention_mask).to(torch.int32), dim=1)[:, past_position:], min=0)
|
| 189 |
+
pos_emb = self.wpe(pos)
|
| 190 |
+
x = tok_emb + pos_emb
|
| 191 |
+
if self.embedding_fraction == 1:
|
| 192 |
+
x = self.emb_drop(x)
|
| 193 |
+
else:
|
| 194 |
+
x_shrunk = x * self.embedding_fraction + x.detach() * (1 - self.embedding_fraction)
|
| 195 |
+
assert isinstance(self.emb_drop, nn.Module)
|
| 196 |
+
x = self.emb_drop(x_shrunk)
|
| 197 |
+
(attn_bias, attention_mask) = self._attn_bias(device=x.device, dtype=torch.float32, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id)
|
| 198 |
+
if use_cache and past_key_values is None:
|
| 199 |
+
past_key_values = [() for _ in range(self.config.n_layers)]
|
| 200 |
+
all_hidden_states = () if output_hidden_states else None
|
| 201 |
+
all_self_attns = () if output_attentions else None
|
| 202 |
+
for (b_idx, block) in enumerate(self.blocks):
|
| 203 |
+
if output_hidden_states:
|
| 204 |
+
assert all_hidden_states is not None
|
| 205 |
+
all_hidden_states = all_hidden_states + (x,)
|
| 206 |
+
past_key_value = past_key_values[b_idx] if past_key_values is not None else None
|
| 207 |
+
if self.gradient_checkpointing and self.training:
|
| 208 |
+
(x, attn_weights, past_key_value) = torch.utils.checkpoint.checkpoint(block, x, past_key_value, attn_bias, attention_mask, self.is_causal)
|
| 209 |
+
else:
|
| 210 |
+
(x, attn_weights, past_key_value) = block(x, past_key_value=past_key_value, attn_bias=attn_bias, attention_mask=attention_mask, is_causal=self.is_causal)
|
| 211 |
+
if past_key_values is not None:
|
| 212 |
+
past_key_values[b_idx] = past_key_value
|
| 213 |
+
if output_attentions:
|
| 214 |
+
assert all_self_attns is not None
|
| 215 |
+
all_self_attns = all_self_attns + (attn_weights,)
|
| 216 |
+
x = self.norm_f(x)
|
| 217 |
+
if output_hidden_states:
|
| 218 |
+
assert all_hidden_states is not None
|
| 219 |
+
all_hidden_states = all_hidden_states + (x,)
|
| 220 |
+
return BaseModelOutputWithPast(last_hidden_state=x, past_key_values=past_key_values, hidden_states=all_hidden_states, attentions=all_self_attns)
|
| 221 |
+
|
| 222 |
+
def param_init_fn(self, module):
|
| 223 |
+
init_fn_name = self.config.init_config['name']
|
| 224 |
+
MODEL_INIT_REGISTRY[init_fn_name](module=module, n_layers=self.config.n_layers, d_model=self.config.d_model, **self.config.init_config)
|
| 225 |
+
|
| 226 |
+
def fsdp_wrap_fn(self, module):
|
| 227 |
+
return isinstance(module, MPTBlock)
|
| 228 |
+
|
| 229 |
+
def activation_checkpointing_fn(self, module):
|
| 230 |
+
return isinstance(module, MPTBlock)
|
| 231 |
+
|
| 232 |
+
class MPTForCausalLM(MPTPreTrainedModel):
|
| 233 |
+
|
| 234 |
+
def __init__(self, config: MPTConfig):
|
| 235 |
+
super().__init__(config)
|
| 236 |
+
if not config.tie_word_embeddings:
|
| 237 |
+
raise ValueError('MPTForCausalLM only supports tied word embeddings')
|
| 238 |
+
print(f'Instantiating an MPTForCausalLM model from {__file__}')
|
| 239 |
+
self.transformer = MPTModel(config)
|
| 240 |
+
for child in self.transformer.children():
|
| 241 |
+
if isinstance(child, torch.nn.ModuleList):
|
| 242 |
+
continue
|
| 243 |
+
if isinstance(child, torch.nn.Module):
|
| 244 |
+
child._fsdp_wrap = True
|
| 245 |
+
self.logit_scale = None
|
| 246 |
+
if config.logit_scale is not None:
|
| 247 |
+
logit_scale = config.logit_scale
|
| 248 |
+
if isinstance(logit_scale, str):
|
| 249 |
+
if logit_scale == 'inv_sqrt_d_model':
|
| 250 |
+
logit_scale = 1 / math.sqrt(config.d_model)
|
| 251 |
+
else:
|
| 252 |
+
raise ValueError(f"logit_scale={logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'.")
|
| 253 |
+
self.logit_scale = logit_scale
|
| 254 |
+
|
| 255 |
+
def get_input_embeddings(self):
|
| 256 |
+
return self.transformer.wte
|
| 257 |
+
|
| 258 |
+
def set_input_embeddings(self, value):
|
| 259 |
+
self.transformer.wte = value
|
| 260 |
+
|
| 261 |
+
def get_output_embeddings(self):
|
| 262 |
+
return self.transformer.wte
|
| 263 |
+
|
| 264 |
+
def set_output_embeddings(self, new_embeddings):
|
| 265 |
+
self.transformer.wte = new_embeddings
|
| 266 |
+
|
| 267 |
+
def set_decoder(self, decoder):
|
| 268 |
+
self.transformer = decoder
|
| 269 |
+
|
| 270 |
+
def get_decoder(self):
|
| 271 |
+
return self.transformer
|
| 272 |
+
|
| 273 |
+
def forward(self, input_ids: torch.LongTensor, past_key_values: Optional[List[Tuple[torch.FloatTensor]]]=None, attention_mask: Optional[torch.ByteTensor]=None, prefix_mask: Optional[torch.ByteTensor]=None, sequence_id: Optional[torch.LongTensor]=None, labels: Optional[torch.LongTensor]=None, return_dict: Optional[bool]=None, output_attentions: Optional[bool]=None, output_hidden_states: Optional[bool]=None, use_cache: Optional[bool]=None, inputs_embeds: Optional[torch.FloatTensor]=None):
|
| 274 |
+
return_dict = return_dict if return_dict is not None else self.config.return_dict
|
| 275 |
+
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
| 276 |
+
if inputs_embeds is not None:
|
| 277 |
+
raise NotImplementedError('inputs_embeds has to be None (for hf/peft support).')
|
| 278 |
+
outputs = self.transformer(input_ids=input_ids, past_key_values=past_key_values, attention_mask=attention_mask, prefix_mask=prefix_mask, sequence_id=sequence_id, return_dict=return_dict, output_attentions=output_attentions, output_hidden_states=output_hidden_states, use_cache=use_cache)
|
| 279 |
+
logits = self.transformer.wte(outputs.last_hidden_state.to(self.transformer.wte.weight.device), True)
|
| 280 |
+
if self.logit_scale is not None:
|
| 281 |
+
if self.logit_scale == 0:
|
| 282 |
+
warnings.warn(f'Multiplying logits by self.logit_scale={self.logit_scale!r}. This will produce uniform (uninformative) outputs.')
|
| 283 |
+
logits *= self.logit_scale
|
| 284 |
+
loss = None
|
| 285 |
+
if labels is not None:
|
| 286 |
+
labels = torch.roll(labels, shifts=-1)
|
| 287 |
+
labels[:, -1] = -100
|
| 288 |
+
loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.to(logits.device).view(-1))
|
| 289 |
+
return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=outputs.past_key_values, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
|
| 290 |
+
|
| 291 |
+
def param_init_fn(self, module):
|
| 292 |
+
init_fn_name = self.config.init_config['name']
|
| 293 |
+
MODEL_INIT_REGISTRY[init_fn_name](module=module, n_layers=self.config.n_layers, d_model=self.config.d_model, **self.config.init_config)
|
| 294 |
+
|
| 295 |
+
def fsdp_wrap_fn(self, module):
|
| 296 |
+
return isinstance(module, MPTBlock)
|
| 297 |
+
|
| 298 |
+
def activation_checkpointing_fn(self, module):
|
| 299 |
+
return isinstance(module, MPTBlock)
|
| 300 |
+
|
| 301 |
+
def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
|
| 302 |
+
if inputs_embeds is not None:
|
| 303 |
+
raise NotImplementedError('inputs_embeds is not implemented for MPT yet')
|
| 304 |
+
attention_mask = kwargs['attention_mask'].bool()
|
| 305 |
+
if attention_mask[:, -1].sum() != attention_mask.shape[0]:
|
| 306 |
+
raise NotImplementedError('MPT does not support generation with right padding.')
|
| 307 |
+
if self.transformer.attn_uses_sequence_id and self.training:
|
| 308 |
+
sequence_id = torch.zeros_like(input_ids[:1])
|
| 309 |
+
else:
|
| 310 |
+
sequence_id = None
|
| 311 |
+
if past_key_values is not None:
|
| 312 |
+
input_ids = input_ids[:, -1].unsqueeze(-1)
|
| 313 |
+
if self.transformer.prefix_lm:
|
| 314 |
+
prefix_mask = torch.ones_like(attention_mask)
|
| 315 |
+
if kwargs.get('use_cache') == False:
|
| 316 |
+
raise NotImplementedError('MPT with prefix_lm=True does not support use_cache=False.')
|
| 317 |
+
else:
|
| 318 |
+
prefix_mask = None
|
| 319 |
+
return {'input_ids': input_ids, 'attention_mask': attention_mask, 'prefix_mask': prefix_mask, 'sequence_id': sequence_id, 'past_key_values': past_key_values, 'use_cache': kwargs.get('use_cache', True)}
|
| 320 |
+
|
| 321 |
+
@staticmethod
|
| 322 |
+
def _reorder_cache(past_key_values, beam_idx):
|
| 323 |
+
"""Used by HuggingFace generate when using beam search with kv-caching.
|
| 324 |
+
|
| 325 |
+
See https://github.com/huggingface/transformers/blob/3ec7a47664ebe40c40f4b722f6bb1cd30c3821ec/src/transformers/models/gpt2/modeling_gpt2.py#L1122-L1133
|
| 326 |
+
for an example in transformers.
|
| 327 |
+
"""
|
| 328 |
+
reordered_past = []
|
| 329 |
+
for layer_past in past_key_values:
|
| 330 |
+
reordered_past += [tuple((past_state.index_select(0, beam_idx) for past_state in layer_past))]
|
| 331 |
+
return reordered_past
|
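A small check of the label handling in `MPTForCausalLM.forward` above: rolling the labels left by one position and masking the final slot with -100 is equivalent to the usual shift-logits/shift-labels formulation (toy shapes; illustrative only):

```python
import torch
import torch.nn.functional as F

logits = torch.randn(1, 5, 11)            # (batch, seq, vocab)
labels = torch.randint(0, 11, (1, 5))

# Roll-based target construction, as in the forward method above.
rolled = torch.roll(labels, shifts=-1)
rolled[:, -1] = -100                       # last position has no next token
loss_roll = F.cross_entropy(logits.view(-1, 11), rolled.view(-1))

# Conventional next-token formulation for comparison.
loss_shift = F.cross_entropy(logits[:, :-1].reshape(-1, 11), labels[:, 1:].reshape(-1))
print(torch.allclose(loss_roll, loss_shift))  # True
```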
VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/language_model/mpt/norm.py
ADDED
|
@@ -0,0 +1,56 @@
| 1 |
+
import torch
|
| 2 |
+
|
| 3 |
+
def _cast_if_autocast_enabled(tensor):
|
| 4 |
+
if torch.is_autocast_enabled():
|
| 5 |
+
if tensor.device.type == 'cuda':
|
| 6 |
+
dtype = torch.get_autocast_gpu_dtype()
|
| 7 |
+
elif tensor.device.type == 'cpu':
|
| 8 |
+
dtype = torch.get_autocast_cpu_dtype()
|
| 9 |
+
else:
|
| 10 |
+
raise NotImplementedError()
|
| 11 |
+
return tensor.to(dtype=dtype)
|
| 12 |
+
return tensor
|
| 13 |
+
|
| 14 |
+
class LPLayerNorm(torch.nn.LayerNorm):
|
| 15 |
+
|
| 16 |
+
def __init__(self, normalized_shape, eps=1e-05, elementwise_affine=True, device=None, dtype=None):
|
| 17 |
+
super().__init__(normalized_shape=normalized_shape, eps=eps, elementwise_affine=elementwise_affine, device=device, dtype=dtype)
|
| 18 |
+
|
| 19 |
+
def forward(self, x):
|
| 20 |
+
module_device = x.device
|
| 21 |
+
downcast_x = _cast_if_autocast_enabled(x)
|
| 22 |
+
downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
|
| 23 |
+
downcast_bias = _cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias
|
| 24 |
+
with torch.autocast(enabled=False, device_type=module_device.type):
|
| 25 |
+
return torch.nn.functional.layer_norm(downcast_x, self.normalized_shape, downcast_weight, downcast_bias, self.eps)
|
| 26 |
+
|
| 27 |
+
def rms_norm(x, weight=None, eps=1e-05):
|
| 28 |
+
output = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
|
| 29 |
+
if weight is not None:
|
| 30 |
+
return output * weight
|
| 31 |
+
return output
|
| 32 |
+
|
| 33 |
+
class RMSNorm(torch.nn.Module):
|
| 34 |
+
|
| 35 |
+
def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None):
|
| 36 |
+
super().__init__()
|
| 37 |
+
self.eps = eps
|
| 38 |
+
if weight:
|
| 39 |
+
self.weight = torch.nn.Parameter(torch.ones(normalized_shape, dtype=dtype, device=device))
|
| 40 |
+
else:
|
| 41 |
+
self.register_parameter('weight', None)
|
| 42 |
+
|
| 43 |
+
def forward(self, x):
|
| 44 |
+
return rms_norm(x.float(), self.weight, self.eps).to(dtype=x.dtype)
|
| 45 |
+
|
| 46 |
+
class LPRMSNorm(RMSNorm):
|
| 47 |
+
|
| 48 |
+
def __init__(self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None):
|
| 49 |
+
super().__init__(normalized_shape=normalized_shape, eps=eps, weight=weight, dtype=dtype, device=device)
|
| 50 |
+
|
| 51 |
+
def forward(self, x):
|
| 52 |
+
downcast_x = _cast_if_autocast_enabled(x)
|
| 53 |
+
downcast_weight = _cast_if_autocast_enabled(self.weight) if self.weight is not None else self.weight
|
| 54 |
+
with torch.autocast(enabled=False, device_type=x.device.type):
|
| 55 |
+
return rms_norm(downcast_x, downcast_weight, self.eps).to(dtype=x.dtype)
|
| 56 |
+
NORM_CLASS_REGISTRY = {'layernorm': torch.nn.LayerNorm, 'low_precision_layernorm': LPLayerNorm, 'rmsnorm': RMSNorm, 'low_precision_rmsnorm': LPRMSNorm}
|
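NORM_CLASS_REGISTRY is what MPT's `norm_type` config string resolves to. A minimal usage sketch (module path taken from the repo layout above; the dimension is chosen for illustration):

import torch
from llava.model.language_model.mpt.norm import NORM_CLASS_REGISTRY

norm_cls = NORM_CLASS_REGISTRY['rmsnorm']   # -> RMSNorm
norm = norm_cls(512)
x = torch.randn(2, 16, 512)
print(norm(x).shape)   # torch.Size([2, 16, 512])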
VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/language_model/mpt/param_init_fns.py
ADDED
@@ -0,0 +1,181 @@
import math
import warnings
from collections.abc import Sequence
from functools import partial
from typing import Optional, Tuple, Union
import torch
from torch import nn
from .norm import NORM_CLASS_REGISTRY

def torch_default_param_init_fn_(module: nn.Module, verbose: int=0, **kwargs):
    del kwargs
    if verbose > 1:
        warnings.warn(f"Initializing network using module's reset_parameters attribute")
    if hasattr(module, 'reset_parameters'):
        module.reset_parameters()

def fused_init_helper_(module: nn.Module, init_fn_):
    _fused = getattr(module, '_fused', None)
    if _fused is None:
        raise RuntimeError(f'Internal logic error')
    (dim, splits) = _fused
    splits = (0, *splits, module.weight.size(dim))
    for (s, e) in zip(splits[:-1], splits[1:]):
        slice_indices = [slice(None)] * module.weight.ndim
        slice_indices[dim] = slice(s, e)
        init_fn_(module.weight[slice_indices])

def generic_param_init_fn_(module: nn.Module, init_fn_, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
    del kwargs
    if verbose > 1:
        warnings.warn(f'If model has bias parameters they are initialized to 0.')
    init_div_is_residual = init_div_is_residual
    if init_div_is_residual is False:
        div_is_residual = 1.0
    elif init_div_is_residual is True:
        div_is_residual = math.sqrt(2 * n_layers)
    elif isinstance(init_div_is_residual, float) or isinstance(init_div_is_residual, int):
        div_is_residual = init_div_is_residual
    elif isinstance(init_div_is_residual, str) and init_div_is_residual.isnumeric():
        div_is_residual = float(init_div_is_residual)
    else:
        div_is_residual = 1.0
        raise ValueError(f'Expected init_div_is_residual to be boolean or numeric, got {init_div_is_residual}')
    if init_div_is_residual is not False:
        if verbose > 1:
            warnings.warn(f'Initializing _is_residual layers then dividing them by {div_is_residual:.3f}. ' + f'Set `init_div_is_residual: false` in init config to disable this.')
    if isinstance(module, nn.Linear):
        if hasattr(module, '_fused'):
            fused_init_helper_(module, init_fn_)
        else:
            init_fn_(module.weight)
        if module.bias is not None:
            torch.nn.init.zeros_(module.bias)
        if init_div_is_residual is not False and getattr(module, '_is_residual', False):
            with torch.no_grad():
                module.weight.div_(div_is_residual)
    elif isinstance(module, nn.Embedding):
        if emb_init_std is not None:
            std = emb_init_std
            if std == 0:
                warnings.warn(f'Embedding layer initialized to 0.')
            emb_init_fn_ = partial(torch.nn.init.normal_, mean=0.0, std=std)
            if verbose > 1:
                warnings.warn(f'Embedding layer initialized using normal distribution with mean=0 and std={std!r}.')
        elif emb_init_uniform_lim is not None:
            lim = emb_init_uniform_lim
            if isinstance(lim, Sequence):
                if len(lim) > 2:
                    raise ValueError(f'Uniform init requires a min and a max limit. User input: {lim}.')
                if lim[0] == lim[1]:
                    warnings.warn(f'Embedding layer initialized to {lim[0]}.')
            else:
                if lim == 0:
                    warnings.warn(f'Embedding layer initialized to 0.')
                lim = [-lim, lim]
            (a, b) = lim
            emb_init_fn_ = partial(torch.nn.init.uniform_, a=a, b=b)
            if verbose > 1:
                warnings.warn(f'Embedding layer initialized using uniform distribution in range {lim}.')
        else:
            emb_init_fn_ = init_fn_
        emb_init_fn_(module.weight)
    elif isinstance(module, tuple(set(NORM_CLASS_REGISTRY.values()))):
        if verbose > 1:
            warnings.warn(f'Norm weights are set to 1. If norm layer has a bias it is initialized to 0.')
        if hasattr(module, 'weight') and module.weight is not None:
            torch.nn.init.ones_(module.weight)
        if hasattr(module, 'bias') and module.bias is not None:
            torch.nn.init.zeros_(module.bias)
    elif isinstance(module, nn.MultiheadAttention):
        if module._qkv_same_embed_dim:
            assert module.in_proj_weight is not None
            assert module.q_proj_weight is None and module.k_proj_weight is None and (module.v_proj_weight is None)
            assert d_model is not None
            _d = d_model
            splits = (0, _d, 2 * _d, 3 * _d)
            for (s, e) in zip(splits[:-1], splits[1:]):
                init_fn_(module.in_proj_weight[s:e])
        else:
            assert module.q_proj_weight is not None and module.k_proj_weight is not None and (module.v_proj_weight is not None)
            assert module.in_proj_weight is None
            init_fn_(module.q_proj_weight)
            init_fn_(module.k_proj_weight)
            init_fn_(module.v_proj_weight)
        if module.in_proj_bias is not None:
            torch.nn.init.zeros_(module.in_proj_bias)
        if module.bias_k is not None:
            torch.nn.init.zeros_(module.bias_k)
        if module.bias_v is not None:
            torch.nn.init.zeros_(module.bias_v)
        init_fn_(module.out_proj.weight)
        if init_div_is_residual is not False and getattr(module.out_proj, '_is_residual', False):
            with torch.no_grad():
                module.out_proj.weight.div_(div_is_residual)
        if module.out_proj.bias is not None:
            torch.nn.init.zeros_(module.out_proj.bias)
    else:
        for _ in module.parameters(recurse=False):
            raise NotImplementedError(f'{module.__class__.__name__} parameters are not initialized by param_init_fn.')

def _normal_init_(std, mean=0.0):
    return partial(torch.nn.init.normal_, mean=mean, std=std)

def _normal_param_init_fn_(module: nn.Module, std: float, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
    del kwargs
    init_fn_ = _normal_init_(std=std)
    if verbose > 1:
        warnings.warn(f'Using torch.nn.init.normal_ init fn mean=0.0, std={std}')
    generic_param_init_fn_(module=module, init_fn_=init_fn_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)

def baseline_param_init_fn_(module: nn.Module, init_std: float, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
    del kwargs
    if init_std is None:
        raise ValueError("You must set model.init_config['init_std'] to a float value to use the default initialization scheme.")
    _normal_param_init_fn_(module=module, std=init_std, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)

def small_param_init_fn_(module: nn.Module, n_layers: int, d_model: int, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
    del kwargs
    std = math.sqrt(2 / (5 * d_model))
    _normal_param_init_fn_(module=module, std=std, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)

def neox_param_init_fn_(module: nn.Module, n_layers: int, d_model: int, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, verbose: int=0, **kwargs):
    """From section 2.3.1 of GPT-NeoX-20B:

    An Open-Source AutoregressiveLanguage Model — Black et. al. (2022)
    see https://github.com/EleutherAI/gpt-neox/blob/9610391ab319403cef079b438edd016a2443af54/megatron/model/init_functions.py#L151
    and https://github.com/EleutherAI/gpt-neox/blob/main/megatron/model/transformer.py
    """
    del kwargs
    residual_div = n_layers / math.sqrt(10)
    if verbose > 1:
        warnings.warn(f'setting init_div_is_residual to {residual_div}')
    small_param_init_fn_(module=module, d_model=d_model, n_layers=n_layers, init_div_is_residual=residual_div, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)

def kaiming_uniform_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, fan_mode: str='fan_in', init_nonlinearity: str='leaky_relu', verbose: int=0, **kwargs):
    del kwargs
    if verbose > 1:
        warnings.warn(f'Using nn.init.kaiming_uniform_ init fn with parameters: ' + f'a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}')
    kaiming_uniform_ = partial(nn.init.kaiming_uniform_, a=init_gain, mode=fan_mode, nonlinearity=init_nonlinearity)
    generic_param_init_fn_(module=module, init_fn_=kaiming_uniform_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)

def kaiming_normal_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, fan_mode: str='fan_in', init_nonlinearity: str='leaky_relu', verbose: int=0, **kwargs):
    del kwargs
    if verbose > 1:
        warnings.warn(f'Using nn.init.kaiming_normal_ init fn with parameters: ' + f'a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}')
    kaiming_normal_ = partial(torch.nn.init.kaiming_normal_, a=init_gain, mode=fan_mode, nonlinearity=init_nonlinearity)
    generic_param_init_fn_(module=module, init_fn_=kaiming_normal_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)

def xavier_uniform_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, verbose: int=0, **kwargs):
    del kwargs
    xavier_uniform_ = partial(torch.nn.init.xavier_uniform_, gain=init_gain)
    if verbose > 1:
        warnings.warn(f'Using torch.nn.init.xavier_uniform_ init fn with parameters: ' + f'gain={init_gain}')
    generic_param_init_fn_(module=module, init_fn_=xavier_uniform_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)

def xavier_normal_param_init_fn_(module: nn.Module, n_layers: int, d_model: Optional[int]=None, init_div_is_residual: Union[int, float, str, bool]=True, emb_init_std: Optional[float]=None, emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]]=None, init_gain: float=0, verbose: int=0, **kwargs):
    xavier_normal_ = partial(torch.nn.init.xavier_normal_, gain=init_gain)
    if verbose > 1:
        warnings.warn(f'Using torch.nn.init.xavier_normal_ init fn with parameters: ' + f'gain={init_gain}')
    generic_param_init_fn_(module=module, init_fn_=xavier_normal_, d_model=d_model, n_layers=n_layers, init_div_is_residual=init_div_is_residual, emb_init_std=emb_init_std, emb_init_uniform_lim=emb_init_uniform_lim, verbose=verbose)
MODEL_INIT_REGISTRY = {'default_': torch_default_param_init_fn_, 'baseline_': baseline_param_init_fn_, 'kaiming_uniform_': kaiming_uniform_param_init_fn_, 'kaiming_normal_': kaiming_normal_param_init_fn_, 'neox_init_': neox_param_init_fn_, 'small_init_': small_param_init_fn_, 'xavier_uniform_': xavier_uniform_param_init_fn_, 'xavier_normal_': xavier_normal_param_init_fn_}
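MODEL_INIT_REGISTRY is consumed by MPT's `param_init_fn`, which applies the chosen function to every submodule. A hedged sketch of that per-module pattern (the toy model and hyperparameters are illustrative only):

from functools import partial
import torch.nn as nn
from llava.model.language_model.mpt.param_init_fns import MODEL_INIT_REGISTRY

model = nn.Sequential(nn.Linear(256, 1024), nn.GELU(), nn.Linear(1024, 256))
init_fn = MODEL_INIT_REGISTRY['kaiming_normal_']

# generic_param_init_fn_ handles one module at a time, so apply() visits each child
model.apply(partial(init_fn, n_layers=2, d_model=256))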
VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/multimodal_encoder/builder.py
ADDED
@@ -0,0 +1,12 @@
import os
from .clip_encoder import CLIPVisionTower


def build_vision_tower(vision_tower_cfg, **kwargs):
    vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
    is_absolute_path_exists = os.path.exists(vision_tower)
    if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") \
            or "intern" in vision_tower.lower():
        return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)

    raise ValueError(f'Unknown vision tower: {vision_tower}')
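`build_vision_tower` only needs a config object exposing `mm_vision_tower` (or `vision_tower`) plus the fields `CLIPVisionTower` reads in its constructor. A sketch with a hypothetical namespace (the model name and field values are illustrative):

from types import SimpleNamespace
from llava.model.multimodal_encoder.builder import build_vision_tower

cfg = SimpleNamespace(
    mm_vision_tower='openai/clip-vit-large-patch14-336',  # "openai*", "laion*", *intern*, or a local path
    mm_vision_select_layer=-2,
    mm_vision_select_feature='patch',
)
tower = build_vision_tower(cfg, delay_load=True)  # delay_load defers weight loading to load_model()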
VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/multimodal_encoder/clip_encoder.py
ADDED
@@ -0,0 +1,134 @@
import logging

import torch
import torch.nn as nn

from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
from .eva_clip.configuration_evaclip import EvaCLIPVisionConfig
from .eva_clip.modeling_evaclip import EvaCLIPVisionModel
from .intern_vit_6b.configuration_intern_vit import InternVisionConfig
from .intern_vit_6b.modeling_intern_vit import InternVisionModel
from .internvl_14b.configuration_internvl import InternVLConfig
from .internvl_14b.modeling_internvl import InternVLModel


def is_intern_vit_6b_model(vision_tower_name):
    model_names = ["intern_vit_6b", "internvit_6b", "InternViT-6B", "internvit6b"]
    return any(name in vision_tower_name for name in model_names)


def is_internvl_14b_model(vision_tower_name):
    model_names = ["internvl_14b", "intern_vl_14b", "InternVL-14B", "internvl14b"]
    return any(name in vision_tower_name for name in model_names)


class CLIPVisionTower(nn.Module):
    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__()

        self.is_loaded = False

        self.vision_tower_name = vision_tower
        self.select_layer = args.mm_vision_select_layer
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        if not delay_load:
            self.load_model()
        else:
            if "EVA" in self.vision_tower_name or "eva" in self.vision_tower_name:
                self.cfg_only = EvaCLIPVisionConfig.from_pretrained(self.vision_tower_name)
            elif is_intern_vit_6b_model(self.vision_tower_name):
                self.cfg_only = InternVisionConfig.from_pretrained(self.vision_tower_name)
            elif is_internvl_14b_model(self.vision_tower_name):
                self.cfg_only = InternVLConfig.from_pretrained(self.vision_tower_name)
            else:
                self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)

    def load_model(self):
        if "EVA" in self.vision_tower_name or "eva" in self.vision_tower_name:
            self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
            self.vision_tower = EvaCLIPVisionModel.from_pretrained(self.vision_tower_name)
        elif is_intern_vit_6b_model(self.vision_tower_name):
            crop_size = 448 if "448" in self.vision_tower_name else 336
            self.image_processor = CLIPImageProcessor(
                crop_size=crop_size, do_center_crop=True, do_normalize=True, do_resize=True,
                image_mean=[0.485, 0.456, 0.406], image_std=[0.229, 0.224, 0.225], size=crop_size
            )
            self.vision_tower = InternVisionModel.from_pretrained(self.vision_tower_name)
        elif is_internvl_14b_model(self.vision_tower_name):
            self.image_processor = CLIPImageProcessor(
                crop_size=336, do_center_crop=True, do_normalize=True, do_resize=True,
                image_mean=[0.485, 0.456, 0.406], image_std=[0.229, 0.224, 0.225], size=336
            )
            self.vision_tower = InternVLModel.from_pretrained(self.vision_tower_name)
            self.vision_tower.eval()
        else:
            self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
            self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name)
        self.vision_tower.requires_grad_(False)

        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        image_features = image_forward_outs.hidden_states[self.select_layer]
        if self.select_feature == 'patch':
            image_features = image_features[:, 1:]
        elif self.select_feature == 'cls_patch':
            image_features = image_features
        else:
            raise ValueError(f'Unexpected select feature: {self.select_feature}')
        return image_features

    @torch.no_grad()
    def forward(self, images):
        if type(images) is list:
            image_features = []
            for image in images:
                if is_internvl_14b_model(self.vision_tower_name):
                    image_forward_out, query_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
                    image_feature = self.feature_select(image_forward_out).to(image.dtype)
                    image_features.append([image_feature, query_out])
                else:
                    image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
                    image_feature = self.feature_select(image_forward_out).to(image.dtype)
                    image_features.append(image_feature)
        else:
            if is_internvl_14b_model(self.vision_tower_name):
                image_forward_outs, query_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
                image_features = self.feature_select(image_forward_outs).to(images.dtype)
                image_features = [image_features, query_outs]
            else:
                image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
                image_features = self.feature_select(image_forward_outs).to(images.dtype)

        return image_features

    @property
    def dummy_feature(self):
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        if self.is_loaded:
            return self.vision_tower.config
        else:
            return self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches(self):
        if is_internvl_14b_model(self.vision_tower_name):
            return (self.config.image_size // self.config.patch_size) ** 2 + 96
        else:
            return (self.config.image_size // self.config.patch_size) ** 2
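For orientation on output shapes: `feature_select` takes `hidden_states[select_layer]` and, with `select_feature='patch'`, drops the CLS token, so the token count matches the `num_patches` property. A small arithmetic sketch (geometry values assumed for a 336px, patch-14 CLIP tower, not taken from the diff):

image_size, patch_size = 336, 14
num_patches = (image_size // patch_size) ** 2   # 576, as in CLIPVisionTower.num_patches
# with select_feature='patch', forward(images[B, 3, 336, 336]) returns [B, 576, hidden_size]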
VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/multimodal_encoder/eva_clip/configuration_evaclip.py
ADDED
@@ -0,0 +1,425 @@
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" EvaCLIP model configuration"""
# Code mainly copied here: https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/configuration_clip.py
# and adjusted for evaclip

import copy
import os
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union


if TYPE_CHECKING:
    from transformers.processing_utils import ProcessorMixin
    from transformers.utils import TensorType

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)


class EvaCLIPTextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`CLIPTextModel`]. It is used to instantiate a CLIP
    text encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the text encoder of the CLIP
    [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 49408):
            Vocabulary size of the CLIP text model. Defines the number of different tokens that can be represented by
            the `inputs_ids` passed when calling [`CLIPModel`].
        hidden_size (`int`, *optional*, defaults to 512):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        max_position_embeddings (`int`, *optional*, defaults to 77):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).

    Example:

    ```python
    >>> from transformers import CLIPTextConfig, CLIPTextModel

    >>> # Initializing a CLIPTextConfig with openai/clip-vit-base-patch32 style configuration
    >>> configuration = CLIPTextConfig()

    >>> # Initializing a CLIPTextModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
    >>> model = CLIPTextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
    model_type = "clip_text_model"

    def __init__(
        self,
        vocab_size=49408,
        hidden_size=512,
        intermediate_size=2048,
        projection_dim=512,
        num_hidden_layers=12,
        num_attention_heads=8,
        max_position_embeddings=77,
        hidden_act="gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        q_bias=True,
        k_bias=True,
        v_bias=True,
        post_layernorm=False,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        **kwargs,
    ):
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.max_position_embeddings = max_position_embeddings
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.q_bias=q_bias
        self.k_bias=k_bias
        self.v_bias=v_bias
        self.post_layernorm = post_layernorm
        self.attention_dropout = attention_dropout

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # get the text config dict if we are loading from CLIPConfig
        if config_dict.get("model_type") == "clip":
            config_dict = config_dict["text_config"]

        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)


class EvaCLIPVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`CLIPVisionModel`]. It is used to instantiate a
    CLIP vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the vision encoder of the CLIP
    [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 32):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).

    Example:

    ```python
    >>> from transformers import CLIPVisionConfig, CLIPVisionModel

    >>> # Initializing a CLIPVisionConfig with openai/clip-vit-base-patch32 style configuration
    >>> configuration = CLIPVisionConfig()

    >>> # Initializing a CLIPVisionModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
    >>> model = CLIPVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "clip_vision_model"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        projection_dim=512,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=224,
        patch_size=32,
        hidden_act="gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        q_bias=True,
        k_bias=True,
        v_bias=True,
        post_layernorm=False,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.q_bias=q_bias
        self.k_bias=k_bias
        self.v_bias=v_bias
        self.post_layernorm = post_layernorm
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # get the vision config dict if we are loading from CLIPConfig
        if config_dict.get("model_type") == "clip":
            config_dict = config_dict["vision_config"]

        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)


class EvaCLIPConfig(PretrainedConfig):
    r"""
    [`CLIPConfig`] is the configuration class to store the configuration of a [`CLIPModel`]. It is used to instantiate
    a CLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating
    a configuration with the defaults will yield a similar configuration to that of the CLIP
    [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`CLIPTextConfig`].
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
        projection_dim (`int`, *optional*, defaults to 512):
            Dimentionality of text and vision projection layers.
        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
            The inital value of the *logit_scale* paramter. Default is used as per the original CLIP implementation.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import CLIPConfig, CLIPModel

    >>> # Initializing a CLIPConfig with openai/clip-vit-base-patch32 style configuration
    >>> configuration = CLIPConfig()

    >>> # Initializing a CLIPModel (with random weights) from the openai/clip-vit-base-patch32 style configuration
    >>> model = CLIPModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a CLIPConfig from a CLIPTextConfig and a CLIPVisionConfig
    >>> from transformers import CLIPTextConfig, CLIPVisionConfig

    >>> # Initializing a CLIPText and CLIPVision configuration
    >>> config_text = CLIPTextConfig()
    >>> config_vision = CLIPVisionConfig()

    >>> config = CLIPConfig.from_text_vision_configs(config_text, config_vision)
    ```"""

    model_type = "clip"
    is_composition = True

    def __init__(
        self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
    ):
        # If `_config_dict` exist, we use them for the backward compatibility.
        # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot
        # of confusion!).
        text_config_dict = kwargs.pop("text_config_dict", None)
        vision_config_dict = kwargs.pop("vision_config_dict", None)

        super().__init__(**kwargs)

        # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in
        # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most
        # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`.
        if text_config_dict is not None:
            if text_config is None:
                text_config = {}

            # This is the complete result when using `text_config_dict`.
            _text_config_dict = EvaCLIPTextConfig(**text_config_dict).to_dict()

            # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different.
            for key, value in _text_config_dict.items():
                if key in text_config and value != text_config[key] and key not in ["transformers_version"]:
                    # If specified in `text_config_dict`
                    if key in text_config_dict:
                        message = (
                            f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. "
                            f'The value `text_config_dict["{key}"]` will be used instead.'
                        )
                    # If inferred from default argument values (just to be super careful)
                    else:
                        message = (
                            f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The "
                            f'value `text_config["{key}"]` will be overriden.'
                        )
                    logger.warning(message)

            # Update all values in `text_config` with the ones in `_text_config_dict`.
            text_config.update(_text_config_dict)

        if vision_config_dict is not None:
            if vision_config is None:
                vision_config = {}

            # This is the complete result when using `vision_config_dict`.
            _vision_config_dict = EvaCLIPVisionConfig(**vision_config_dict).to_dict()
            # convert keys to string instead of integer
            if "id2label" in _vision_config_dict:
                _vision_config_dict["id2label"] = {
                    str(key): value for key, value in _vision_config_dict["id2label"].items()
                }

            # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but being different.
            for key, value in _vision_config_dict.items():
                if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]:
                    # If specified in `vision_config_dict`
                    if key in vision_config_dict:
                        message = (
                            f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different "
                            f'values. The value `vision_config_dict["{key}"]` will be used instead.'
                        )
                    # If inferred from default argument values (just to be super careful)
                    else:
                        message = (
                            f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. "
                            f'The value `vision_config["{key}"]` will be overriden.'
                        )
                    logger.warning(message)

            # Update all values in `vision_config` with the ones in `_vision_config_dict`.
            vision_config.update(_vision_config_dict)

        if text_config is None:
            text_config = {}
            logger.info("`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.")

        if vision_config is None:
            vision_config = {}
            logger.info("`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.")

        self.text_config = EvaCLIPTextConfig(**text_config)
        self.vision_config = EvaCLIPVisionConfig(**vision_config)

        self.projection_dim = projection_dim
        self.logit_scale_init_value = logit_scale_init_value
        self.initializer_factor = 1.0

    @classmethod
    def from_text_vision_configs(cls, text_config: EvaCLIPTextConfig, vision_config: EvaCLIPVisionConfig, **kwargs):
        r"""
        Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and clip vision model
        configuration.

        Returns:
            [`CLIPConfig`]: An instance of a configuration object
        """

        return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].

        Returns:
            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
        """
        output = copy.deepcopy(self.__dict__)
        output["text_config"] = self.text_config.to_dict()
        output["vision_config"] = self.vision_config.to_dict()
        output["model_type"] = self.__class__.model_type
        return output
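Since these classes mirror the upstream CLIP configs, they compose the same way. A hedged sketch (dimension values made up for illustration):

from llava.model.multimodal_encoder.eva_clip.configuration_evaclip import (
    EvaCLIPTextConfig, EvaCLIPVisionConfig, EvaCLIPConfig,
)

text_cfg = EvaCLIPTextConfig(hidden_size=768, num_hidden_layers=12)
vision_cfg = EvaCLIPVisionConfig(hidden_size=1024, patch_size=14, image_size=224)
clip_cfg = EvaCLIPConfig.from_text_vision_configs(text_cfg, vision_cfg, projection_dim=768)

print(clip_cfg.vision_config.patch_size)    # 14
print(clip_cfg.to_dict()['model_type'])     # "clip"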
VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/multimodal_encoder/eva_clip/modeling_evaclip.py
ADDED
@@ -0,0 +1,1428 @@
# coding=utf-8
# Copyright 2021 The OpenAI Team Authors and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch EvaCLIP model."""
# Code mainly taken from https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/modeling_clip.py#L943
# and adjusteed for EvaClip


from dataclasses import dataclass
from typing import Any, Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from torch import nn

from transformers.activations import ACT2FN
from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from .configuration_evaclip import EvaCLIPConfig, EvaCLIPTextConfig, EvaCLIPVisionConfig

logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "QuanSun/EVA02_CLIP_E_psz14_plus_s9B"

Eva_CLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "EVA02_CLIP_E_psz14_plus_s9B",
]

# Copied from transformers.models.bart.modeling_bart._expand_mask
def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
    """
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    """
    bsz, src_len = mask.size()
    tgt_len = tgt_len if tgt_len is not None else src_len

    expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)

    inverted_mask = 1.0 - expanded_mask

    return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)


# contrastive loss function, adapted from
# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
|
| 70 |
+
image_loss = contrastive_loss(similarity.t())
|
| 71 |
+
return (caption_loss + image_loss) / 2.0
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
@dataclass
|
| 75 |
+
class EvaCLIPVisionModelOutput(ModelOutput):
|
| 76 |
+
"""
|
| 77 |
+
Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
|
| 78 |
+
|
| 79 |
+
Args:
|
| 80 |
+
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
| 81 |
+
The image embeddings obtained by applying the projection layer to the pooler_output.
|
| 82 |
+
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
| 83 |
+
Sequence of hidden-states at the output of the last layer of the model.
|
| 84 |
+
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
| 85 |
+
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
| 86 |
+
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
| 87 |
+
|
| 88 |
+
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
| 89 |
+
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
| 90 |
+
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
| 91 |
+
sequence_length)`.
|
| 92 |
+
|
| 93 |
+
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
| 94 |
+
heads.
|
| 95 |
+
"""
|
| 96 |
+
|
| 97 |
+
image_embeds: Optional[torch.FloatTensor] = None
|
| 98 |
+
last_hidden_state: torch.FloatTensor = None
|
| 99 |
+
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
| 100 |
+
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
@dataclass
|
| 104 |
+
class EvaCLIPTextModelOutput(ModelOutput):
|
| 105 |
+
"""
|
| 106 |
+
Base class for text model's outputs that also contains a pooling of the last hidden states.
|
| 107 |
+
|
| 108 |
+
Args:
|
| 109 |
+
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
|
| 110 |
+
The text embeddings obtained by applying the projection layer to the pooler_output.
|
| 111 |
+
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
| 112 |
+
Sequence of hidden-states at the output of the last layer of the model.
|
| 113 |
+
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
| 114 |
+
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
| 115 |
+
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
| 116 |
+
|
| 117 |
+
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
| 118 |
+
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
| 119 |
+
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
| 120 |
+
sequence_length)`.
|
| 121 |
+
|
| 122 |
+
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
| 123 |
+
heads.
|
| 124 |
+
"""
|
| 125 |
+
|
| 126 |
+
text_embeds: Optional[torch.FloatTensor] = None
|
| 127 |
+
last_hidden_state: torch.FloatTensor = None
|
| 128 |
+
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
| 129 |
+
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
@dataclass
|
| 133 |
+
class EvaCLIPOutput(ModelOutput):
|
| 134 |
+
"""
|
| 135 |
+
Args:
|
| 136 |
+
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
|
| 137 |
+
Contrastive loss for image-text similarity.
|
| 138 |
+
logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
|
| 139 |
+
The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
|
| 140 |
+
similarity scores.
|
| 141 |
+
logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
|
| 142 |
+
The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
|
| 143 |
+
similarity scores.
|
| 144 |
+
text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
| 145 |
+
The text embeddings obtained by applying the projection layer to the pooled output of [`EvaCLIPTextModel`].
|
| 146 |
+
image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
|
| 147 |
+
The image embeddings obtained by applying the projection layer to the pooled output of [`EvaCLIPVisionModel`].
|
| 148 |
+
text_model_output(`BaseModelOutputWithPooling`):
|
| 149 |
+
The output of the [`EvaCLIPTextModel`].
|
| 150 |
+
vision_model_output(`BaseModelOutputWithPooling`):
|
| 151 |
+
The output of the [`EvaCLIPVisionModel`].
|
| 152 |
+
"""
|
| 153 |
+
|
| 154 |
+
loss: Optional[torch.FloatTensor] = None
|
| 155 |
+
logits_per_image: torch.FloatTensor = None
|
| 156 |
+
logits_per_text: torch.FloatTensor = None
|
| 157 |
+
text_embeds: torch.FloatTensor = None
|
| 158 |
+
image_embeds: torch.FloatTensor = None
|
| 159 |
+
text_model_output: BaseModelOutputWithPooling = None
|
| 160 |
+
vision_model_output: BaseModelOutputWithPooling = None
|
| 161 |
+
|
| 162 |
+
def to_tuple(self) -> Tuple[Any]:
|
| 163 |
+
return tuple(
|
| 164 |
+
self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
|
| 165 |
+
for k in self.keys()
|
| 166 |
+
)
|
| 167 |
+
|
| 168 |
+
|
| 169 |
+
class EvaCLIPVisionEmbeddings(nn.Module):
|
| 170 |
+
def __init__(self, config: EvaCLIPVisionConfig):
|
| 171 |
+
super().__init__()
|
| 172 |
+
self.config = config
|
| 173 |
+
self.embed_dim = config.hidden_size
|
| 174 |
+
self.image_size = config.image_size
|
| 175 |
+
self.patch_size = config.patch_size
|
| 176 |
+
|
| 177 |
+
self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))
|
| 178 |
+
|
| 179 |
+
self.patch_embedding = nn.Conv2d(
|
| 180 |
+
in_channels=config.num_channels,
|
| 181 |
+
out_channels=self.embed_dim,
|
| 182 |
+
kernel_size=self.patch_size,
|
| 183 |
+
stride=self.patch_size,
|
| 184 |
+
bias=True,
|
| 185 |
+
)
|
| 186 |
+
|
| 187 |
+
self.num_patches = (self.image_size // self.patch_size) ** 2
|
| 188 |
+
self.num_positions = self.num_patches + 1
|
| 189 |
+
self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
|
| 190 |
+
self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent = False)
|
| 191 |
+
|
| 192 |
+
def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
|
| 193 |
+
batch_size = pixel_values.shape[0]
|
| 194 |
+
patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid]
|
| 195 |
+
patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
|
| 196 |
+
|
| 197 |
+
class_embeds = self.class_embedding.expand(batch_size, 1, -1)
|
| 198 |
+
embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
|
| 199 |
+
embeddings = embeddings + self.position_embedding(self.position_ids)
|
| 200 |
+
return embeddings
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
class EvaCLIPTextEmbeddings(nn.Module):
|
| 204 |
+
def __init__(self, config: EvaCLIPTextConfig):
|
| 205 |
+
super().__init__()
|
| 206 |
+
embed_dim = config.hidden_size
|
| 207 |
+
|
| 208 |
+
self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
|
| 209 |
+
self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
|
| 210 |
+
|
| 211 |
+
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
|
| 212 |
+
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False)
|
| 213 |
+
|
| 214 |
+
def forward(
|
| 215 |
+
self,
|
| 216 |
+
input_ids: Optional[torch.LongTensor] = None,
|
| 217 |
+
position_ids: Optional[torch.LongTensor] = None,
|
| 218 |
+
inputs_embeds: Optional[torch.FloatTensor] = None,
|
| 219 |
+
) -> torch.Tensor:
|
| 220 |
+
seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
|
| 221 |
+
|
| 222 |
+
if position_ids is None:
|
| 223 |
+
position_ids = self.position_ids[:, :seq_length]
|
| 224 |
+
|
| 225 |
+
if inputs_embeds is None:
|
| 226 |
+
inputs_embeds = self.token_embedding(input_ids)
|
| 227 |
+
|
| 228 |
+
position_embeddings = self.position_embedding(position_ids)
|
| 229 |
+
embeddings = inputs_embeds + position_embeddings
|
| 230 |
+
|
| 231 |
+
return embeddings
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
class EvaCLIPAttention(nn.Module):
|
| 235 |
+
"""Multi-headed attention from 'Attention Is All You Need' paper"""
|
| 236 |
+
|
| 237 |
+
def __init__(self, config):
|
| 238 |
+
super().__init__()
|
| 239 |
+
self.config = config
|
| 240 |
+
self.embed_dim = config.hidden_size
|
| 241 |
+
self.num_heads = config.num_attention_heads
|
| 242 |
+
self.head_dim = self.embed_dim // self.num_heads
|
| 243 |
+
if self.head_dim * self.num_heads != self.embed_dim:
|
| 244 |
+
raise ValueError(
|
| 245 |
+
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
|
| 246 |
+
f" {self.num_heads})."
|
| 247 |
+
)
|
| 248 |
+
self.scale = self.head_dim**-0.5
|
| 249 |
+
self.dropout = config.attention_dropout
|
| 250 |
+
self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.k_bias)
|
| 251 |
+
self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.v_bias)
|
| 252 |
+
self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.q_bias)
|
| 253 |
+
self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)
|
| 254 |
+
|
| 255 |
+
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
|
| 256 |
+
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
|
| 257 |
+
|
| 258 |
+
def forward(
|
| 259 |
+
self,
|
| 260 |
+
hidden_states: torch.Tensor,
|
| 261 |
+
attention_mask: Optional[torch.Tensor] = None,
|
| 262 |
+
causal_attention_mask: Optional[torch.Tensor] = None,
|
| 263 |
+
output_attentions: Optional[bool] = False,
|
| 264 |
+
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
| 265 |
+
"""Input shape: Batch x Time x Channel"""
|
| 266 |
+
|
| 267 |
+
bsz, tgt_len, embed_dim = hidden_states.size()
|
| 268 |
+
|
| 269 |
+
# get query proj
|
| 270 |
+
query_states = self.q_proj(hidden_states) * self.scale
|
| 271 |
+
key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
|
| 272 |
+
value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
|
| 273 |
+
|
| 274 |
+
proj_shape = (bsz * self.num_heads, -1, self.head_dim)
|
| 275 |
+
query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
|
| 276 |
+
key_states = key_states.view(*proj_shape)
|
| 277 |
+
value_states = value_states.view(*proj_shape)
|
| 278 |
+
|
| 279 |
+
src_len = key_states.size(1)
|
| 280 |
+
attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
|
| 281 |
+
|
| 282 |
+
if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
|
| 283 |
+
raise ValueError(
|
| 284 |
+
f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
|
| 285 |
+
f" {attn_weights.size()}"
|
| 286 |
+
)
|
| 287 |
+
|
| 288 |
+
# apply the causal_attention_mask first
|
| 289 |
+
if causal_attention_mask is not None:
|
| 290 |
+
if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
|
| 291 |
+
raise ValueError(
|
| 292 |
+
f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
|
| 293 |
+
f" {causal_attention_mask.size()}"
|
| 294 |
+
)
|
| 295 |
+
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
|
| 296 |
+
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
|
| 297 |
+
|
| 298 |
+
if attention_mask is not None:
|
| 299 |
+
if attention_mask.size() != (bsz, 1, tgt_len, src_len):
|
| 300 |
+
raise ValueError(
|
| 301 |
+
f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
|
| 302 |
+
)
|
| 303 |
+
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
|
| 304 |
+
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
|
| 305 |
+
|
| 306 |
+
attn_weights = nn.functional.softmax(attn_weights, dim=-1)
|
| 307 |
+
|
| 308 |
+
if output_attentions:
|
| 309 |
+
# this operation is a bit akward, but it's required to
|
| 310 |
+
# make sure that attn_weights keeps its gradient.
|
| 311 |
+
# In order to do so, attn_weights have to reshaped
|
| 312 |
+
# twice and have to be reused in the following
|
| 313 |
+
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
|
| 314 |
+
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
|
| 315 |
+
else:
|
| 316 |
+
attn_weights_reshaped = None
|
| 317 |
+
|
| 318 |
+
attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
|
| 319 |
+
|
| 320 |
+
attn_output = torch.bmm(attn_probs, value_states)
|
| 321 |
+
|
| 322 |
+
if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
|
| 323 |
+
raise ValueError(
|
| 324 |
+
f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
|
| 325 |
+
f" {attn_output.size()}"
|
| 326 |
+
)
|
| 327 |
+
|
| 328 |
+
attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
|
| 329 |
+
attn_output = attn_output.transpose(1, 2)
|
| 330 |
+
attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
|
| 331 |
+
|
| 332 |
+
attn_output = self.out_proj(attn_output)
|
| 333 |
+
|
| 334 |
+
return attn_output, attn_weights_reshaped
|
| 335 |
+
|
| 336 |
+
class EvaCLIPTextAttention(nn.Module):
|
| 337 |
+
"""Multi-headed attention from 'Attention Is All You Need' paper"""
|
| 338 |
+
|
| 339 |
+
def __init__(self, config):
|
| 340 |
+
super().__init__()
|
| 341 |
+
self.config = config
|
| 342 |
+
self.embed_dim = config.hidden_size
|
| 343 |
+
self.num_heads = config.num_attention_heads
|
| 344 |
+
self.head_dim = self.embed_dim // self.num_heads
|
| 345 |
+
if self.head_dim * self.num_heads != self.embed_dim:
|
| 346 |
+
raise ValueError(
|
| 347 |
+
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
|
| 348 |
+
f" {self.num_heads})."
|
| 349 |
+
)
|
| 350 |
+
self.scale = self.head_dim**-0.5
|
| 351 |
+
self.dropout = config.attention_dropout
|
| 352 |
+
self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.k_bias)
|
| 353 |
+
self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.v_bias)
|
| 354 |
+
self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.q_bias)
|
| 355 |
+
self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)
|
| 356 |
+
|
| 357 |
+
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
|
| 358 |
+
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
|
| 359 |
+
|
| 360 |
+
def forward(
|
| 361 |
+
self,
|
| 362 |
+
hidden_states: torch.Tensor,
|
| 363 |
+
attention_mask: Optional[torch.Tensor] = None,
|
| 364 |
+
causal_attention_mask: Optional[torch.Tensor] = None,
|
| 365 |
+
output_attentions: Optional[bool] = False,
|
| 366 |
+
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
| 367 |
+
"""Input shape: Batch x Time x Channel"""
|
| 368 |
+
|
| 369 |
+
bsz, tgt_len, embed_dim = hidden_states.size()
|
| 370 |
+
|
| 371 |
+
# get query proj
|
| 372 |
+
query_states = self.q_proj(hidden_states) * self.scale
|
| 373 |
+
key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
|
| 374 |
+
value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
|
| 375 |
+
|
| 376 |
+
proj_shape = (bsz * self.num_heads, -1, self.head_dim)
|
| 377 |
+
query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
|
| 378 |
+
key_states = key_states.view(*proj_shape)
|
| 379 |
+
value_states = value_states.view(*proj_shape)
|
| 380 |
+
|
| 381 |
+
src_len = key_states.size(1)
|
| 382 |
+
attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
|
| 383 |
+
|
| 384 |
+
if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
|
| 385 |
+
raise ValueError(
|
| 386 |
+
f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
|
| 387 |
+
f" {attn_weights.size()}"
|
| 388 |
+
)
|
| 389 |
+
|
| 390 |
+
# apply the causal_attention_mask first
|
| 391 |
+
if causal_attention_mask is not None:
|
| 392 |
+
if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
|
| 393 |
+
raise ValueError(
|
| 394 |
+
f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
|
| 395 |
+
f" {causal_attention_mask.size()}"
|
| 396 |
+
)
|
| 397 |
+
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
|
| 398 |
+
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
|
| 399 |
+
|
| 400 |
+
if attention_mask is not None:
|
| 401 |
+
if attention_mask.size() != (bsz, 1, tgt_len, src_len):
|
| 402 |
+
raise ValueError(
|
| 403 |
+
f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
|
| 404 |
+
)
|
| 405 |
+
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
|
| 406 |
+
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
|
| 407 |
+
|
| 408 |
+
attn_weights = nn.functional.softmax(attn_weights, dim=-1)
|
| 409 |
+
|
| 410 |
+
if output_attentions:
|
| 411 |
+
# this operation is a bit akward, but it's required to
|
| 412 |
+
# make sure that attn_weights keeps its gradient.
|
| 413 |
+
# In order to do so, attn_weights have to reshaped
|
| 414 |
+
# twice and have to be reused in the following
|
| 415 |
+
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
|
| 416 |
+
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
|
| 417 |
+
else:
|
| 418 |
+
attn_weights_reshaped = None
|
| 419 |
+
|
| 420 |
+
attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
|
| 421 |
+
|
| 422 |
+
attn_output = torch.bmm(attn_probs, value_states)
|
| 423 |
+
|
| 424 |
+
if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
|
| 425 |
+
raise ValueError(
|
| 426 |
+
f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
|
| 427 |
+
f" {attn_output.size()}"
|
| 428 |
+
)
|
| 429 |
+
|
| 430 |
+
attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
|
| 431 |
+
attn_output = attn_output.transpose(1, 2)
|
| 432 |
+
attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
|
| 433 |
+
|
| 434 |
+
attn_output = self.out_proj(attn_output)
|
| 435 |
+
|
| 436 |
+
return attn_output, attn_weights_reshaped
|
| 437 |
+
|
| 438 |
+
class EvaCLIPMLP(nn.Module):
|
| 439 |
+
def __init__(self, config):
|
| 440 |
+
super().__init__()
|
| 441 |
+
self.config = config
|
| 442 |
+
self.activation_fn = ACT2FN[config.hidden_act]
|
| 443 |
+
self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
|
| 444 |
+
self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
|
| 445 |
+
|
| 446 |
+
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
| 447 |
+
hidden_states = self.fc1(hidden_states)
|
| 448 |
+
hidden_states = self.activation_fn(hidden_states)
|
| 449 |
+
hidden_states = self.fc2(hidden_states)
|
| 450 |
+
return hidden_states
|
| 451 |
+
|
| 452 |
+
|
| 453 |
+
class EvaCLIPEncoderLayer(nn.Module):
|
| 454 |
+
def __init__(self, config: EvaCLIPConfig):
|
| 455 |
+
super().__init__()
|
| 456 |
+
self.config = config
|
| 457 |
+
self.embed_dim = config.hidden_size
|
| 458 |
+
self.post_layernorm = config.post_layernorm if config.post_layernorm is not None else False
|
| 459 |
+
self.self_attn = EvaCLIPAttention(config)
|
| 460 |
+
self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
|
| 461 |
+
self.mlp = EvaCLIPMLP(config)
|
| 462 |
+
self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
|
| 463 |
+
|
| 464 |
+
def forward(
|
| 465 |
+
self,
|
| 466 |
+
hidden_states: torch.Tensor,
|
| 467 |
+
attention_mask: torch.Tensor,
|
| 468 |
+
causal_attention_mask: torch.Tensor,
|
| 469 |
+
output_attentions: Optional[bool] = False,
|
| 470 |
+
) -> Tuple[torch.FloatTensor]:
|
| 471 |
+
"""
|
| 472 |
+
Args:
|
| 473 |
+
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
|
| 474 |
+
attention_mask (`torch.FloatTensor`): attention mask of size
|
| 475 |
+
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
|
| 476 |
+
`(config.encoder_attention_heads,)`.
|
| 477 |
+
output_attentions (`bool`, *optional*):
|
| 478 |
+
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
|
| 479 |
+
returned tensors for more detail.
|
| 480 |
+
"""
|
| 481 |
+
residual = hidden_states
|
| 482 |
+
|
| 483 |
+
if not self.post_layernorm:
|
| 484 |
+
hidden_states = self.layer_norm1(hidden_states)
|
| 485 |
+
hidden_states, attn_weights = self.self_attn(
|
| 486 |
+
hidden_states=hidden_states,
|
| 487 |
+
attention_mask=attention_mask,
|
| 488 |
+
causal_attention_mask=causal_attention_mask,
|
| 489 |
+
output_attentions=output_attentions,
|
| 490 |
+
)
|
| 491 |
+
if self.post_layernorm:
|
| 492 |
+
hidden_states = self.layer_norm1(hidden_states)
|
| 493 |
+
hidden_states = residual + hidden_states
|
| 494 |
+
residual = hidden_states
|
| 495 |
+
if not self.post_layernorm:
|
| 496 |
+
hidden_states = self.layer_norm2(hidden_states)
|
| 497 |
+
hidden_states = self.mlp(hidden_states)
|
| 498 |
+
if self.post_layernorm:
|
| 499 |
+
hidden_states = self.layer_norm2(hidden_states)
|
| 500 |
+
hidden_states = residual + hidden_states
|
| 501 |
+
|
| 502 |
+
outputs = (hidden_states,)
|
| 503 |
+
|
| 504 |
+
if output_attentions:
|
| 505 |
+
outputs += (attn_weights,)
|
| 506 |
+
|
| 507 |
+
return outputs
|
| 508 |
+
|
| 509 |
+
|
| 510 |
+
class EvaCLIPPreTrainedModel(PreTrainedModel):
|
| 511 |
+
"""
|
| 512 |
+
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
|
| 513 |
+
models.
|
| 514 |
+
"""
|
| 515 |
+
|
| 516 |
+
config_class = EvaCLIPConfig
|
| 517 |
+
base_model_prefix = "clip"
|
| 518 |
+
supports_gradient_checkpointing = True
|
| 519 |
+
_keys_to_ignore_on_load_missing = [r"position_ids"]
|
| 520 |
+
|
| 521 |
+
def _init_weights(self, module):
|
| 522 |
+
"""Initialize the weights"""
|
| 523 |
+
factor = self.config.initializer_factor
|
| 524 |
+
if isinstance(module, EvaCLIPTextEmbeddings):
|
| 525 |
+
module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
|
| 526 |
+
module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
|
| 527 |
+
elif isinstance(module, EvaCLIPVisionEmbeddings):
|
| 528 |
+
factor = self.config.initializer_factor
|
| 529 |
+
nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
|
| 530 |
+
nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
|
| 531 |
+
nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
|
| 532 |
+
elif isinstance(module, EvaCLIPAttention):
|
| 533 |
+
factor = self.config.initializer_factor
|
| 534 |
+
in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
|
| 535 |
+
out_proj_std = (module.embed_dim**-0.5) * factor
|
| 536 |
+
nn.init.normal_(module.q_proj.weight, std=in_proj_std)
|
| 537 |
+
nn.init.normal_(module.k_proj.weight, std=in_proj_std)
|
| 538 |
+
nn.init.normal_(module.v_proj.weight, std=in_proj_std)
|
| 539 |
+
nn.init.normal_(module.out_proj.weight, std=out_proj_std)
|
| 540 |
+
elif isinstance(module, EvaCLIPMLP):
|
| 541 |
+
factor = self.config.initializer_factor
|
| 542 |
+
in_proj_std = (
|
| 543 |
+
(module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
|
| 544 |
+
)
|
| 545 |
+
fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
|
| 546 |
+
nn.init.normal_(module.fc1.weight, std=fc_std)
|
| 547 |
+
nn.init.normal_(module.fc2.weight, std=in_proj_std)
|
| 548 |
+
elif isinstance(module, EvaCLIPModel):
|
| 549 |
+
nn.init.normal_(
|
| 550 |
+
module.text_projection.weight,
|
| 551 |
+
std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
|
| 552 |
+
)
|
| 553 |
+
nn.init.normal_(
|
| 554 |
+
module.visual_projection.weight,
|
| 555 |
+
std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
|
| 556 |
+
)
|
| 557 |
+
elif isinstance(module, EvaCLIPVisionModelWithProjection):
|
| 558 |
+
nn.init.normal_(
|
| 559 |
+
module.visual_projection.weight,
|
| 560 |
+
std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
|
| 561 |
+
)
|
| 562 |
+
elif isinstance(module, EvaCLIPTextModelWithProjection):
|
| 563 |
+
nn.init.normal_(
|
| 564 |
+
module.text_projection.weight,
|
| 565 |
+
std=self.config.hidden_size**-0.5 * self.config.initializer_factor,
|
| 566 |
+
)
|
| 567 |
+
|
| 568 |
+
if isinstance(module, nn.LayerNorm):
|
| 569 |
+
module.bias.data.zero_()
|
| 570 |
+
module.weight.data.fill_(1.0)
|
| 571 |
+
if isinstance(module, nn.Linear) and module.bias is not None:
|
| 572 |
+
module.bias.data.zero_()
|
| 573 |
+
|
| 574 |
+
def _set_gradient_checkpointing(self, module, value=False):
|
| 575 |
+
if isinstance(module, EvaCLIPEncoder):
|
| 576 |
+
module.gradient_checkpointing = value
|
| 577 |
+
|
| 578 |
+
|
| 579 |
+
EvaCLIP_START_DOCSTRING = r"""
|
| 580 |
+
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
|
| 581 |
+
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
|
| 582 |
+
etc.)
|
| 583 |
+
|
| 584 |
+
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
|
| 585 |
+
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
|
| 586 |
+
and behavior.
|
| 587 |
+
|
| 588 |
+
Parameters:
|
| 589 |
+
config ([`CLIPConfig`]): Model configuration class with all the parameters of the model.
|
| 590 |
+
Initializing with a config file does not load the weights associated with the model, only the
|
| 591 |
+
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
|
| 592 |
+
"""
|
| 593 |
+
|
| 594 |
+
EvaCLIP_TEXT_INPUTS_DOCSTRING = r"""
|
| 595 |
+
Args:
|
| 596 |
+
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
| 597 |
+
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
|
| 598 |
+
it.
|
| 599 |
+
|
| 600 |
+
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
| 601 |
+
[`PreTrainedTokenizer.__call__`] for details.
|
| 602 |
+
|
| 603 |
+
[What are input IDs?](../glossary#input-ids)
|
| 604 |
+
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
|
| 605 |
+
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
| 606 |
+
|
| 607 |
+
- 1 for tokens that are **not masked**,
|
| 608 |
+
- 0 for tokens that are **masked**.
|
| 609 |
+
|
| 610 |
+
[What are attention masks?](../glossary#attention-mask)
|
| 611 |
+
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
| 612 |
+
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
|
| 613 |
+
config.max_position_embeddings - 1]`.
|
| 614 |
+
|
| 615 |
+
[What are position IDs?](../glossary#position-ids)
|
| 616 |
+
output_attentions (`bool`, *optional*):
|
| 617 |
+
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
| 618 |
+
tensors for more detail.
|
| 619 |
+
output_hidden_states (`bool`, *optional*):
|
| 620 |
+
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
| 621 |
+
more detail.
|
| 622 |
+
return_dict (`bool`, *optional*):
|
| 623 |
+
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
| 624 |
+
"""
|
| 625 |
+
|
| 626 |
+
EvaCLIP_VISION_INPUTS_DOCSTRING = r"""
|
| 627 |
+
Args:
|
| 628 |
+
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
|
| 629 |
+
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
|
| 630 |
+
[`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
|
| 631 |
+
output_attentions (`bool`, *optional*):
|
| 632 |
+
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
| 633 |
+
tensors for more detail.
|
| 634 |
+
output_hidden_states (`bool`, *optional*):
|
| 635 |
+
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
| 636 |
+
more detail.
|
| 637 |
+
return_dict (`bool`, *optional*):
|
| 638 |
+
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
| 639 |
+
"""
|
| 640 |
+
|
| 641 |
+
EvaCLIP_INPUTS_DOCSTRING = r"""
|
| 642 |
+
Args:
|
| 643 |
+
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
| 644 |
+
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
|
| 645 |
+
it.
|
| 646 |
+
|
| 647 |
+
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
| 648 |
+
[`PreTrainedTokenizer.__call__`] for details.
|
| 649 |
+
|
| 650 |
+
[What are input IDs?](../glossary#input-ids)
|
| 651 |
+
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
|
| 652 |
+
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
| 653 |
+
|
| 654 |
+
- 1 for tokens that are **not masked**,
|
| 655 |
+
- 0 for tokens that are **masked**.
|
| 656 |
+
|
| 657 |
+
[What are attention masks?](../glossary#attention-mask)
|
| 658 |
+
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
| 659 |
+
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
|
| 660 |
+
config.max_position_embeddings - 1]`.
|
| 661 |
+
|
| 662 |
+
[What are position IDs?](../glossary#position-ids)
|
| 663 |
+
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
|
| 664 |
+
Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
|
| 665 |
+
[`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
|
| 666 |
+
return_loss (`bool`, *optional*):
|
| 667 |
+
Whether or not to return the contrastive loss.
|
| 668 |
+
output_attentions (`bool`, *optional*):
|
| 669 |
+
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
| 670 |
+
tensors for more detail.
|
| 671 |
+
output_hidden_states (`bool`, *optional*):
|
| 672 |
+
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
| 673 |
+
more detail.
|
| 674 |
+
return_dict (`bool`, *optional*):
|
| 675 |
+
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
| 676 |
+
"""
|
| 677 |
+
|
| 678 |
+
|
| 679 |
+
class EvaCLIPEncoder(nn.Module):
|
| 680 |
+
"""
|
| 681 |
+
Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
|
| 682 |
+
[`CLIPEncoderLayer`].
|
| 683 |
+
|
| 684 |
+
Args:
|
| 685 |
+
config: CLIPConfig
|
| 686 |
+
"""
|
| 687 |
+
|
| 688 |
+
def __init__(self, config: EvaCLIPConfig):
|
| 689 |
+
super().__init__()
|
| 690 |
+
self.config = config
|
| 691 |
+
self.layers = nn.ModuleList([EvaCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
|
| 692 |
+
self.gradient_checkpointing = False
|
| 693 |
+
|
| 694 |
+
def forward(
|
| 695 |
+
self,
|
| 696 |
+
inputs_embeds,
|
| 697 |
+
attention_mask: Optional[torch.Tensor] = None,
|
| 698 |
+
causal_attention_mask: Optional[torch.Tensor] = None,
|
| 699 |
+
output_attentions: Optional[bool] = None,
|
| 700 |
+
output_hidden_states: Optional[bool] = None,
|
| 701 |
+
return_dict: Optional[bool] = None,
|
| 702 |
+
) -> Union[Tuple, BaseModelOutput]:
|
| 703 |
+
r"""
|
| 704 |
+
Args:
|
| 705 |
+
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
| 706 |
+
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
|
| 707 |
+
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
| 708 |
+
than the model's internal embedding lookup matrix.
|
| 709 |
+
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
|
| 710 |
+
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
| 711 |
+
|
| 712 |
+
- 1 for tokens that are **not masked**,
|
| 713 |
+
- 0 for tokens that are **masked**.
|
| 714 |
+
|
| 715 |
+
[What are attention masks?](../glossary#attention-mask)
|
| 716 |
+
causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
|
| 717 |
+
Causal mask for the text model. Mask values selected in `[0, 1]`:
|
| 718 |
+
|
| 719 |
+
- 1 for tokens that are **not masked**,
|
| 720 |
+
- 0 for tokens that are **masked**.
|
| 721 |
+
|
| 722 |
+
[What are attention masks?](../glossary#attention-mask)
|
| 723 |
+
output_attentions (`bool`, *optional*):
|
| 724 |
+
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
|
| 725 |
+
returned tensors for more detail.
|
| 726 |
+
output_hidden_states (`bool`, *optional*):
|
| 727 |
+
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
|
| 728 |
+
for more detail.
|
| 729 |
+
return_dict (`bool`, *optional*):
|
| 730 |
+
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
| 731 |
+
"""
|
| 732 |
+
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
| 733 |
+
output_hidden_states = (
|
| 734 |
+
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
| 735 |
+
)
|
| 736 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 737 |
+
|
| 738 |
+
encoder_states = () if output_hidden_states else None
|
| 739 |
+
all_attentions = () if output_attentions else None
|
| 740 |
+
|
| 741 |
+
hidden_states = inputs_embeds
|
| 742 |
+
for idx, encoder_layer in enumerate(self.layers):
|
| 743 |
+
if output_hidden_states:
|
| 744 |
+
encoder_states = encoder_states + (hidden_states,)
|
| 745 |
+
if self.gradient_checkpointing and self.training:
|
| 746 |
+
|
| 747 |
+
def create_custom_forward(module):
|
| 748 |
+
def custom_forward(*inputs):
|
| 749 |
+
return module(*inputs, output_attentions)
|
| 750 |
+
|
| 751 |
+
return custom_forward
|
| 752 |
+
|
| 753 |
+
layer_outputs = torch.utils.checkpoint.checkpoint(
|
| 754 |
+
create_custom_forward(encoder_layer),
|
| 755 |
+
hidden_states,
|
| 756 |
+
attention_mask,
|
| 757 |
+
causal_attention_mask,
|
| 758 |
+
)
|
| 759 |
+
else:
|
| 760 |
+
layer_outputs = encoder_layer(
|
| 761 |
+
hidden_states,
|
| 762 |
+
attention_mask,
|
| 763 |
+
causal_attention_mask,
|
| 764 |
+
output_attentions=output_attentions,
|
| 765 |
+
)
|
| 766 |
+
|
| 767 |
+
hidden_states = layer_outputs[0]
|
| 768 |
+
|
| 769 |
+
if output_attentions:
|
| 770 |
+
all_attentions = all_attentions + (layer_outputs[1],)
|
| 771 |
+
|
| 772 |
+
if output_hidden_states:
|
| 773 |
+
encoder_states = encoder_states + (hidden_states,)
|
| 774 |
+
|
| 775 |
+
if not return_dict:
|
| 776 |
+
return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
|
| 777 |
+
return BaseModelOutput(
|
| 778 |
+
last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
|
| 779 |
+
)
|
| 780 |
+
|
| 781 |
+
|
| 782 |
+
class EvaCLIPTextTransformer(nn.Module):
|
| 783 |
+
def __init__(self, config: EvaCLIPTextConfig):
|
| 784 |
+
super().__init__()
|
| 785 |
+
self.config = config
|
| 786 |
+
embed_dim = config.hidden_size
|
| 787 |
+
self.embeddings = EvaCLIPTextEmbeddings(config)
|
| 788 |
+
self.encoder = EvaCLIPEncoder(config)
|
| 789 |
+
self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
|
| 790 |
+
|
| 791 |
+
@add_start_docstrings_to_model_forward(EvaCLIP_TEXT_INPUTS_DOCSTRING)
|
| 792 |
+
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=EvaCLIPTextConfig)
|
| 793 |
+
def forward(
|
| 794 |
+
self,
|
| 795 |
+
input_ids: Optional[torch.Tensor] = None,
|
| 796 |
+
attention_mask: Optional[torch.Tensor] = None,
|
| 797 |
+
position_ids: Optional[torch.Tensor] = None,
|
| 798 |
+
output_attentions: Optional[bool] = None,
|
| 799 |
+
output_hidden_states: Optional[bool] = None,
|
| 800 |
+
return_dict: Optional[bool] = None,
|
| 801 |
+
) -> Union[Tuple, BaseModelOutputWithPooling]:
|
| 802 |
+
r"""
|
| 803 |
+
Returns:
|
| 804 |
+
|
| 805 |
+
"""
|
| 806 |
+
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
| 807 |
+
output_hidden_states = (
|
| 808 |
+
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
| 809 |
+
)
|
| 810 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 811 |
+
|
| 812 |
+
if input_ids is None:
|
| 813 |
+
raise ValueError("You have to specify input_ids")
|
| 814 |
+
|
| 815 |
+
input_shape = input_ids.size()
|
| 816 |
+
input_ids = input_ids.view(-1, input_shape[-1])
|
| 817 |
+
|
| 818 |
+
hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
|
| 819 |
+
|
| 820 |
+
bsz, seq_len = input_shape
|
| 821 |
+
# CLIP's text model uses causal mask, prepare it here.
|
| 822 |
+
# https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
|
| 823 |
+
causal_attention_mask = self._build_causal_attention_mask(bsz, seq_len, hidden_states.dtype).to(
|
| 824 |
+
hidden_states.device
|
| 825 |
+
)
|
| 826 |
+
# expand attention_mask
|
| 827 |
+
if attention_mask is not None:
|
| 828 |
+
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
|
| 829 |
+
attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
|
| 830 |
+
|
| 831 |
+
encoder_outputs = self.encoder(
|
| 832 |
+
inputs_embeds=hidden_states,
|
| 833 |
+
attention_mask=attention_mask,
|
| 834 |
+
causal_attention_mask=causal_attention_mask,
|
| 835 |
+
output_attentions=output_attentions,
|
| 836 |
+
output_hidden_states=output_hidden_states,
|
| 837 |
+
return_dict=return_dict,
|
| 838 |
+
)
|
| 839 |
+
|
| 840 |
+
last_hidden_state = encoder_outputs[0]
|
| 841 |
+
last_hidden_state = self.final_layer_norm(last_hidden_state)
|
| 842 |
+
|
| 843 |
+
# text_embeds.shape = [batch_size, sequence_length, transformer.width]
|
| 844 |
+
# take features from the eot embedding (eot_token is the highest number in each sequence)
|
| 845 |
+
# casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14
|
| 846 |
+
pooled_output = last_hidden_state[
|
| 847 |
+
torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
|
| 848 |
+
input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1),
|
| 849 |
+
]
|
| 850 |
+
|
| 851 |
+
if not return_dict:
|
| 852 |
+
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
|
| 853 |
+
|
| 854 |
+
return BaseModelOutputWithPooling(
|
| 855 |
+
last_hidden_state=last_hidden_state,
|
| 856 |
+
pooler_output=pooled_output,
|
| 857 |
+
hidden_states=encoder_outputs.hidden_states,
|
| 858 |
+
attentions=encoder_outputs.attentions,
|
| 859 |
+
)
|
| 860 |
+
|
| 861 |
+
def _build_causal_attention_mask(self, bsz, seq_len, dtype):
|
| 862 |
+
# lazily create causal attention mask, with full attention between the vision tokens
|
| 863 |
+
# pytorch uses additive attention mask; fill with -inf
|
| 864 |
+
mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype)
|
| 865 |
+
mask.fill_(torch.tensor(torch.finfo(dtype).min))
|
| 866 |
+
mask.triu_(1) # zero out the lower diagonal
|
| 867 |
+
mask = mask.unsqueeze(1) # expand mask
|
| 868 |
+
return mask
|
| 869 |
+
|
| 870 |
+
|
| 871 |
+
@add_start_docstrings(
|
| 872 |
+
"""The text model from EvaCLIP without any head or projection on top.""",
|
| 873 |
+
EvaCLIP_START_DOCSTRING,
|
| 874 |
+
)
|
| 875 |
+
class EvaCLIPTextModel(EvaCLIPPreTrainedModel):
|
| 876 |
+
config_class = EvaCLIPTextConfig
|
| 877 |
+
|
| 878 |
+
_no_split_modules = ["EvaCLIPEncoderLayer"]
|
| 879 |
+
|
| 880 |
+
def __init__(self, config: EvaCLIPTextConfig):
|
| 881 |
+
super().__init__(config)
|
| 882 |
+
self.text_model = EvaCLIPTextTransformer(config)
|
| 883 |
+
# Initialize weights and apply final processing
|
| 884 |
+
self.post_init()
|
| 885 |
+
|
| 886 |
+
def get_input_embeddings(self) -> nn.Module:
|
| 887 |
+
return self.text_model.embeddings.token_embedding
|
| 888 |
+
|
| 889 |
+
def set_input_embeddings(self, value):
|
| 890 |
+
self.text_model.embeddings.token_embedding = value
|
| 891 |
+
|
| 892 |
+
@add_start_docstrings_to_model_forward(EvaCLIP_TEXT_INPUTS_DOCSTRING)
|
| 893 |
+
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=EvaCLIPTextConfig)
|
| 894 |
+
def forward(
|
| 895 |
+
self,
|
| 896 |
+
input_ids: Optional[torch.Tensor] = None,
|
| 897 |
+
attention_mask: Optional[torch.Tensor] = None,
|
| 898 |
+
position_ids: Optional[torch.Tensor] = None,
|
| 899 |
+
output_attentions: Optional[bool] = None,
|
| 900 |
+
output_hidden_states: Optional[bool] = None,
|
| 901 |
+
return_dict: Optional[bool] = None,
|
| 902 |
+
) -> Union[Tuple, BaseModelOutputWithPooling]:
|
| 903 |
+
r"""
|
| 904 |
+
Returns:
|
| 905 |
+
|
| 906 |
+
Examples:
|
| 907 |
+
|
| 908 |
+
```python
|
| 909 |
+
>>> from transformers import AutoTokenizer, CLIPTextModel
|
| 910 |
+
|
| 911 |
+
>>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
|
| 912 |
+
>>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
|
| 913 |
+
|
| 914 |
+
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
|
| 915 |
+
|
| 916 |
+
>>> outputs = model(**inputs)
|
| 917 |
+
>>> last_hidden_state = outputs.last_hidden_state
|
| 918 |
+
>>> pooled_output = outputs.pooler_output # pooled (EOS token) states
|
| 919 |
+
```"""
|
| 920 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 921 |
+
|
| 922 |
+
return self.text_model(
|
| 923 |
+
input_ids=input_ids,
|
| 924 |
+
attention_mask=attention_mask,
|
| 925 |
+
position_ids=position_ids,
|
| 926 |
+
output_attentions=output_attentions,
|
| 927 |
+
output_hidden_states=output_hidden_states,
|
| 928 |
+
return_dict=return_dict,
|
| 929 |
+
)
|
| 930 |
+
|
| 931 |
+
|
| 932 |
+
class EvaCLIPVisionTransformer(nn.Module):
|
| 933 |
+
def __init__(self, config: EvaCLIPVisionConfig):
|
| 934 |
+
super().__init__()
|
| 935 |
+
self.config = config
|
| 936 |
+
embed_dim = config.hidden_size
|
| 937 |
+
|
| 938 |
+
self.embeddings = EvaCLIPVisionEmbeddings(config)
|
| 939 |
+
self.encoder = EvaCLIPEncoder(config)
|
| 940 |
+
self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
|
| 941 |
+
|
| 942 |
+
@add_start_docstrings_to_model_forward(EvaCLIP_VISION_INPUTS_DOCSTRING)
|
| 943 |
+
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=EvaCLIPVisionConfig)
|
| 944 |
+
def forward(
|
| 945 |
+
self,
|
| 946 |
+
pixel_values: Optional[torch.FloatTensor] = None,
|
| 947 |
+
output_attentions: Optional[bool] = None,
|
| 948 |
+
output_hidden_states: Optional[bool] = None,
|
| 949 |
+
return_dict: Optional[bool] = None,
|
| 950 |
+
) -> Union[Tuple, BaseModelOutputWithPooling]:
|
| 951 |
+
r"""
|
| 952 |
+
Returns:
|
| 953 |
+
|
| 954 |
+
"""
|
| 955 |
+
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
| 956 |
+
output_hidden_states = (
|
| 957 |
+
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
| 958 |
+
)
|
| 959 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 960 |
+
|
| 961 |
+
if pixel_values is None:
|
| 962 |
+
raise ValueError("You have to specify pixel_values")
|
| 963 |
+
|
| 964 |
+
hidden_states = self.embeddings(pixel_values)
|
| 965 |
+
|
| 966 |
+
encoder_outputs = self.encoder(
|
| 967 |
+
inputs_embeds=hidden_states,
|
| 968 |
+
output_attentions=output_attentions,
|
| 969 |
+
output_hidden_states=output_hidden_states,
|
| 970 |
+
return_dict=return_dict,
|
| 971 |
+
)
|
| 972 |
+
|
| 973 |
+
last_hidden_state = encoder_outputs[0]
|
| 974 |
+
pooled_output = last_hidden_state[:, 0, :]
|
| 975 |
+
pooled_output = self.post_layernorm(pooled_output)
|
| 976 |
+
|
| 977 |
+
if not return_dict:
|
| 978 |
+
return (last_hidden_state, pooled_output) + encoder_outputs[1:]
|
| 979 |
+
|
| 980 |
+
return BaseModelOutputWithPooling(
|
| 981 |
+
last_hidden_state=last_hidden_state,
|
| 982 |
+
pooler_output=pooled_output,
|
| 983 |
+
hidden_states=encoder_outputs.hidden_states,
|
| 984 |
+
attentions=encoder_outputs.attentions,
|
| 985 |
+
)
|
| 986 |
+
|
| 987 |
+
|
| 988 |
+
@add_start_docstrings(
|
| 989 |
+
"""The vision model from EvaCLIP without any head or projection on top.""",
|
| 990 |
+
EvaCLIP_START_DOCSTRING,
|
| 991 |
+
)
|
| 992 |
+
class EvaCLIPVisionModel(EvaCLIPPreTrainedModel):
|
| 993 |
+
config_class = EvaCLIPVisionConfig
|
| 994 |
+
main_input_name = "pixel_values"
|
| 995 |
+
|
| 996 |
+
def __init__(self, config: EvaCLIPVisionConfig):
|
| 997 |
+
super().__init__(config)
|
| 998 |
+
self.vision_model = EvaCLIPVisionTransformer(config)
|
| 999 |
+
# Initialize weights and apply final processing
|
| 1000 |
+
self.post_init()
|
| 1001 |
+
|
| 1002 |
+
def get_input_embeddings(self) -> nn.Module:
|
| 1003 |
+
return self.vision_model.embeddings.patch_embedding
|
| 1004 |
+
|
| 1005 |
+
@add_start_docstrings_to_model_forward(EvaCLIP_VISION_INPUTS_DOCSTRING)
|
| 1006 |
+
@replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=EvaCLIPVisionConfig)
|
| 1007 |
+
def forward(
|
| 1008 |
+
self,
|
| 1009 |
+
pixel_values: Optional[torch.FloatTensor] = None,
|
| 1010 |
+
output_attentions: Optional[bool] = None,
|
| 1011 |
+
output_hidden_states: Optional[bool] = None,
|
| 1012 |
+
return_dict: Optional[bool] = None,
|
| 1013 |
+
) -> Union[Tuple, BaseModelOutputWithPooling]:
|
| 1014 |
+
r"""
|
| 1015 |
+
Returns:
|
| 1016 |
+
|
| 1017 |
+
Examples:
|
| 1018 |
+
|
| 1019 |
+
```python
|
| 1020 |
+
>>> from PIL import Image
|
| 1021 |
+
>>> import requests
|
| 1022 |
+
>>> from transformers import AutoProcessor, CLIPVisionModel
|
| 1023 |
+
|
| 1024 |
+
>>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
|
| 1025 |
+
>>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
| 1026 |
+
|
| 1027 |
+
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
| 1028 |
+
>>> image = Image.open(requests.get(url, stream=True).raw)
|
| 1029 |
+
|
| 1030 |
+
>>> inputs = processor(images=image, return_tensors="pt")
|
| 1031 |
+
|
| 1032 |
+
>>> outputs = model(**inputs)
|
| 1033 |
+
>>> last_hidden_state = outputs.last_hidden_state
|
| 1034 |
+
>>> pooled_output = outputs.pooler_output # pooled CLS states
|
| 1035 |
+
```"""
|
| 1036 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 1037 |
+
|
| 1038 |
+
return self.vision_model(
|
| 1039 |
+
pixel_values=pixel_values,
|
| 1040 |
+
output_attentions=output_attentions,
|
| 1041 |
+
output_hidden_states=output_hidden_states,
|
| 1042 |
+
return_dict=return_dict,
|
| 1043 |
+
)
|
| 1044 |
+
|
| 1045 |
+
|
| 1046 |
+
@add_start_docstrings(EvaCLIP_START_DOCSTRING)
|
| 1047 |
+
class EvaCLIPModel(EvaCLIPPreTrainedModel):
|
| 1048 |
+
config_class = EvaCLIPConfig
|
| 1049 |
+
|
| 1050 |
+
def __init__(self, config: EvaCLIPConfig):
|
| 1051 |
+
super().__init__(config)
|
| 1052 |
+
|
| 1053 |
+
if not (type(config.text_config).__name__ == "EvaCLIPTextConfig"):
|
| 1054 |
+
raise ValueError(
|
| 1055 |
+
"config.text_config is expected to be of type EvaCLIPTextConfig but is of type"
|
| 1056 |
+
f" {type(config.text_config)}."
|
| 1057 |
+
)
|
| 1058 |
+
|
| 1059 |
+
if not (type(config.vision_config).__name__ == "EvaCLIPVisionConfig"):
|
| 1060 |
+
raise ValueError(
|
| 1061 |
+
"config.vision_config is expected to be of type EvaCLIPVisionConfig but is of type"
|
| 1062 |
+
f" {type(config.vision_config)}."
|
| 1063 |
+
)
|
| 1064 |
+
|
| 1065 |
+
text_config = config.text_config
|
| 1066 |
+
vision_config = config.vision_config
|
| 1067 |
+
|
| 1068 |
+
self.projection_dim = config.projection_dim
|
| 1069 |
+
self.text_embed_dim = text_config.hidden_size
|
| 1070 |
+
self.vision_embed_dim = vision_config.hidden_size
|
| 1071 |
+
|
| 1072 |
+
self.text_model = EvaCLIPTextTransformer(text_config)
|
| 1073 |
+
self.vision_model = EvaCLIPVisionTransformer(vision_config)
|
| 1074 |
+
|
| 1075 |
+
self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=True)
|
| 1076 |
+
self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
|
| 1077 |
+
self.logit_scale = nn.Parameter(torch.ones([]) * self.config.logit_scale_init_value)
|
| 1078 |
+
|
| 1079 |
+
# Initialize weights and apply final processing
|
| 1080 |
+
self.post_init()
|
| 1081 |
+
|
| 1082 |
+
@add_start_docstrings_to_model_forward(EvaCLIP_TEXT_INPUTS_DOCSTRING)
|
| 1083 |
+
def get_text_features(
|
| 1084 |
+
self,
|
| 1085 |
+
input_ids: Optional[torch.Tensor] = None,
|
| 1086 |
+
attention_mask: Optional[torch.Tensor] = None,
|
| 1087 |
+
position_ids: Optional[torch.Tensor] = None,
|
| 1088 |
+
output_attentions: Optional[bool] = None,
|
| 1089 |
+
output_hidden_states: Optional[bool] = None,
|
| 1090 |
+
return_dict: Optional[bool] = None,
|
| 1091 |
+
) -> torch.FloatTensor:
|
| 1092 |
+
r"""
|
| 1093 |
+
Returns:
|
| 1094 |
+
text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
|
| 1095 |
+
applying the projection layer to the pooled output of [`CLIPTextModel`].
|
| 1096 |
+
|
| 1097 |
+
Examples:
|
| 1098 |
+
|
| 1099 |
+
```python
|
| 1100 |
+
>>> from transformers import AutoTokenizer, CLIPModel
|
| 1101 |
+
|
| 1102 |
+
>>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
|
| 1103 |
+
>>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
|
| 1104 |
+
|
| 1105 |
+
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
|
| 1106 |
+
>>> text_features = model.get_text_features(**inputs)
|
| 1107 |
+
```"""
|
| 1108 |
+
# Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
|
| 1109 |
+
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
| 1110 |
+
output_hidden_states = (
|
| 1111 |
+
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
| 1112 |
+
)
|
| 1113 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 1114 |
+
|
| 1115 |
+
text_outputs = self.text_model(
|
| 1116 |
+
input_ids=input_ids,
|
| 1117 |
+
attention_mask=attention_mask,
|
| 1118 |
+
position_ids=position_ids,
|
| 1119 |
+
output_attentions=output_attentions,
|
| 1120 |
+
output_hidden_states=output_hidden_states,
|
| 1121 |
+
return_dict=return_dict,
|
| 1122 |
+
)
|
| 1123 |
+
|
| 1124 |
+
pooled_output = text_outputs[1]
|
| 1125 |
+
text_features = self.text_projection(pooled_output)
|
| 1126 |
+
|
| 1127 |
+
return text_features
|
| 1128 |
+
|
| 1129 |
+
@add_start_docstrings_to_model_forward(EvaCLIP_VISION_INPUTS_DOCSTRING)
|
| 1130 |
+
def get_image_features(
|
| 1131 |
+
self,
|
| 1132 |
+
pixel_values: Optional[torch.FloatTensor] = None,
|
| 1133 |
+
output_attentions: Optional[bool] = None,
|
| 1134 |
+
output_hidden_states: Optional[bool] = None,
|
| 1135 |
+
return_dict: Optional[bool] = None,
|
| 1136 |
+
) -> torch.FloatTensor:
|
| 1137 |
+
r"""
|
| 1138 |
+
Returns:
|
| 1139 |
+
image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
|
| 1140 |
+
applying the projection layer to the pooled output of [`EvaCLIPVisionModel`].
|
| 1141 |
+
|
| 1142 |
+
Examples:
|
| 1143 |
+
|
| 1144 |
+
```python
|
| 1145 |
+
>>> from PIL import Image
|
| 1146 |
+
>>> import requests
|
| 1147 |
+
>>> from transformers import AutoProcessor, CLIPModel
|
| 1148 |
+
|
| 1149 |
+
>>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
|
| 1150 |
+
>>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
| 1151 |
+
|
| 1152 |
+
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
| 1153 |
+
>>> image = Image.open(requests.get(url, stream=True).raw)
|
| 1154 |
+
|
| 1155 |
+
>>> inputs = processor(images=image, return_tensors="pt")
|
| 1156 |
+
|
| 1157 |
+
>>> image_features = model.get_image_features(**inputs)
|
| 1158 |
+
```"""
|
| 1159 |
+
# Use EvaCLIP model's config for some fields (if specified) instead of those of vision & text components.
|
| 1160 |
+
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
| 1161 |
+
output_hidden_states = (
|
| 1162 |
+
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
| 1163 |
+
)
|
| 1164 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 1165 |
+
|
| 1166 |
+
vision_outputs = self.vision_model(
|
| 1167 |
+
pixel_values=pixel_values,
|
| 1168 |
+
output_attentions=output_attentions,
|
| 1169 |
+
output_hidden_states=output_hidden_states,
|
| 1170 |
+
return_dict=return_dict,
|
| 1171 |
+
)
|
| 1172 |
+
|
| 1173 |
+
pooled_output = vision_outputs[1] # pooled_output
|
| 1174 |
+
image_features = self.visual_projection(pooled_output)
|
| 1175 |
+
|
| 1176 |
+
return image_features
|
| 1177 |
+
|
| 1178 |
+
@add_start_docstrings_to_model_forward(EvaCLIP_INPUTS_DOCSTRING)
|
| 1179 |
+
@replace_return_docstrings(output_type=EvaCLIPOutput, config_class=EvaCLIPConfig)
|
| 1180 |
+
def forward(
|
| 1181 |
+
self,
|
| 1182 |
+
input_ids: Optional[torch.LongTensor] = None,
|
| 1183 |
+
pixel_values: Optional[torch.FloatTensor] = None,
|
| 1184 |
+
attention_mask: Optional[torch.Tensor] = None,
|
| 1185 |
+
position_ids: Optional[torch.LongTensor] = None,
|
| 1186 |
+
return_loss: Optional[bool] = None,
|
| 1187 |
+
output_attentions: Optional[bool] = None,
|
| 1188 |
+
output_hidden_states: Optional[bool] = None,
|
| 1189 |
+
return_dict: Optional[bool] = None,
|
| 1190 |
+
) -> Union[Tuple, EvaCLIPOutput]:
|
| 1191 |
+
r"""
|
| 1192 |
+
Returns:
|
| 1193 |
+
|
| 1194 |
+
Examples:
|
| 1195 |
+
|
| 1196 |
+
```python
|
| 1197 |
+
>>> from PIL import Image
|
| 1198 |
+
>>> import requests
|
| 1199 |
+
>>> from transformers import AutoProcessor, CLIPModel
|
| 1200 |
+
|
| 1201 |
+
>>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
|
| 1202 |
+
>>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
| 1203 |
+
|
| 1204 |
+
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
| 1205 |
+
>>> image = Image.open(requests.get(url, stream=True).raw)
|
| 1206 |
+
|
| 1207 |
+
>>> inputs = processor(
|
| 1208 |
+
... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
|
| 1209 |
+
... )
|
| 1210 |
+
|
| 1211 |
+
>>> outputs = model(**inputs)
|
| 1212 |
+
>>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
|
| 1213 |
+
>>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
|
| 1214 |
+
```"""
|
| 1215 |
+
# Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
|
| 1216 |
+
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
| 1217 |
+
output_hidden_states = (
|
| 1218 |
+
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
| 1219 |
+
)
|
| 1220 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 1221 |
+
|
| 1222 |
+
vision_outputs = self.vision_model(
|
| 1223 |
+
pixel_values=pixel_values,
|
| 1224 |
+
output_attentions=output_attentions,
|
| 1225 |
+
output_hidden_states=output_hidden_states,
|
| 1226 |
+
return_dict=return_dict,
|
| 1227 |
+
)
|
| 1228 |
+
|
| 1229 |
+
text_outputs = self.text_model(
|
| 1230 |
+
input_ids=input_ids,
|
| 1231 |
+
attention_mask=attention_mask,
|
| 1232 |
+
position_ids=position_ids,
|
| 1233 |
+
output_attentions=output_attentions,
|
| 1234 |
+
output_hidden_states=output_hidden_states,
|
| 1235 |
+
return_dict=return_dict,
|
| 1236 |
+
)
|
| 1237 |
+
|
| 1238 |
+
image_embeds = vision_outputs[1]
|
| 1239 |
+
image_embeds = self.visual_projection(image_embeds)
|
| 1240 |
+
|
| 1241 |
+
text_embeds = text_outputs[1]
|
| 1242 |
+
text_embeds = self.text_projection(text_embeds)
|
| 1243 |
+
|
| 1244 |
+
# normalized features
|
| 1245 |
+
image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
|
| 1246 |
+
text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
|
| 1247 |
+
|
| 1248 |
+
# cosine similarity as logits
|
| 1249 |
+
logit_scale = self.logit_scale.exp()
|
| 1250 |
+
logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
|
| 1251 |
+
logits_per_image = logits_per_text.t()
|
| 1252 |
+
|
| 1253 |
+
loss = None
|
| 1254 |
+
if return_loss:
|
| 1255 |
+
loss = clip_loss(logits_per_text)
|
| 1256 |
+
|
| 1257 |
+
if not return_dict:
|
| 1258 |
+
output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
|
| 1259 |
+
return ((loss,) + output) if loss is not None else output
|
| 1260 |
+
|
| 1261 |
+
return EvaCLIPOutput(
|
| 1262 |
+
loss=loss,
|
| 1263 |
+
logits_per_image=logits_per_image,
|
| 1264 |
+
logits_per_text=logits_per_text,
|
| 1265 |
+
text_embeds=text_embeds,
|
| 1266 |
+
image_embeds=image_embeds,
|
| 1267 |
+
text_model_output=text_outputs,
|
| 1268 |
+
vision_model_output=vision_outputs,
|
| 1269 |
+
)
|
| 1270 |
+
|
| 1271 |
+
|
| 1272 |
+
@add_start_docstrings(
|
| 1273 |
+
"""
|
| 1274 |
+
EvaCLIP Text Model with a projection layer on top (a linear layer on top of the pooled output).
|
| 1275 |
+
""",
|
| 1276 |
+
EvaCLIP_START_DOCSTRING,
|
| 1277 |
+
)
|
| 1278 |
+
class EvaCLIPTextModelWithProjection(EvaCLIPPreTrainedModel):
|
| 1279 |
+
config_class = EvaCLIPTextConfig
|
| 1280 |
+
|
| 1281 |
+
_no_split_modules = ["EvaCLIPEncoderLayer"]
|
| 1282 |
+
|
| 1283 |
+
def __init__(self, config: EvaCLIPTextConfig):
|
| 1284 |
+
super().__init__(config)
|
| 1285 |
+
|
| 1286 |
+
self.text_model = EvaCLIPTextTransformer(config)
|
| 1287 |
+
|
| 1288 |
+
self.text_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)
|
| 1289 |
+
|
| 1290 |
+
# Initialize weights and apply final processing
|
| 1291 |
+
self.posxt_init()
|
| 1292 |
+
|
| 1293 |
+
def get_input_embeddings(self) -> nn.Module:
|
| 1294 |
+
return self.text_model.embeddings.token_embedding
|
| 1295 |
+
|
| 1296 |
+
def set_input_embeddings(self, value):
|
| 1297 |
+
self.text_model.embeddings.token_embedding = value
|
| 1298 |
+
|
| 1299 |
+
@add_start_docstrings_to_model_forward(EvaCLIP_TEXT_INPUTS_DOCSTRING)
|
| 1300 |
+
@replace_return_docstrings(output_type=EvaCLIPTextModelOutput, config_class=EvaCLIPTextConfig)
|
| 1301 |
+
def forward(
|
| 1302 |
+
self,
|
| 1303 |
+
input_ids: Optional[torch.Tensor] = None,
|
| 1304 |
+
attention_mask: Optional[torch.Tensor] = None,
|
| 1305 |
+
position_ids: Optional[torch.Tensor] = None,
|
| 1306 |
+
output_attentions: Optional[bool] = None,
|
| 1307 |
+
output_hidden_states: Optional[bool] = None,
|
| 1308 |
+
return_dict: Optional[bool] = None,
|
| 1309 |
+
) -> Union[Tuple, EvaCLIPTextModelOutput]:
|
| 1310 |
+
r"""
|
| 1311 |
+
Returns:
|
| 1312 |
+
|
| 1313 |
+
Examples:
|
| 1314 |
+
|
| 1315 |
+
```python
|
| 1316 |
+
>>> from transformers import AutoTokenizer, CLIPTextModelWithProjection
|
| 1317 |
+
|
| 1318 |
+
>>> model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
|
| 1319 |
+
>>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")
|
| 1320 |
+
|
| 1321 |
+
>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
|
| 1322 |
+
|
| 1323 |
+
>>> outputs = model(**inputs)
|
| 1324 |
+
>>> text_embeds = outputs.text_embeds
|
| 1325 |
+
```"""
|
| 1326 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 1327 |
+
|
| 1328 |
+
text_outputs = self.text_model(
|
| 1329 |
+
input_ids=input_ids,
|
| 1330 |
+
attention_mask=attention_mask,
|
| 1331 |
+
position_ids=position_ids,
|
| 1332 |
+
output_attentions=output_attentions,
|
| 1333 |
+
output_hidden_states=output_hidden_states,
|
| 1334 |
+
return_dict=return_dict,
|
| 1335 |
+
)
|
| 1336 |
+
|
| 1337 |
+
pooled_output = text_outputs[1]
|
| 1338 |
+
|
| 1339 |
+
text_embeds = self.text_projection(pooled_output)
|
| 1340 |
+
|
| 1341 |
+
if not return_dict:
|
| 1342 |
+
outputs = (text_embeds, text_outputs[0]) + text_outputs[2:]
|
| 1343 |
+
return tuple(output for output in outputs if output is not None)
|
| 1344 |
+
|
| 1345 |
+
return EvaCLIPTextModelOutput(
|
| 1346 |
+
text_embeds=text_embeds,
|
| 1347 |
+
last_hidden_state=text_outputs.last_hidden_state,
|
| 1348 |
+
hidden_states=text_outputs.hidden_states,
|
| 1349 |
+
attentions=text_outputs.attentions,
|
| 1350 |
+
)
|
| 1351 |
+
|
| 1352 |
+
|
| 1353 |
+
@add_start_docstrings(
|
| 1354 |
+
"""
|
| 1355 |
+
EvaCLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output).
|
| 1356 |
+
""",
|
| 1357 |
+
EvaCLIP_START_DOCSTRING,
|
| 1358 |
+
)
|
| 1359 |
+
class EvaCLIPVisionModelWithProjection(EvaCLIPPreTrainedModel):
|
| 1360 |
+
config_class = EvaCLIPVisionConfig
|
| 1361 |
+
main_input_name = "pixel_values"
|
| 1362 |
+
|
| 1363 |
+
def __init__(self, config: EvaCLIPVisionConfig):
|
| 1364 |
+
super().__init__(config)
|
| 1365 |
+
|
| 1366 |
+
self.vision_model = EvaCLIPVisionTransformer(config)
|
| 1367 |
+
|
| 1368 |
+
self.visual_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)
|
| 1369 |
+
|
| 1370 |
+
# Initialize weights and apply final processing
|
| 1371 |
+
self.post_init()
|
| 1372 |
+
|
| 1373 |
+
def get_input_embeddings(self) -> nn.Module:
|
| 1374 |
+
return self.vision_model.embeddings.patch_embedding
|
| 1375 |
+
|
| 1376 |
+
@add_start_docstrings_to_model_forward(EvaCLIP_VISION_INPUTS_DOCSTRING)
|
| 1377 |
+
@replace_return_docstrings(output_type=EvaCLIPVisionModelOutput, config_class=EvaCLIPVisionConfig)
|
| 1378 |
+
def forward(
|
| 1379 |
+
self,
|
| 1380 |
+
pixel_values: Optional[torch.FloatTensor] = None,
|
| 1381 |
+
output_attentions: Optional[bool] = None,
|
| 1382 |
+
output_hidden_states: Optional[bool] = None,
|
| 1383 |
+
return_dict: Optional[bool] = None,
|
| 1384 |
+
) -> Union[Tuple, EvaCLIPVisionModelOutput]:
|
| 1385 |
+
r"""
|
| 1386 |
+
Returns:
|
| 1387 |
+
|
| 1388 |
+
Examples:
|
| 1389 |
+
|
| 1390 |
+
```python
|
| 1391 |
+
>>> from PIL import Image
|
| 1392 |
+
>>> import requests
|
| 1393 |
+
>>> from transformers import AutoProcessor, CLIPVisionModelWithProjection
|
| 1394 |
+
|
| 1395 |
+
>>> model = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
|
| 1396 |
+
>>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
| 1397 |
+
|
| 1398 |
+
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
| 1399 |
+
>>> image = Image.open(requests.get(url, stream=True).raw)
|
| 1400 |
+
|
| 1401 |
+
>>> inputs = processor(images=image, return_tensors="pt")
|
| 1402 |
+
|
| 1403 |
+
>>> outputs = model(**inputs)
|
| 1404 |
+
>>> image_embeds = outputs.image_embeds
|
| 1405 |
+
```"""
|
| 1406 |
+
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
| 1407 |
+
|
| 1408 |
+
vision_outputs = self.vision_model(
|
| 1409 |
+
pixel_values=pixel_values,
|
| 1410 |
+
output_attentions=output_attentions,
|
| 1411 |
+
output_hidden_states=output_hidden_states,
|
| 1412 |
+
return_dict=return_dict,
|
| 1413 |
+
)
|
| 1414 |
+
|
| 1415 |
+
pooled_output = vision_outputs[1] # pooled_output
|
| 1416 |
+
|
| 1417 |
+
image_embeds = self.visual_projection(pooled_output)
|
| 1418 |
+
|
| 1419 |
+
if not return_dict:
|
| 1420 |
+
outputs = (image_embeds, vision_outputs[0]) + vision_outputs[2:]
|
| 1421 |
+
return tuple(output for output in outputs if output is not None)
|
| 1422 |
+
|
| 1423 |
+
return EvaCLIPVisionModelOutput(
|
| 1424 |
+
image_embeds=image_embeds,
|
| 1425 |
+
last_hidden_state=vision_outputs.last_hidden_state,
|
| 1426 |
+
hidden_states=vision_outputs.hidden_states,
|
| 1427 |
+
attentions=vision_outputs.attentions,
|
| 1428 |
+
)
|
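The contrastive head above boils down to a few tensor operations once the projected embeddings exist. Below is a minimal, self-contained sketch of that similarity computation using random stand-in embeddings; the 512-dim projection size and the 2.6592 initial log-scale are illustrative assumptions, not values read from an EvaCLIP checkpoint.

```python
import torch

# Stand-in embeddings: in the real model these come from self.text_projection(...)
# and self.visual_projection(...) applied to the pooled encoder outputs.
text_embeds = torch.randn(4, 512)    # (num_texts, projection_dim)
image_embeds = torch.randn(2, 512)   # (num_images, projection_dim)
logit_scale = torch.tensor(2.6592).exp()  # exp of an assumed learned log-scale parameter

# L2-normalize, then scale the cosine similarities, mirroring forward() above.
image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
logits_per_image = logits_per_text.t()

probs = logits_per_image.softmax(dim=1)  # per-image distribution over the candidate texts
print(probs.shape)  # torch.Size([2, 4])
```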
VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/multimodal_encoder/intern_vit_6b/configuration_intern_vit.py
ADDED
@@ -0,0 +1,117 @@
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import os
from typing import Union

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)


class InternVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to
    instantiate a vision encoder according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        num_channels (`int`, *optional*, defaults to 3):
            Number of color channels in the input images (e.g., 3 for RGB).
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        qkv_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the queries, keys and values in the self-attention layers.
        hidden_size (`int`, *optional*, defaults to 3200):
            Dimensionality of the encoder layers and the pooler layer.
        num_attention_heads (`int`, *optional*, defaults to 25):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (`int`, *optional*, defaults to 12800):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        qk_normalization (`bool`, *optional*, defaults to `True`):
            Whether to normalize the queries and keys in the self-attention layers.
        num_hidden_layers (`int`, *optional*, defaults to 48):
            Number of hidden layers in the Transformer encoder.
        use_flash_attn (`bool`, *optional*, defaults to `True`):
            Whether to use flash attention mechanism.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the layer normalization layers.
        dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        drop_path_rate (`float`, *optional*, defaults to 0.0):
            Dropout rate for stochastic depth.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 0.1):
            A factor for layer scale.
    """

    model_type = 'intern_vit_6b'

    def __init__(
            self,
            num_channels=3,
            patch_size=14,
            image_size=224,
            qkv_bias=False,
            hidden_size=3200,
            num_attention_heads=25,
            intermediate_size=12800,
            qk_normalization=True,
            num_hidden_layers=48,
            use_flash_attn=True,
            hidden_act='gelu',
            layer_norm_eps=1e-6,
            dropout=0.0,
            drop_path_rate=0.0,
            attention_dropout=0.0,
            initializer_range=0.02,
            initializer_factor=0.1,
            **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.dropout = dropout
        self.drop_path_rate = drop_path_rate
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.qkv_bias = qkv_bias
        self.qk_normalization = qk_normalization
        self.use_flash_attn = use_flash_attn

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        if 'vision_config' in config_dict:
            config_dict = config_dict['vision_config']

        if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
            )

        return cls.from_dict(config_dict, **kwargs)
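A quick sketch of how this configuration class is used in practice: instantiate it with overrides and round-trip it through a plain dict, which is the same `from_dict` path that `from_pretrained` above ends with. The import path is an assumption; adjust it to wherever the file lives in your checkout.

```python
from configuration_intern_vit import InternVisionConfig  # import path assumed

config = InternVisionConfig(image_size=448, drop_path_rate=0.1)
print(config.model_type)   # 'intern_vit_6b'
print(config.image_size)   # 448

# from_pretrained() above unwraps a nested 'vision_config' dict; from_dict() is the
# final step it calls, so a plain dict round-trip behaves the same way.
as_dict = config.to_dict()
restored = InternVisionConfig.from_dict(as_dict)
assert restored.image_size == config.image_size
```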
VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/multimodal_encoder/intern_vit_6b/flash_attention.py
ADDED
@@ -0,0 +1,75 @@
import torch
import torch.nn as nn
from einops import rearrange

try:  # v1
    from flash_attn.flash_attn_interface import \
        flash_attn_unpadded_qkvpacked_func
except:  # v2
    from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func

from flash_attn.bert_padding import pad_input, unpad_input


class FlashAttention(nn.Module):
    """Implement the scaled dot product attention with softmax.
    Arguments
    ---------
        softmax_scale: The temperature to use for the softmax attention.
                       (default: 1/sqrt(d_keys) where d_keys is computed at runtime)
        attention_dropout: The dropout rate to apply to the attention (default: 0.0)
    """

    def __init__(self, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None):
        super().__init__()
        self.softmax_scale = softmax_scale
        self.dropout_p = attention_dropout

    def forward(self, qkv, key_padding_mask=None, causal=False, cu_seqlens=None,
                max_s=None, need_weights=False):
        """Implements the multihead softmax attention.
        Arguments
        ---------
            qkv: The tensor containing the query, key, and value. (B, S, 3, H, D) if key_padding_mask is None,
                 if unpadded: (nnz, 3, h, d)
            key_padding_mask: a bool tensor of shape (B, S)
        """
        assert not need_weights
        assert qkv.dtype in [torch.float16, torch.bfloat16]
        assert qkv.is_cuda

        if cu_seqlens is None:
            batch_size = qkv.shape[0]
            seqlen = qkv.shape[1]
            if key_padding_mask is None:
                qkv = rearrange(qkv, 'b s ... -> (b s) ...')
                max_s = seqlen
                cu_seqlens = torch.arange(0, (batch_size + 1) * seqlen, step=seqlen, dtype=torch.int32,
                                          device=qkv.device)
                output = flash_attn_unpadded_qkvpacked_func(
                    qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
                    softmax_scale=self.softmax_scale, causal=causal
                )
                output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
            else:
                nheads = qkv.shape[-2]
                x = rearrange(qkv, 'b s three h d -> b s (three h d)')
                x_unpad, indices, cu_seqlens, max_s = unpad_input(x, key_padding_mask)
                x_unpad = rearrange(x_unpad, 'nnz (three h d) -> nnz three h d', three=3, h=nheads)
                output_unpad = flash_attn_unpadded_qkvpacked_func(
                    x_unpad, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
                    softmax_scale=self.softmax_scale, causal=causal
                )
                output = rearrange(pad_input(rearrange(output_unpad, 'nnz h d -> nnz (h d)'),
                                             indices, batch_size, seqlen),
                                   'b s (h d) -> b s h d', h=nheads)
        else:
            assert max_s is not None
            output = flash_attn_unpadded_qkvpacked_func(
                qkv, cu_seqlens, max_s, self.dropout_p if self.training else 0.0,
                softmax_scale=self.softmax_scale, causal=causal
            )

        return output, None
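For reference, here is a minimal sketch of calling the `FlashAttention` wrapper above on a packed qkv tensor. It assumes a CUDA device and an installed flash-attn package (either v1 or v2, as handled by the import fallback), and uses the `(B, S, 3, H, D)` layout described in the `forward` docstring; the import path is an assumption.

```python
import torch

from flash_attention import FlashAttention  # local module above; adjust the import path

# No padding mask and no cu_seqlens, so the wrapper builds cumulative sequence
# lengths itself and calls the packed-qkv flash-attention kernel.
attn = FlashAttention(attention_dropout=0.0).cuda()
B, S, H, D = 2, 197, 25, 128  # e.g. 224px / patch 14 -> 196 patches + CLS token
qkv = torch.randn(B, S, 3, H, D, device='cuda', dtype=torch.float16)

out, _ = attn(qkv, causal=False)  # out: (B, S, H, D)
print(out.shape)
```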
VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/multimodal_encoder/intern_vit_6b/modeling_intern_vit.py
ADDED
@@ -0,0 +1,354 @@
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
from typing import Optional, Tuple, Union

import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from einops import rearrange
from timm.models.layers import DropPath
from torch import nn
from transformers.activations import ACT2FN
from transformers.modeling_outputs import (BaseModelOutput,
                                           BaseModelOutputWithPooling)
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import logging

from .configuration_intern_vit import InternVisionConfig

try:
    from .flash_attention import FlashAttention
    has_flash_attn = True
except:
    print('FlashAttention is not installed.')
    has_flash_attn = False


logger = logging.get_logger(__name__)


class InternRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)


try:
    from apex.normalization import FusedRMSNorm

    InternRMSNorm = FusedRMSNorm  # noqa

    logger.info('Discovered apex.normalization.FusedRMSNorm - will use it instead of InternRMSNorm')
except ImportError:
    # using the normal InternRMSNorm
    pass
except Exception:
    logger.warning('discovered apex but it failed to load, falling back to InternRMSNorm')
    pass


class InternVisionEmbeddings(nn.Module):
    def __init__(self, config: InternVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(
            torch.randn(1, 1, self.embed_dim),
        )

        self.patch_embedding = nn.Conv2d(
            in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1

        self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))

    def _get_pos_embed(self, pos_embed, H, W):
        target_dtype = pos_embed.dtype
        pos_embed = pos_embed.float().reshape(
            1, self.image_size // self.patch_size, self.image_size // self.patch_size, -1).permute(0, 3, 1, 2)
        pos_embed = F.interpolate(pos_embed, size=(H, W), mode='bicubic', align_corners=False).\
            reshape(1, -1, H * W).permute(0, 2, 1).to(target_dtype)
        return pos_embed

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values)  # shape = [*, channel, width, height]
        batch_size, _, height, width = patch_embeds.shape
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
        class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        position_embedding = torch.cat([
            self.position_embedding[:, :1, :],
            self._get_pos_embed(self.position_embedding[:, 1:, :], height, width)
        ], dim=1)
        embeddings = embeddings + position_embedding.to(target_dtype)
        return embeddings


class InternAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: InternVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.use_flash_attn = config.use_flash_attn and has_flash_attn
        if config.use_flash_attn and not has_flash_attn:
            print('Warning: Flash Attention is not available, use_flash_attn is set to False.')
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f'embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:'
                f' {self.num_heads}).'
            )

        self.scale = self.head_dim ** -0.5
        self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=config.qkv_bias)
        self.attn_drop = nn.Dropout(config.attention_dropout)
        self.proj_drop = nn.Dropout(config.dropout)

        self.qk_normalization = config.qk_normalization

        if self.qk_normalization:
            self.q_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
            self.k_norm = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)

        if self.use_flash_attn:
            self.inner_attn = FlashAttention(attention_dropout=config.attention_dropout)
        self.proj = nn.Linear(self.embed_dim, self.embed_dim)

    def _naive_attn(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)  # make torchscript happy (cannot use tensor as tuple)

        if self.qk_normalization:
            B_, H_, N_, D_ = q.shape
            q = self.q_norm(q.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)
            k = self.k_norm(k.transpose(1, 2).flatten(-2, -1)).view(B_, N_, H_, D_).transpose(1, 2)

        attn = ((q * self.scale) @ k.transpose(-2, -1))
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

    def _flash_attn(self, x, key_padding_mask=None, need_weights=False):
        qkv = self.qkv(x)
        qkv = rearrange(qkv, 'b s (three h d) -> b s three h d', three=3, h=self.num_heads)

        if self.qk_normalization:
            q, k, v = qkv.unbind(2)
            q = self.q_norm(q.flatten(-2, -1)).view(q.shape)
            k = self.k_norm(k.flatten(-2, -1)).view(k.shape)
            qkv = torch.stack([q, k, v], dim=2)

        context, _ = self.inner_attn(
            qkv, key_padding_mask=key_padding_mask, need_weights=need_weights, causal=False
        )
        outs = self.proj(rearrange(context, 'b s h d -> b s (h d)'))
        outs = self.proj_drop(outs)
        return outs

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        x = self._naive_attn(hidden_states) if not self.use_flash_attn else self._flash_attn(hidden_states)
        return x


class InternMLP(nn.Module):
    def __init__(self, config: InternVisionConfig):
        super().__init__()
        self.config = config
        self.act = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class InternVisionEncoderLayer(nn.Module):
    def __init__(self, config: InternVisionConfig, drop_path_rate: float):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.intermediate_size = config.intermediate_size

        self.attn = InternAttention(config)
        self.mlp = InternMLP(config)
        self.norm1 = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.norm2 = InternRMSNorm(self.embed_dim, eps=config.layer_norm_eps)

        self.ls1 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
        self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim))
        self.drop_path1 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
        self.drop_path2 = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()

    def forward(
            self,
            hidden_states: torch.Tensor,
    ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor], Optional[Tuple[torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]`): input to the layer of shape `(batch, seq_len, embed_dim)`
        """
        hidden_states = hidden_states + self.drop_path1(self.attn(self.norm1(hidden_states)) * self.ls1)

        hidden_states = hidden_states + self.drop_path2(self.mlp(self.norm2(hidden_states)) * self.ls2)

        return hidden_states


class InternVisionEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`InternEncoderLayer`].

    Args:
        config (`InternConfig`):
            The corresponding vision configuration for the `InternEncoder`.
    """

    def __init__(self, config: InternVisionConfig):
        super().__init__()
        self.config = config
        # stochastic depth decay rule
        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers)]
        self.layers = nn.ModuleList([
            InternVisionEncoderLayer(config, dpr[idx]) for idx in range(config.num_hidden_layers)])
        self.gradient_checkpointing = True

    def forward(
            self,
            inputs_embeds,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded representation of the inputs. Should be float, not int tokens.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        hidden_states = inputs_embeds

        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            if self.gradient_checkpointing and self.training:
                layer_outputs = torch.utils.checkpoint.checkpoint(
                    encoder_layer,
                    hidden_states)
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                )
            hidden_states = layer_outputs

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_states] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states
        )


class InternVisionModel(PreTrainedModel):
    main_input_name = 'pixel_values'
    config_class = InternVisionConfig

    def __init__(self, config: InternVisionConfig):
        super().__init__(config)
        self.config = config

        self.embeddings = InternVisionEmbeddings(config)
        self.encoder = InternVisionEncoder(config)

    def resize_pos_embeddings(self, old_size, new_size, patch_size):
        pos_emb = self.embeddings.position_embedding
        _, num_positions, embed_dim = pos_emb.shape
        cls_emb = pos_emb[:, :1, :]
        pos_emb = pos_emb[:, 1:, :].reshape(1, old_size // patch_size, old_size // patch_size, -1).permute(0, 3, 1, 2)
        pos_emb = F.interpolate(pos_emb.float(), size=new_size // patch_size, mode='bicubic', align_corners=False)
        pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, -1).permute(0, 2, 1)
        pos_emb = torch.cat([cls_emb, pos_emb], dim=1)
        self.embeddings.position_embedding = nn.Parameter(pos_emb)
        logger.info('Resized position embeddings from {} to {}'.format(old_size, new_size))

    def get_input_embeddings(self):
        return self.embeddings

    def forward(
            self,
            pixel_values: Optional[torch.FloatTensor] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
            pixel_embeds: Optional[torch.FloatTensor] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None and pixel_embeds is None:
            raise ValueError('You have to specify pixel_values or pixel_embeds')

        if pixel_embeds is not None:
            hidden_states = pixel_embeds
        else:
            if len(pixel_values.shape) == 4:
                hidden_states = self.embeddings(pixel_values)
            else:
                raise ValueError(f'wrong pixel_values size: {pixel_values.shape}')
        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        last_hidden_state = encoder_outputs.last_hidden_state
        pooled_output = last_hidden_state[:, 0, :]

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
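To see the whole encoder path end to end, the sketch below builds a deliberately tiny `InternVisionConfig` and runs one forward pass on CPU. The miniature hyperparameters are illustrative only (the released InternViT-6B uses hidden_size=3200 and 48 layers), the package import path is an assumption to be adjusted to your checkout, and `timm` plus `einops` must be installed since the module above imports them.

```python
import torch

# Assumes the intern_vit_6b directory above is importable as a package on sys.path.
from intern_vit_6b.configuration_intern_vit import InternVisionConfig
from intern_vit_6b.modeling_intern_vit import InternVisionModel

config = InternVisionConfig(
    hidden_size=64, num_attention_heads=4, intermediate_size=128,
    num_hidden_layers=2, image_size=56, patch_size=14, use_flash_attn=False,
)
model = InternVisionModel(config).eval()  # eval() also disables gradient checkpointing

pixel_values = torch.randn(1, 3, 56, 56)  # (batch, channels, H, W)
with torch.no_grad():
    out = model(pixel_values=pixel_values)

print(out.last_hidden_state.shape)  # (1, 17, 64): (56/14)**2 = 16 patches + CLS token
print(out.pooler_output.shape)      # (1, 64): the CLS token embedding
```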
VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/multimodal_encoder/internvl_14b/__init__.py
ADDED
@@ -0,0 +1,87 @@
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------

import torch
import torch.nn as nn
import torchvision.transforms as T
from torchvision.transforms import InterpolationMode
from transformers import LlamaTokenizer

from .configuration_intern_vit import InternVisionConfig
from .configuration_internvl import InternVLConfig
from .modeling_intern_vit import InternVisionModel
from .modeling_internvl import InternVL_C, InternVL_G, InternVLModel

__all__ = ['InternVisionConfig', 'InternVisionModel', 'InternVLConfig',
           'InternVLModel', 'InternVL_C', 'InternVL_G']


# Prefix the text "summarize:"
class InternVLTokenizer(nn.Module):
    def __init__(self, model_path):
        super(InternVLTokenizer, self).__init__()
        self.tokenizer = LlamaTokenizer.from_pretrained(model_path)
        self.tokenizer.pad_token = ' '  # allow padding
        self.tokenizer.add_eos_token = True

    def forward(self, text, prefix='summarize:'):
        if type(text) == str:
            text = prefix + text
        elif type(text) == list:
            text = [prefix + item for item in text]
        text = self.tokenizer(text, return_tensors='pt', max_length=80, truncation=True, padding='max_length').input_ids
        return text


def build_transform(task, image_size=224, mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]):
    if task == 'retrieval':
        transform = T.Compose([
            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
            T.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(mean=mean, std=std)])
    else:
        transform = T.Compose([
            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
            T.Resize(image_size, interpolation=InterpolationMode.BICUBIC),
            T.CenterCrop(image_size),
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
    return transform


def load_internvl_c_huggingface(ckpt_path, device, task):
    model = InternVL_C.from_pretrained(ckpt_path, torch_dtype=torch.float16).to(device)
    if model.config.use_backbone_lora:
        model.vision_model.merge_and_unload()
        model.vision_model = model.vision_model.model
    if model.config.use_qllama_lora:
        model.qllama.merge_and_unload()
        model.qllama = model.qllama.model
    if model.config.force_image_size is not None:
        image_size = model.config.force_image_size
    else:
        image_size = model.config.vision_config.image_size
    transform = build_transform(task, image_size)
    tokenizer = InternVLTokenizer(ckpt_path)
    return model, transform, tokenizer


def load_internvl_g_huggingface(ckpt_path, device, task):
    model = InternVL_G.from_pretrained(ckpt_path, torch_dtype=torch.float16).to(device)
    if model.config.use_backbone_lora:
        model.vision_model.merge_and_unload()
        model.vision_model = model.vision_model.model
    if model.config.use_qllama_lora:
        model.qllama.merge_and_unload()
        model.qllama = model.qllama.model
    if model.config.force_image_size is not None:
        image_size = model.config.force_image_size
    else:
        image_size = model.config.vision_config.image_size
    transform = build_transform(task, image_size)
    tokenizer = InternVLTokenizer(ckpt_path)
    return model, transform, tokenizer
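A small sketch of the preprocessing helper above: `'retrieval'` resizes to an exact square, while any other task string resizes the short side and center-crops. The import is an assumption (it presumes the package above is importable as `internvl_14b`), and importing the package will also pull in its heavier modeling modules; the dummy image keeps the example free of downloads.

```python
from PIL import Image

from internvl_14b import build_transform  # package name assumed; adjust to your checkout

img = Image.new('RGB', (640, 480), color=(128, 128, 128))  # in-memory dummy image

retrieval_tf = build_transform('retrieval', image_size=224)  # square resize
caption_tf = build_transform('caption', image_size=224)      # resize short side + center crop

print(retrieval_tf(img).shape)  # torch.Size([3, 224, 224])
print(caption_tf(img).shape)    # torch.Size([3, 224, 224])
```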
VLMEvalKit_old/InternVL/internvl_chat_llava/llava/model/multimodal_encoder/internvl_14b/configuration_intern_vit.py
ADDED
@@ -0,0 +1,117 @@
# --------------------------------------------------------
# InternVL
# Copyright (c) 2023 OpenGVLab
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import os
from typing import Union

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging

logger = logging.get_logger(__name__)


class InternVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`InternVisionModel`]. It is used to
    instantiate a vision encoder according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        num_channels (`int`, *optional*, defaults to 3):
            Number of color channels in the input images (e.g., 3 for RGB).
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        qkv_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the queries, keys and values in the self-attention layers.
        hidden_size (`int`, *optional*, defaults to 3200):
            Dimensionality of the encoder layers and the pooler layer.
        num_attention_heads (`int`, *optional*, defaults to 25):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (`int`, *optional*, defaults to 12800):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        qk_normalization (`bool`, *optional*, defaults to `True`):
            Whether to normalize the queries and keys in the self-attention layers.
        num_hidden_layers (`int`, *optional*, defaults to 48):
            Number of hidden layers in the Transformer encoder.
        use_flash_attn (`bool`, *optional*, defaults to `True`):
            Whether to use flash attention mechanism.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the layer normalization layers.
        dropout (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        drop_path_rate (`float`, *optional*, defaults to 0.0):
            Dropout rate for stochastic depth.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 0.1):
            A factor for layer scale.
    """

    model_type = 'intern_vit_6b'

    def __init__(
            self,
            num_channels=3,
            patch_size=14,
            image_size=224,
            qkv_bias=False,
            hidden_size=3200,
            num_attention_heads=25,
            intermediate_size=12800,
            qk_normalization=True,
            num_hidden_layers=48,
            use_flash_attn=True,
            hidden_act='gelu',
            layer_norm_eps=1e-6,
            dropout=0.0,
            drop_path_rate=0.0,
            attention_dropout=0.0,
            initializer_range=0.02,
            initializer_factor=0.1,
            **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.dropout = dropout
        self.drop_path_rate = drop_path_rate
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.qkv_bias = qkv_bias
        self.qk_normalization = qk_normalization
        self.use_flash_attn = use_flash_attn

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> 'PretrainedConfig':
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        if 'vision_config' in config_dict:
            config_dict = config_dict['vision_config']

        if 'model_type' in config_dict and hasattr(cls, 'model_type') and config_dict['model_type'] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f'{cls.model_type}. This is not supported for all configurations of models and can yield errors.'
            )

        return cls.from_dict(config_dict, **kwargs)
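As with the copy of this file above, `from_pretrained` first fetches the full config dict and then keeps only a nested `vision_config` if one is present. The sketch below reproduces that unwrapping on an in-memory dict instead of a downloaded checkpoint; the field values and import path are illustrative assumptions.

```python
from internvl_14b.configuration_intern_vit import InternVisionConfig  # import path assumed

# A full InternVL config typically nests the vision encoder settings under 'vision_config';
# from_pretrained() above keeps only that sub-dict before building the config object.
full_config_dict = {
    "model_type": "internvl",
    "vision_config": {"model_type": "intern_vit_6b", "image_size": 448, "patch_size": 14},
}

vision_dict = full_config_dict["vision_config"]
vision_config = InternVisionConfig.from_dict(vision_dict)
print(vision_config.image_size, vision_config.num_hidden_layers)  # 448 48 (layers fall back to the default)
```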