# Scraped file-viewer header: file size 4,969 Bytes; revision id 1ed22f6.
# (The viewer's line-number gutter "1 2 3 ... 136" was removed; it is not part of the module.)
import torch
import torch.nn as nn
from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
class CLIPVisionTower(nn.Module):
    """CLIP vision encoder wrapper that returns hidden-state features for images.

    The wrapper loads a ``CLIPImageProcessor`` and a ``CLIPVisionModel``,
    disables gradients on the model, and exposes the hidden states of one
    selected layer as image features.

    NOTE(review): the model weights are always loaded from
    ``WEIGHTS_CHECKPOINT``, while the image processor and the delayed-load
    config (``cfg_only``) are loaded from ``vision_tower_name``. If those two
    identifiers differ, ``hidden_size`` / ``num_patches`` may report different
    values before and after ``load_model`` -- confirm this is intended.
    """

    # Checkpoint the vision weights are loaded from (independent of the
    # ``vision_tower`` name passed to ``__init__``).
    WEIGHTS_CHECKPOINT = 'laion/CLIP-ViT-bigG-14-laion2B-39B-b160k'
    # Experiment switch: when true, ``load_model`` overlays the state dict
    # stored at ``SEECLICK_WEIGHTS_PATH`` on top of the checkpoint weights.
    USE_SEECLICK_WEIGHTS = False
    SEECLICK_WEIGHTS_PATH = 'vision_encoder.pth'

    def __init__(self, vision_tower, args, delay_load=False):
        """Store the selection settings and load the model unless delayed.

        Args:
            vision_tower: Identifier used for the image processor and for the
                delayed-load config.
            args: Object providing ``mm_vision_select_layer`` and, optionally,
                ``mm_vision_select_feature`` (default ``'patch'``) and
                ``unfreeze_mm_vision_tower`` (default ``False``).
            delay_load: When true and ``unfreeze_mm_vision_tower`` is falsy,
                only the config is fetched and the weights are left unloaded.
        """
        super().__init__()

        self.is_loaded = False
        self.vision_tower_name = vision_tower
        self.select_layer = args.mm_vision_select_layer
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        # Load eagerly unless loading is delayed; a delayed load is overridden
        # when the caller asks for an unfrozen vision tower.
        if not delay_load or getattr(args, 'unfreeze_mm_vision_tower', False):
            self.load_model()
        else:
            self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)

    def load_model(self, device_map=None):
        """Load the image processor and vision model; no-op if already loaded.

        Args:
            device_map: Forwarded to ``CLIPVisionModel.from_pretrained``.
        """
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return

        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
        self.vision_tower = CLIPVisionModel.from_pretrained(self.WEIGHTS_CHECKPOINT, device_map=device_map)

        # An earlier experiment replaced the tower with the SeeClick visual
        # transformer (hidden size 1664); that commented-out code was removed
        # and is available in version control history.
        if self.USE_SEECLICK_WEIGHTS:
            print('++++++++++++++++++++++++++ SeeClick Used ++++++++++++++++++++++++++++++')
            print()
            # NOTE(review): torch.load may unpickle arbitrary objects depending
            # on the torch version / ``weights_only`` setting -- only point
            # SEECLICK_WEIGHTS_PATH at a trusted file.
            new_state_dict = torch.load(self.SEECLICK_WEIGHTS_PATH)
            # strict=False: keys missing from or unexpected in the state dict
            # are tolerated instead of raising.
            self.vision_tower.load_state_dict(new_state_dict, strict=False)
            print('++++++++++++++++++++++++++ SeeClick Used ++++++++++++++++++++++++++++++')
        else:
            print('++++++++++++++++++++++++++ BigG Used ++++++++++++++++++++++++++++++')
            print()
            print('++++++++++++++++++++++++++ BigG Used ++++++++++++++++++++++++++++++')
        print('==========================================================')

        self.vision_tower.requires_grad_(False)
        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Pick the configured hidden-state layer and token subset.

        Args:
            image_forward_outs: Model output exposing ``hidden_states``,
                indexed here with ``self.select_layer``.

        Returns:
            For ``'patch'``, the selected hidden state without index 0 along
            dim 1 (presumably the CLS token -- confirm); for ``'cls_patch'``,
            the selected hidden state unchanged.

        Raises:
            ValueError: If ``self.select_feature`` is neither ``'patch'`` nor
                ``'cls_patch'``.
        """
        image_features = image_forward_outs.hidden_states[self.select_layer]
        if self.select_feature == 'patch':
            return image_features[:, 1:]
        if self.select_feature == 'cls_patch':
            return image_features
        raise ValueError(f'Unexpected select feature: {self.select_feature}')

    def _encode(self, pixel_values):
        """Run the tower on one batch and return features in the input's dtype."""
        outputs = self.vision_tower(
            pixel_values.to(device=self.device, dtype=self.dtype),
            output_hidden_states=True,
        )
        # Cast back so callers receive features in the dtype they passed in.
        return self.feature_select(outputs).to(pixel_values.dtype)

    @torch.no_grad()
    def forward(self, images):
        """Compute image features without tracking gradients.

        Args:
            images: Either a single tensor passed to the model as-is, or a
                list/tuple of tensors, each of which is given a leading
                dimension via ``unsqueeze(0)`` and encoded separately.

        Returns:
            A list with one feature tensor per element when ``images`` is a
            list or tuple; otherwise a single feature tensor.
        """
        # isinstance (rather than an exact type check) also accepts tuples and
        # list subclasses, which previously fell through to the tensor branch.
        if isinstance(images, (list, tuple)):
            return [self._encode(image.unsqueeze(0)) for image in images]
        return self._encode(images)

    @property
    def dummy_feature(self):
        """All-zero tensor of shape ``(1, hidden_size)`` on the tower's device/dtype."""
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        """Dtype reported by the loaded vision model (requires ``load_model``)."""
        return self.vision_tower.dtype

    @property
    def device(self):
        """Device reported by the loaded vision model (requires ``load_model``)."""
        return self.vision_tower.device

    @property
    def config(self):
        """Loaded model's config, or the config fetched at delayed-load time."""
        if self.is_loaded:
            return self.vision_tower.config
        return self.cfg_only

    @property
    def hidden_size(self):
        """``hidden_size`` from the active config."""
        return self.config.hidden_size

    @property
    def num_patches_per_side(self):
        """Integer quotient of the config's ``image_size`` by its ``patch_size``."""
        return self.config.image_size // self.config.patch_size

    @property
    def num_patches(self):
        """Square of ``num_patches_per_side``."""
        return self.num_patches_per_side ** 2
# (end of scraped file-viewer listing)