安觅 committed

Commit: deb7abb · 1 parent: ba113b4
Message: update

Files changed:
- convnext_encoder.py (+0 -157)
- modeling_logics.py (+4 -4)
- siglip_encoder.py (+0 -52)
convnext_encoder.py CHANGED

@@ -398,163 +398,6 @@ def _init_weights(module, name=None, head_init_scale=1.0):
             module.bias.data.mul_(head_init_scale)
 
 
-
-def checkpoint_filter_fn(state_dict, model):
-    """ Remap FB checkpoints -> timm """
-    if 'head.norm.weight' in state_dict or 'norm_pre.weight' in state_dict:
-        out_dict={}
-        out_dict = {k.replace('gamma', 'weight'): v for k, v in state_dict.items()}
-        return out_dict  # non-FB checkpoint
-    if 'model' in state_dict:
-        state_dict = state_dict['model']
-
-    out_dict = {}
-    if 'visual.trunk.stem.0.weight' in state_dict:
-        out_dict = {k.replace('visual.trunk.', '').replace('gamma', 'weight'): v for k, v in state_dict.items() if
-                    k.startswith('visual.trunk.')}
-
-        if 'visual.head.proj.weight' in state_dict:
-            out_dict['head.fc.weight'] = state_dict['visual.head.proj.weight']
-            out_dict['head.fc.bias'] = torch.zeros(state_dict['visual.head.proj.weight'].shape[0])
-        elif 'visual.head.mlp.fc1.weight' in state_dict:
-            out_dict['head.pre_logits.fc.weight'] = state_dict['visual.head.mlp.fc1.weight']
-            out_dict['head.pre_logits.fc.bias'] = state_dict['visual.head.mlp.fc1.bias']
-            out_dict['head.fc.weight'] = state_dict['visual.head.mlp.fc2.weight']
-            out_dict['head.fc.bias'] = torch.zeros(state_dict['visual.head.mlp.fc2.weight'].shape[0])
-        return out_dict
-
-    import re
-    for k, v in state_dict.items():
-        k = k.replace('downsample_layers.0.', 'stem.')
-        k = re.sub(r'stages.([0-9]+).([0-9]+)', r'stages.\1.blocks.\2', k)
-        k = re.sub(r'downsample_layers.([0-9]+).([0-9]+)', r'stages.\1.downsample.\2', k)
-        k = k.replace('dwconv', 'conv_dw')
-        k = k.replace('pwconv', 'mlp.fc')
-        if 'grn' in k:
-            k = k.replace('grn.beta', 'mlp.grn.bias')
-            k = k.replace('grn.gamma', 'mlp.grn.weight')
-            v = v.reshape(v.shape[-1])
-        k = k.replace('head.', 'head.fc.')
-        if k.startswith('norm.'):
-            k = k.replace('norm', 'head.norm')
-        if v.ndim == 2 and 'head' not in k:
-            model_shape = model.state_dict()[k].shape
-            v = v.reshape(model_shape)
-        k=k.replace('gamma','weight')
-        out_dict[k] = v
-
-    return out_dict
-
-
-def _filter_kwargs(kwargs, names):
-    if not kwargs or not names:
-        return
-    for n in names:
-        kwargs.pop(n, None)
-
-#done
-def _update_default_model_kwargs(pretrained_cfg, kwargs, kwargs_filter):
-    """ Update the default_cfg and kwargs before passing to model
-
-    Args:
-        pretrained_cfg: input pretrained cfg (updated in-place)
-        kwargs: keyword args passed to model build fn (updated in-place)
-        kwargs_filter: keyword arg keys that must be removed before model __init__
-    """
-    # Set model __init__ args that can be determined by default_cfg (if not already passed as kwargs)
-    default_kwarg_names = ('num_classes', 'global_pool', 'in_chans')
-    # if pretrained_cfg.get('fixed_input_size', False):
-    #     # if fixed_input_size exists and is True, model takes an img_size arg that fixes its input size
-    #     default_kwarg_names += ('img_size',)
-
-    for n in default_kwarg_names:
-        # for legacy reasons, model __init__args uses img_size + in_chans as separate args while
-        # pretrained_cfg has one input_size=(C, H ,W) entry
-        if n == 'img_size':
-            input_size = pretrained_cfg.get('input_size', None)
-            if input_size is not None:
-                assert len(input_size) == 3
-                kwargs.setdefault(n, input_size[-2:])
-        elif n == 'in_chans':
-            input_size = pretrained_cfg.get('input_size', None)
-            if input_size is not None:
-                assert len(input_size) == 3
-                kwargs.setdefault(n, input_size[0])
-        elif n == 'num_classes':
-            default_val = pretrained_cfg.get(n, None)
-            # if default is < 0, don't pass through to model
-            if default_val is not None and default_val >= 0:
-                kwargs.setdefault(n, pretrained_cfg[n])
-        else:
-            default_val = pretrained_cfg.get(n, None)
-            if default_val is not None:
-                kwargs.setdefault(n, pretrained_cfg[n])
-
-    # Filter keyword args for task specific model variants (some 'features only' models, etc.)
-    _filter_kwargs(kwargs, names=kwargs_filter)
-
-
-def _create_convnext(variant, pretrained=False, **kwargs):
-
-
-    kwargs.pop('xpfs', None)
-
-    pretrained_cfg = {
-        "file": "./eagle_ckeckpoint/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup/open_clip_pytorch_model.bin",
-        "source": "file"}
-    print(f"pretrained_cfg: {pretrained_cfg}")
-
-
-    from timm.models._builder import resolve_pretrained_cfg,load_pretrained
-    pretrained_cfg = resolve_pretrained_cfg(
-        variant,
-        pretrained_cfg=pretrained_cfg,
-        pretrained_cfg_overlay=None
-    )
-    pretrained_cfg = pretrained_cfg.to_dict()
-    _update_default_model_kwargs(pretrained_cfg, kwargs, None)
-    model = ConvNeXt(**kwargs)
-
-    model.pretrained_cfg = pretrained_cfg
-    model.default_cfg = model.pretrained_cfg
-    features = False
-    num_classes_pretrained = 0 if features else getattr(model, 'num_classes', kwargs.get('num_classes', 1000))
-
-    ds_label = False
-    for k, t in dict(model.named_parameters()).items():
-        if hasattr(t, "ds_id"):
-            ds_label = True
-            break
-    if ds_label:
-        from deepspeed import zero
-        with zero.GatheredParameters(list(model.parameters())):
-            load_pretrained(
-                model,
-                pretrained_cfg=pretrained_cfg,
-                num_classes=num_classes_pretrained,
-                in_chans=kwargs.get('in_chans', 3),
-                filter_fn=checkpoint_filter_fn,
-                strict=True,
-            )
-    else:
-        load_pretrained(
-            model,
-            pretrained_cfg=pretrained_cfg,
-            num_classes=num_classes_pretrained,
-            in_chans=kwargs.get('in_chans', 3),
-            filter_fn=checkpoint_filter_fn,
-            strict=True,
-        )
-    return model
-
-
-
-def convnext_xxlarge(pretrained=True, **kwargs) -> ConvNeXt:
-    model_args = dict(depths=[3, 4, 30, 3], dims=[384, 768, 1536, 3072], norm_eps=kwargs.pop('norm_eps', 1e-5), num_classes=1024)
-    model = _create_convnext('convnext_xxlarge', pretrained=pretrained, **dict(model_args, **kwargs))
-    return model
-
-
 cfg={
     "crop_size": 256,
     "do_center_crop": True,
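The bulk of this deletion is the checkpoint-remapping and builder code above. For reference, the core key-rename rules that the removed checkpoint_filter_fn applied when converting FB/OpenCLIP ConvNeXt checkpoints to timm naming can be exercised standalone; a minimal sketch of that subset (the sample keys below are illustrative, not taken from a real checkpoint):

import re

def remap_key(k: str) -> str:
    # Same per-key rename rules as the removed checkpoint_filter_fn.
    k = k.replace('downsample_layers.0.', 'stem.')
    k = re.sub(r'stages.([0-9]+).([0-9]+)', r'stages.\1.blocks.\2', k)
    k = re.sub(r'downsample_layers.([0-9]+).([0-9]+)', r'stages.\1.downsample.\2', k)
    k = k.replace('dwconv', 'conv_dw')
    k = k.replace('pwconv', 'mlp.fc')
    k = k.replace('gamma', 'weight')
    return k

# Illustrative FB-style keys -> timm-style keys
print(remap_key('downsample_layers.0.0.weight'))  # stem.0.weight
print(remap_key('stages.2.7.dwconv.weight'))      # stages.2.blocks.7.conv_dw.weight
print(remap_key('downsample_layers.1.0.weight'))  # stages.1.downsample.0.weight
print(remap_key('stages.0.1.pwconv1.weight'))     # stages.0.blocks.1.mlp.fc1.weight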
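The removed _create_convnext detected DeepSpeed ZeRO-3 partitioning by checking for the ds_id attribute DeepSpeed attaches to partitioned parameters, and gathered the full parameters before calling load_pretrained. A minimal sketch of that pattern, assuming a DeepSpeed ZeRO-3 run (note that deepspeed.zero.GatheredParameters only writes in-place modifications back to the shards when modifier_rank is passed, which the removed code did not do):

import torch

def load_full_state_dict(model: torch.nn.Module, state_dict: dict) -> None:
    # ZeRO-3 shards parameters across ranks; DeepSpeed tags them with `ds_id`.
    is_zero3 = any(hasattr(p, "ds_id") for p in model.parameters())
    if is_zero3:
        from deepspeed import zero
        # Gather the full parameters; modifications made on modifier_rank are
        # re-partitioned on context exit.
        with zero.GatheredParameters(list(model.parameters()), modifier_rank=0):
            if torch.distributed.get_rank() == 0:
                model.load_state_dict(state_dict, strict=True)
    else:
        model.load_state_dict(state_dict, strict=True)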
modeling_logics.py CHANGED

@@ -222,7 +222,7 @@ class MultiBackboneChannelConcatenationVisionTower(nn.Module):
     def device(self):
         return next(self.clip_vision_tower.parameters()).device
 
-
+
     @property
     def config(self):
         assert NotImplementedError
@@ -248,7 +248,7 @@ def build_vision_projector(config, delay_load=False, **kwargs):
     projector_type = getattr(config, "mm_projector_type", "linear")
     # print(projector_type)
 
-    mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", projector_type)
+    mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", projector_type)
     if mlp_gelu_match:
         mlp_depth = int(mlp_gelu_match.group(1))
         modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
@@ -452,13 +452,13 @@ class LogicsMetaForCausalLM(ABC):
                image_feature = image_feature.view(2, 2, height, width, -1)
 
 
-
+
                if "maxpool2x2" in mm_patch_merge_type:
                    image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
                    image_feature = image_feature.flatten(1, 2).flatten(2, 3)
                    image_feature = nn.functional.max_pool2d(image_feature, 2)
                    image_feature = image_feature.flatten(1, 2).transpose(0, 1)
-                elif "unpad" in mm_patch_merge_type and "anyres_max" in image_aspect_ratio and matched_anyres_max_num_patches:
+                elif "unpad" in mm_patch_merge_type and "anyres_max" in image_aspect_ratio and matched_anyres_max_num_patches:
                    unit = image_feature.shape[2]
                    image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
                    image_feature = image_feature.flatten(1, 2).flatten(2, 3)
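As rendered in the diff view, the removed and added lines of all three hunks are visually identical, so these +4 -4 changes appear to be whitespace-only. Two asides on the surrounding code: first, `assert NotImplementedError` in the config property always passes (the bare exception class is truthy), so `raise NotImplementedError` is presumably the intent. Second, the `mlpNx_gelu` projector that hunk 2 touches follows the common LLaVA-style builder; a minimal sketch of that pattern (the GELU/Linear loop after the first layer is an assumption based on that convention, not shown in the diff):

import re
import torch.nn as nn

def build_mlp_projector(projector_type: str, mm_hidden_size: int, hidden_size: int) -> nn.Module:
    # "mlp2x_gelu" -> Linear, GELU, Linear (depth parsed from the name).
    match = re.match(r"^mlp(\d+)x_gelu$", projector_type)
    if match is None:
        raise ValueError(f"unsupported projector type: {projector_type}")
    depth = int(match.group(1))
    modules = [nn.Linear(mm_hidden_size, hidden_size)]
    for _ in range(1, depth):  # assumption: LLaVA-style continuation
        modules.append(nn.GELU())
        modules.append(nn.Linear(hidden_size, hidden_size))
    return nn.Sequential(*modules)

proj = build_mlp_projector("mlp2x_gelu", 1152, 4096)  # illustrative sizes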
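The "maxpool2x2" branch in hunk 3 merges a grid of per-tile patch features into one token sequence. A shape walk-through with illustrative sizes (2x2 tiles of 24x24 patches, 8 channels; the real channel width comes from the vision tower):

import torch
import torch.nn as nn

tiles_y, tiles_x, height, width, channels = 2, 2, 24, 24, 8
x = torch.randn(tiles_y, tiles_x, height, width, channels)
x = x.permute(4, 0, 2, 1, 3).contiguous()  # (C, tiles_y, H, tiles_x, W)
x = x.flatten(1, 2).flatten(2, 3)          # (C, tiles_y*H, tiles_x*W) = (8, 48, 48)
x = nn.functional.max_pool2d(x, 2)         # 2x2 max pool halves each side -> (8, 24, 24)
x = x.flatten(1, 2).transpose(0, 1)        # (tokens, C)
print(x.shape)                             # torch.Size([576, 8])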
siglip_encoder.py CHANGED

@@ -58,41 +58,6 @@ class SigLipImageProcessor(BaseImageProcessor):
 
         super().__init__(**kwargs)
 
-    # def preprocess(self, images, return_tensors, **kwargs):
-    #     """
-    #     The logic of this function stays unchanged.
-    #     """
-    #     # Note: BaseImageProcessor's preprocess method has a different signature; simplify a bit here
-    #     # it usually expects images to be a list
-    #     if isinstance(images, Image.Image):
-    #         images = [images]
-    #     else:
-    #         # to adapt video data
-    #         images = [to_numpy_array(image) for image in images]
-    #     assert isinstance(images, list)
-
-    #     # your transform logic
-    #     transforms = [
-    #         convert_to_rgb,
-    #         to_numpy_array,
-    #         partial(resize, size=self.size, resample=self.resample, data_format=self.data_format),
-    #         partial(rescale, scale=self.rescale_factor, data_format=self.data_format),
-    #         partial(normalize, mean=self.image_mean, std=self.image_std, data_format=self.data_format),
-    #         partial(to_channel_dimension_format, channel_dim=self.data_format, input_channel_dim=self.data_format),  # make sure the input dims are correct
-    #     ]
-
-    #     processed_images = []
-    #     # for image in images:
-    #     #     img = image
-    #     #     for transform in transforms:
-    #     #         img = transform(img)
-    #     #     processed_images.append(img)
-    #     processed_images = reduce(lambda x, f: [*map(f, x)], transforms, images)
-
-    #     # wrap the result in a BatchFeature
-    #     data = {"pixel_values": processed_images}
-    #     return BatchFeature(data=data, tensor_type=return_tensors)
-
     def preprocess(self, images, return_tensors):
         if isinstance(images, Image.Image):
             images = [images]
@@ -110,9 +75,6 @@ class SigLipImageProcessor(BaseImageProcessor):
             partial(to_channel_dimension_format, channel_dim=self.data_format, input_channel_dim=self.data_format),
         ]
 
-        # images = reduce(lambda x, f: [*map(f, x)], transforms, images)
-        # data = {"pixel_values": images}
-
         processed_images=[]
         for image in images:
             img = image
@@ -362,20 +324,6 @@ class SigLipEncoderLayer(nn.Module):
         return outputs
 
 
-# class SigLipPreTrainedModel(PreTrainedModel):
-#     """
-#     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
-#     models.
-#     """
-
-#     config_class = SigLipVisionConfig
-#     base_model_prefix = "siglip"
-#     supports_gradient_checkpointing = True
-
-#     def _init_weights(self, module):
-#         """Initialize the weights"""
-#         pass
-
 
 # Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->SigLip
 class SigLipEncoder(nn.Module):
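The deleted comments in hunks 1 and 2 were an alternative one-liner for the per-image transform pipeline that the kept preprocess implements with an explicit loop. The two are equivalent; a self-contained illustration with stand-in transforms:

from functools import reduce

transforms = [str.strip, str.upper]  # stand-ins for resize/rescale/normalize
images = ["  cat ", " dog  "]

# Deleted style: fold each transform over the whole image list.
a = reduce(lambda x, f: [*map(f, x)], transforms, images)

# Kept style: apply every transform to one image at a time.
b = []
for image in images:
    img = image
    for transform in transforms:
        img = transform(img)
    b.append(img)

assert a == b  # ['CAT', 'DOG'] either way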
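For completeness, a hypothetical round trip through the processor's kept preprocess path (the import path, constructor defaults, and resulting shape are all assumptions based on the file shown; values below are illustrative):

from PIL import Image
from siglip_encoder import SigLipImageProcessor  # assuming the module is importable by file name

processor = SigLipImageProcessor()                         # defaults as configured in siglip_encoder.py
image = Image.new("RGB", (640, 480))                       # dummy input
batch = processor.preprocess(image, return_tensors="pt")
print(batch["pixel_values"].shape)                         # e.g. (1, 3, 384, 384), depending on self.size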