安觅 committed on
Commit deb7abb · 1 Parent(s): ba113b4
Files changed (3)
  1. convnext_encoder.py +0 -157
  2. modeling_logics.py +4 -4
  3. siglip_encoder.py +0 -52
convnext_encoder.py CHANGED
@@ -398,163 +398,6 @@ def _init_weights(module, name=None, head_init_scale=1.0):
             module.bias.data.mul_(head_init_scale)
 
 
-
-def checkpoint_filter_fn(state_dict, model):
-    """ Remap FB checkpoints -> timm """
-    if 'head.norm.weight' in state_dict or 'norm_pre.weight' in state_dict:
-        out_dict = {}
-        out_dict = {k.replace('gamma', 'weight'): v for k, v in state_dict.items()}
-        return out_dict  # non-FB checkpoint
-    if 'model' in state_dict:
-        state_dict = state_dict['model']
-
-    out_dict = {}
-    if 'visual.trunk.stem.0.weight' in state_dict:
-        out_dict = {k.replace('visual.trunk.', '').replace('gamma', 'weight'): v for k, v in state_dict.items() if
-                    k.startswith('visual.trunk.')}
-
-        if 'visual.head.proj.weight' in state_dict:
-            out_dict['head.fc.weight'] = state_dict['visual.head.proj.weight']
-            out_dict['head.fc.bias'] = torch.zeros(state_dict['visual.head.proj.weight'].shape[0])
-        elif 'visual.head.mlp.fc1.weight' in state_dict:
-            out_dict['head.pre_logits.fc.weight'] = state_dict['visual.head.mlp.fc1.weight']
-            out_dict['head.pre_logits.fc.bias'] = state_dict['visual.head.mlp.fc1.bias']
-            out_dict['head.fc.weight'] = state_dict['visual.head.mlp.fc2.weight']
-            out_dict['head.fc.bias'] = torch.zeros(state_dict['visual.head.mlp.fc2.weight'].shape[0])
-        return out_dict
-
-    import re
-    for k, v in state_dict.items():
-        k = k.replace('downsample_layers.0.', 'stem.')
-        k = re.sub(r'stages.([0-9]+).([0-9]+)', r'stages.\1.blocks.\2', k)
-        k = re.sub(r'downsample_layers.([0-9]+).([0-9]+)', r'stages.\1.downsample.\2', k)
-        k = k.replace('dwconv', 'conv_dw')
-        k = k.replace('pwconv', 'mlp.fc')
-        if 'grn' in k:
-            k = k.replace('grn.beta', 'mlp.grn.bias')
-            k = k.replace('grn.gamma', 'mlp.grn.weight')
-            v = v.reshape(v.shape[-1])
-        k = k.replace('head.', 'head.fc.')
-        if k.startswith('norm.'):
-            k = k.replace('norm', 'head.norm')
-        if v.ndim == 2 and 'head' not in k:
-            model_shape = model.state_dict()[k].shape
-            v = v.reshape(model_shape)
-        k = k.replace('gamma', 'weight')
-        out_dict[k] = v
-
-    return out_dict
-
-
-def _filter_kwargs(kwargs, names):
-    if not kwargs or not names:
-        return
-    for n in names:
-        kwargs.pop(n, None)
-
-#done
-def _update_default_model_kwargs(pretrained_cfg, kwargs, kwargs_filter):
-    """ Update the default_cfg and kwargs before passing to model
-
-    Args:
-        pretrained_cfg: input pretrained cfg (updated in-place)
-        kwargs: keyword args passed to model build fn (updated in-place)
-        kwargs_filter: keyword arg keys that must be removed before model __init__
-    """
-    # Set model __init__ args that can be determined by default_cfg (if not already passed as kwargs)
-    default_kwarg_names = ('num_classes', 'global_pool', 'in_chans')
-    # if pretrained_cfg.get('fixed_input_size', False):
-    #     # if fixed_input_size exists and is True, model takes an img_size arg that fixes its input size
-    #     default_kwarg_names += ('img_size',)
-
-    for n in default_kwarg_names:
-        # for legacy reasons, model __init__ uses img_size + in_chans as separate args while
-        # pretrained_cfg has one input_size=(C, H, W) entry
-        if n == 'img_size':
-            input_size = pretrained_cfg.get('input_size', None)
-            if input_size is not None:
-                assert len(input_size) == 3
-                kwargs.setdefault(n, input_size[-2:])
-        elif n == 'in_chans':
-            input_size = pretrained_cfg.get('input_size', None)
-            if input_size is not None:
-                assert len(input_size) == 3
-                kwargs.setdefault(n, input_size[0])
-        elif n == 'num_classes':
-            default_val = pretrained_cfg.get(n, None)
-            # if default is < 0, don't pass through to model
-            if default_val is not None and default_val >= 0:
-                kwargs.setdefault(n, pretrained_cfg[n])
-        else:
-            default_val = pretrained_cfg.get(n, None)
-            if default_val is not None:
-                kwargs.setdefault(n, pretrained_cfg[n])
-
-    # Filter keyword args for task specific model variants (some 'features only' models, etc.)
-    _filter_kwargs(kwargs, names=kwargs_filter)
-
-
-def _create_convnext(variant, pretrained=False, **kwargs):
-
-
-    kwargs.pop('xpfs', None)
-
-    pretrained_cfg = {
-        "file": "./eagle_ckeckpoint/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup/open_clip_pytorch_model.bin",
-        "source": "file"}
-    print(f"pretrained_cfg: {pretrained_cfg}")
-
-
-    from timm.models._builder import resolve_pretrained_cfg, load_pretrained
-    pretrained_cfg = resolve_pretrained_cfg(
-        variant,
-        pretrained_cfg=pretrained_cfg,
-        pretrained_cfg_overlay=None
-    )
-    pretrained_cfg = pretrained_cfg.to_dict()
-    _update_default_model_kwargs(pretrained_cfg, kwargs, None)
-    model = ConvNeXt(**kwargs)
-
-    model.pretrained_cfg = pretrained_cfg
-    model.default_cfg = model.pretrained_cfg
-    features = False
-    num_classes_pretrained = 0 if features else getattr(model, 'num_classes', kwargs.get('num_classes', 1000))
-
-    ds_label = False
-    for k, t in dict(model.named_parameters()).items():
-        if hasattr(t, "ds_id"):
-            ds_label = True
-            break
-    if ds_label:
-        from deepspeed import zero
-        with zero.GatheredParameters(list(model.parameters())):
-            load_pretrained(
-                model,
-                pretrained_cfg=pretrained_cfg,
-                num_classes=num_classes_pretrained,
-                in_chans=kwargs.get('in_chans', 3),
-                filter_fn=checkpoint_filter_fn,
-                strict=True,
-            )
-    else:
-        load_pretrained(
-            model,
-            pretrained_cfg=pretrained_cfg,
-            num_classes=num_classes_pretrained,
-            in_chans=kwargs.get('in_chans', 3),
-            filter_fn=checkpoint_filter_fn,
-            strict=True,
-        )
-    return model
-
-
-
-def convnext_xxlarge(pretrained=True, **kwargs) -> ConvNeXt:
-    model_args = dict(depths=[3, 4, 30, 3], dims=[384, 768, 1536, 3072], norm_eps=kwargs.pop('norm_eps', 1e-5), num_classes=1024)
-    model = _create_convnext('convnext_xxlarge', pretrained=pretrained, **dict(model_args, **kwargs))
-    return model
-
-
 cfg={
     "crop_size": 256,
     "do_center_crop": True,
 
modeling_logics.py CHANGED
@@ -222,7 +222,7 @@ class MultiBackboneChannelConcatenationVisionTower(nn.Module):
     def device(self):
         return next(self.clip_vision_tower.parameters()).device
 
-    #done
+
     @property
     def config(self):
         assert NotImplementedError
@@ -248,7 +248,7 @@ def build_vision_projector(config, delay_load=False, **kwargs):
     projector_type = getattr(config, "mm_projector_type", "linear")
     # print(projector_type)
 
-    mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", projector_type)  # todo: check what mlp_gelu_match is here and whether this can be simplified further
+    mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", projector_type)
     if mlp_gelu_match:
         mlp_depth = int(mlp_gelu_match.group(1))
         modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
@@ -452,13 +452,13 @@ class LogicsMetaForCausalLM(ABC):
                     image_feature = image_feature.view(2, 2, height, width, -1)
 
 
-                    # todo: confirm whether the other if branches are ever taken
+
                     if "maxpool2x2" in mm_patch_merge_type:
                         image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
                         image_feature = image_feature.flatten(1, 2).flatten(2, 3)
                         image_feature = nn.functional.max_pool2d(image_feature, 2)
                         image_feature = image_feature.flatten(1, 2).transpose(0, 1)
-                    elif "unpad" in mm_patch_merge_type and "anyres_max" in image_aspect_ratio and matched_anyres_max_num_patches:  # todo: confirm whether the other if branches are ever taken
+                    elif "unpad" in mm_patch_merge_type and "anyres_max" in image_aspect_ratio and matched_anyres_max_num_patches:
                         unit = image_feature.shape[2]
                         image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
                         image_feature = image_feature.flatten(1, 2).flatten(2, 3)
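The regex cleaned up in the second hunk parses projector strings like mlp2x_gelu. In the usual LLaVA-style construction, which the surrounding context lines follow, the matched depth expands into a Linear followed by depth-1 GELU+Linear pairs. A sketch under that assumption; the function name and sizes are illustrative:

import re
import torch.nn as nn

def build_mlp_projector(projector_type: str, mm_hidden_size: int, hidden_size: int) -> nn.Module:
    # "mlp2x_gelu" -> depth 2 -> Linear, GELU, Linear
    mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", projector_type)
    if mlp_gelu_match is None:
        raise ValueError(f"unsupported projector_type: {projector_type}")
    mlp_depth = int(mlp_gelu_match.group(1))
    modules = [nn.Linear(mm_hidden_size, hidden_size)]
    for _ in range(1, mlp_depth):
        modules.append(nn.GELU())
        modules.append(nn.Linear(hidden_size, hidden_size))
    return nn.Sequential(*modules)

# Illustrative sizes only:
proj = build_mlp_projector("mlp2x_gelu", mm_hidden_size=1152, hidden_size=4096)
# -> Sequential(Linear(1152, 4096), GELU(), Linear(4096, 4096))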
 
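For the maxpool2x2 branch whose todo comments were removed in the third hunk: it stitches a 2x2 grid of image tiles, each contributing height x width patch features of dim C, into one (2*height) x (2*width) feature map and max-pools it back down, so four tiles cost one tile's worth of tokens. A shape-level sketch with made-up dimensions:

import torch
import torch.nn as nn

# A 2x2 grid of tiles, each height x width patch features of dim C.
height, width, C = 24, 24, 4096
image_feature = torch.randn(2, 2, height, width, C)

x = image_feature.permute(4, 0, 2, 1, 3).contiguous()  # (C, 2, H, 2, W)
x = x.flatten(1, 2).flatten(2, 3)                      # (C, 2H, 2W): one big feature map
x = nn.functional.max_pool2d(x, 2)                     # (C, H, W): 4x fewer tokens
tokens = x.flatten(1, 2).transpose(0, 1)               # (H*W, C): back to token-major
assert tokens.shape == (height * width, C)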
siglip_encoder.py CHANGED
@@ -58,41 +58,6 @@ class SigLipImageProcessor(BaseImageProcessor):
 
         super().__init__(**kwargs)
 
-    # def preprocess(self, images, return_tensors, **kwargs):
-    #     """
-    #     The logic of this function stays unchanged
-    #     """
-    #     # Note: BaseImageProcessor's preprocess method has a different signature, so we simplify a bit
-    #     # it usually expects images to be a list
-    #     if isinstance(images, Image.Image):
-    #         images = [images]
-    #     else:
-    #         # to adapt video data
-    #         images = [to_numpy_array(image) for image in images]
-    #     assert isinstance(images, list)
-
-    #     # your transform logic
-    #     transforms = [
-    #         convert_to_rgb,
-    #         to_numpy_array,
-    #         partial(resize, size=self.size, resample=self.resample, data_format=self.data_format),
-    #         partial(rescale, scale=self.rescale_factor, data_format=self.data_format),
-    #         partial(normalize, mean=self.image_mean, std=self.image_std, data_format=self.data_format),
-    #         partial(to_channel_dimension_format, channel_dim=self.data_format, input_channel_dim=self.data_format),  # make sure the input dimensions are correct
-    #     ]
-
-    #     processed_images = []
-    #     # for image in images:
-    #     #     img = image
-    #     #     for transform in transforms:
-    #     #         img = transform(img)
-    #     #     processed_images.append(img)
-    #     processed_images = reduce(lambda x, f: [*map(f, x)], transforms, images)
-
-    #     # wrap the result in a BatchFeature
-    #     data = {"pixel_values": processed_images}
-    #     return BatchFeature(data=data, tensor_type=return_tensors)
-
     def preprocess(self, images, return_tensors):
         if isinstance(images, Image.Image):
             images = [images]
@@ -110,9 +75,6 @@ class SigLipImageProcessor(BaseImageProcessor):
             partial(to_channel_dimension_format, channel_dim=self.data_format, input_channel_dim=self.data_format),
         ]
 
-        # images = reduce(lambda x, f: [*map(f, x)], transforms, images)
-        # data = {"pixel_values": images}
-
         processed_images=[]
         for image in images:
             img = image
@@ -362,20 +324,6 @@ class SigLipEncoderLayer(nn.Module):
         return outputs
 
 
-# class SigLipPreTrainedModel(PreTrainedModel):
-#     """
-#     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
-#     models.
-#     """
-
-#     config_class = SigLipVisionConfig
-#     base_model_prefix = "siglip"
-#     supports_gradient_checkpointing = True
-
-#     def _init_weights(self, module):
-#         """Initialize the weights"""
-#         pass
-
 
 # Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->SigLip
 class SigLipEncoder(nn.Module):
 
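The commented-out preprocess removed here applied the transform list functionally, via reduce(lambda x, f: [*map(f, x)], transforms, images), while the kept implementation uses an explicit nested loop. The two forms are equivalent; a toy demonstration with arithmetic stand-ins for the image transforms:

from functools import partial, reduce

# Toy "transforms": multiply by 2, then add 1 (stand-ins for resize/rescale/etc.)
transforms = [partial(lambda s, x: x * s, 2), lambda x: x + 1]
images = [1, 2, 3]

# Functional form from the removed comment: apply each transform to the whole batch.
functional = reduce(lambda x, f: [*map(f, x)], transforms, images)

# Explicit-loop form kept in preprocess(): apply all transforms to each image.
processed_images = []
for image in images:
    img = image
    for transform in transforms:
        img = transform(img)
    processed_images.append(img)

assert functional == processed_images == [3, 5, 7]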