安觅 committed

Commit: deb7abb · 1 parent: ba113b4
Message: update

Files changed:
- convnext_encoder.py (+0 -157)
- modeling_logics.py (+4 -4)
- siglip_encoder.py (+0 -52)
convnext_encoder.py CHANGED

@@ -398,163 +398,6 @@ def _init_weights(module, name=None, head_init_scale=1.0):
             module.bias.data.mul_(head_init_scale)
 
 
-
-def checkpoint_filter_fn(state_dict, model):
-    """ Remap FB checkpoints -> timm """
-    if 'head.norm.weight' in state_dict or 'norm_pre.weight' in state_dict:
-        out_dict={}
-        out_dict = {k.replace('gamma', 'weight'): v for k, v in state_dict.items()}
-        return out_dict  # non-FB checkpoint
-    if 'model' in state_dict:
-        state_dict = state_dict['model']
-
-    out_dict = {}
-    if 'visual.trunk.stem.0.weight' in state_dict:
-        out_dict = {k.replace('visual.trunk.', '').replace('gamma', 'weight'): v for k, v in state_dict.items() if
-                    k.startswith('visual.trunk.')}
-
-        if 'visual.head.proj.weight' in state_dict:
-            out_dict['head.fc.weight'] = state_dict['visual.head.proj.weight']
-            out_dict['head.fc.bias'] = torch.zeros(state_dict['visual.head.proj.weight'].shape[0])
-        elif 'visual.head.mlp.fc1.weight' in state_dict:
-            out_dict['head.pre_logits.fc.weight'] = state_dict['visual.head.mlp.fc1.weight']
-            out_dict['head.pre_logits.fc.bias'] = state_dict['visual.head.mlp.fc1.bias']
-            out_dict['head.fc.weight'] = state_dict['visual.head.mlp.fc2.weight']
-            out_dict['head.fc.bias'] = torch.zeros(state_dict['visual.head.mlp.fc2.weight'].shape[0])
-        return out_dict
-
-    import re
-    for k, v in state_dict.items():
-        k = k.replace('downsample_layers.0.', 'stem.')
-        k = re.sub(r'stages.([0-9]+).([0-9]+)', r'stages.\1.blocks.\2', k)
-        k = re.sub(r'downsample_layers.([0-9]+).([0-9]+)', r'stages.\1.downsample.\2', k)
-        k = k.replace('dwconv', 'conv_dw')
-        k = k.replace('pwconv', 'mlp.fc')
-        if 'grn' in k:
-            k = k.replace('grn.beta', 'mlp.grn.bias')
-            k = k.replace('grn.gamma', 'mlp.grn.weight')
-            v = v.reshape(v.shape[-1])
-        k = k.replace('head.', 'head.fc.')
-        if k.startswith('norm.'):
-            k = k.replace('norm', 'head.norm')
-        if v.ndim == 2 and 'head' not in k:
-            model_shape = model.state_dict()[k].shape
-            v = v.reshape(model_shape)
-        k=k.replace('gamma','weight')
-        out_dict[k] = v
-
-    return out_dict
-
-
-def _filter_kwargs(kwargs, names):
-    if not kwargs or not names:
-        return
-    for n in names:
-        kwargs.pop(n, None)
-
-#done
-def _update_default_model_kwargs(pretrained_cfg, kwargs, kwargs_filter):
-    """ Update the default_cfg and kwargs before passing to model
-
-    Args:
-        pretrained_cfg: input pretrained cfg (updated in-place)
-        kwargs: keyword args passed to model build fn (updated in-place)
-        kwargs_filter: keyword arg keys that must be removed before model __init__
-    """
-    # Set model __init__ args that can be determined by default_cfg (if not already passed as kwargs)
-    default_kwarg_names = ('num_classes', 'global_pool', 'in_chans')
-    # if pretrained_cfg.get('fixed_input_size', False):
-    #     # if fixed_input_size exists and is True, model takes an img_size arg that fixes its input size
-    #     default_kwarg_names += ('img_size',)
-
-    for n in default_kwarg_names:
-        # for legacy reasons, model __init__args uses img_size + in_chans as separate args while
-        # pretrained_cfg has one input_size=(C, H ,W) entry
-        if n == 'img_size':
-            input_size = pretrained_cfg.get('input_size', None)
-            if input_size is not None:
-                assert len(input_size) == 3
-                kwargs.setdefault(n, input_size[-2:])
-        elif n == 'in_chans':
-            input_size = pretrained_cfg.get('input_size', None)
-            if input_size is not None:
-                assert len(input_size) == 3
-                kwargs.setdefault(n, input_size[0])
-        elif n == 'num_classes':
-            default_val = pretrained_cfg.get(n, None)
-            # if default is < 0, don't pass through to model
-            if default_val is not None and default_val >= 0:
-                kwargs.setdefault(n, pretrained_cfg[n])
-        else:
-            default_val = pretrained_cfg.get(n, None)
-            if default_val is not None:
-                kwargs.setdefault(n, pretrained_cfg[n])
-
-    # Filter keyword args for task specific model variants (some 'features only' models, etc.)
-    _filter_kwargs(kwargs, names=kwargs_filter)
-
-
-def _create_convnext(variant, pretrained=False, **kwargs):
-
-
-    kwargs.pop('xpfs', None)
-
-    pretrained_cfg = {
-        "file": "./eagle_ckeckpoint/CLIP-convnext_xxlarge-laion2B-s34B-b82K-augreg-soup/open_clip_pytorch_model.bin",
-        "source": "file"}
-    print(f"pretrained_cfg: {pretrained_cfg}")
-
-
-    from timm.models._builder import resolve_pretrained_cfg,load_pretrained
-    pretrained_cfg = resolve_pretrained_cfg(
-        variant,
-        pretrained_cfg=pretrained_cfg,
-        pretrained_cfg_overlay=None
-    )
-    pretrained_cfg = pretrained_cfg.to_dict()
-    _update_default_model_kwargs(pretrained_cfg, kwargs, None)
-    model = ConvNeXt(**kwargs)
-
-    model.pretrained_cfg = pretrained_cfg
-    model.default_cfg = model.pretrained_cfg
-    features = False
-    num_classes_pretrained = 0 if features else getattr(model, 'num_classes', kwargs.get('num_classes', 1000))
-
-    ds_label = False
-    for k, t in dict(model.named_parameters()).items():
-        if hasattr(t, "ds_id"):
-            ds_label = True
-            break
-    if ds_label:
-        from deepspeed import zero
-        with zero.GatheredParameters(list(model.parameters())):
-            load_pretrained(
-                model,
-                pretrained_cfg=pretrained_cfg,
-                num_classes=num_classes_pretrained,
-                in_chans=kwargs.get('in_chans', 3),
-                filter_fn=checkpoint_filter_fn,
-                strict=True,
-            )
-    else:
-        load_pretrained(
-            model,
-            pretrained_cfg=pretrained_cfg,
-            num_classes=num_classes_pretrained,
-            in_chans=kwargs.get('in_chans', 3),
-            filter_fn=checkpoint_filter_fn,
-            strict=True,
-        )
-    return model
-
-
-
-def convnext_xxlarge(pretrained=True, **kwargs) -> ConvNeXt:
-    model_args = dict(depths=[3, 4, 30, 3], dims=[384, 768, 1536, 3072], norm_eps=kwargs.pop('norm_eps', 1e-5), num_classes=1024)
-    model = _create_convnext('convnext_xxlarge', pretrained=pretrained, **dict(model_args, **kwargs))
-    return model
-
-
 cfg={
     "crop_size": 256,
     "do_center_crop": True,
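The bulk of this deletion is the checkpoint-remapping and builder code above. For reference, the core key-rename rules that the removed checkpoint_filter_fn applied when converting FB/OpenCLIP ConvNeXt checkpoints to timm naming can be exercised standalone; a minimal sketch of that subset (the sample keys below are illustrative, not taken from a real checkpoint):

import re

def remap_key(k: str) -> str:
    # Same per-key rename rules as the removed checkpoint_filter_fn.
    k = k.replace('downsample_layers.0.', 'stem.')
    k = re.sub(r'stages.([0-9]+).([0-9]+)', r'stages.\1.blocks.\2', k)
    k = re.sub(r'downsample_layers.([0-9]+).([0-9]+)', r'stages.\1.downsample.\2', k)
    k = k.replace('dwconv', 'conv_dw')
    k = k.replace('pwconv', 'mlp.fc')
    k = k.replace('gamma', 'weight')
    return k

# Illustrative FB-style keys -> timm-style keys
print(remap_key('downsample_layers.0.0.weight'))  # stem.0.weight
print(remap_key('stages.2.7.dwconv.weight'))      # stages.2.blocks.7.conv_dw.weight
print(remap_key('downsample_layers.1.0.weight'))  # stages.1.downsample.0.weight
print(remap_key('stages.0.1.pwconv1.weight'))     # stages.0.blocks.1.mlp.fc1.weight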
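The removed _create_convnext detected DeepSpeed ZeRO-3 partitioning by checking for the ds_id attribute DeepSpeed attaches to partitioned parameters, and gathered the full parameters before calling load_pretrained. A minimal sketch of that pattern, assuming a DeepSpeed ZeRO-3 run (note that deepspeed.zero.GatheredParameters only writes in-place modifications back to the shards when modifier_rank is passed, which the removed code did not do):

import torch

def load_full_state_dict(model: torch.nn.Module, state_dict: dict) -> None:
    # ZeRO-3 shards parameters across ranks; DeepSpeed tags them with `ds_id`.
    is_zero3 = any(hasattr(p, "ds_id") for p in model.parameters())
    if is_zero3:
        from deepspeed import zero
        # Gather the full parameters; modifications made on modifier_rank are
        # re-partitioned on context exit.
        with zero.GatheredParameters(list(model.parameters()), modifier_rank=0):
            if torch.distributed.get_rank() == 0:
                model.load_state_dict(state_dict, strict=True)
    else:
        model.load_state_dict(state_dict, strict=True)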
modeling_logics.py CHANGED

@@ -222,7 +222,7 @@ class MultiBackboneChannelConcatenationVisionTower(nn.Module):
     def device(self):
         return next(self.clip_vision_tower.parameters()).device
 
-
+
     @property
     def config(self):
         assert NotImplementedError
@@ -248,7 +248,7 @@ def build_vision_projector(config, delay_load=False, **kwargs):
     projector_type = getattr(config, "mm_projector_type", "linear")
     # print(projector_type)
 
-    mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", projector_type)
+    mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", projector_type)
     if mlp_gelu_match:
         mlp_depth = int(mlp_gelu_match.group(1))
         modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
@@ -452,13 +452,13 @@ class LogicsMetaForCausalLM(ABC):
                image_feature = image_feature.view(2, 2, height, width, -1)
 
 
-
+
                if "maxpool2x2" in mm_patch_merge_type:
                    image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
                    image_feature = image_feature.flatten(1, 2).flatten(2, 3)
                    image_feature = nn.functional.max_pool2d(image_feature, 2)
                    image_feature = image_feature.flatten(1, 2).transpose(0, 1)
-                elif "unpad" in mm_patch_merge_type and "anyres_max" in image_aspect_ratio and matched_anyres_max_num_patches:
+                elif "unpad" in mm_patch_merge_type and "anyres_max" in image_aspect_ratio and matched_anyres_max_num_patches:
                    unit = image_feature.shape[2]
                    image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
                    image_feature = image_feature.flatten(1, 2).flatten(2, 3)
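As rendered in the diff view, the removed and added lines of all three hunks are visually identical, so these +4 -4 changes appear to be whitespace-only. Two asides on the surrounding code: first, `assert NotImplementedError` in the config property always passes (the bare exception class is truthy), so `raise NotImplementedError` is presumably the intent. Second, the `mlpNx_gelu` projector that hunk 2 touches follows the common LLaVA-style builder; a minimal sketch of that pattern (the GELU/Linear loop after the first layer is an assumption based on that convention, not shown in the diff):

import re
import torch.nn as nn

def build_mlp_projector(projector_type: str, mm_hidden_size: int, hidden_size: int) -> nn.Module:
    # "mlp2x_gelu" -> Linear, GELU, Linear (depth parsed from the name).
    match = re.match(r"^mlp(\d+)x_gelu$", projector_type)
    if match is None:
        raise ValueError(f"unsupported projector type: {projector_type}")
    depth = int(match.group(1))
    modules = [nn.Linear(mm_hidden_size, hidden_size)]
    for _ in range(1, depth):  # assumption: LLaVA-style continuation
        modules.append(nn.GELU())
        modules.append(nn.Linear(hidden_size, hidden_size))
    return nn.Sequential(*modules)

proj = build_mlp_projector("mlp2x_gelu", 1152, 4096)  # illustrative sizes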
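The "maxpool2x2" branch in hunk 3 merges a grid of per-tile patch features into one token sequence. A shape walk-through with illustrative sizes (2x2 tiles of 24x24 patches, 8 channels; the real channel width comes from the vision tower):

import torch
import torch.nn as nn

tiles_y, tiles_x, height, width, channels = 2, 2, 24, 24, 8
x = torch.randn(tiles_y, tiles_x, height, width, channels)
x = x.permute(4, 0, 2, 1, 3).contiguous()  # (C, tiles_y, H, tiles_x, W)
x = x.flatten(1, 2).flatten(2, 3)          # (C, tiles_y*H, tiles_x*W) = (8, 48, 48)
x = nn.functional.max_pool2d(x, 2)         # 2x2 max pool halves each side -> (8, 24, 24)
x = x.flatten(1, 2).transpose(0, 1)        # (tokens, C)
print(x.shape)                             # torch.Size([576, 8])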
siglip_encoder.py CHANGED

@@ -58,41 +58,6 @@ class SigLipImageProcessor(BaseImageProcessor):
 
         super().__init__(**kwargs)
 
-    # def preprocess(self, images, return_tensors, **kwargs):
-    #     """
-    #     The logic of this function stays unchanged.
-    #     """
-    #     # Note: BaseImageProcessor's preprocess method has a different signature; simplify a bit here
-    #     # it usually expects images to be a list
-    #     if isinstance(images, Image.Image):
-    #         images = [images]
-    #     else:
-    #         # to adapt video data
-    #         images = [to_numpy_array(image) for image in images]
-    #     assert isinstance(images, list)
-
-    #     # your transform logic
-    #     transforms = [
-    #         convert_to_rgb,
-    #         to_numpy_array,
-    #         partial(resize, size=self.size, resample=self.resample, data_format=self.data_format),
-    #         partial(rescale, scale=self.rescale_factor, data_format=self.data_format),
-    #         partial(normalize, mean=self.image_mean, std=self.image_std, data_format=self.data_format),
-    #         partial(to_channel_dimension_format, channel_dim=self.data_format, input_channel_dim=self.data_format),  # make sure the input dims are correct
-    #     ]
-
-    #     processed_images = []
-    #     # for image in images:
-    #     #     img = image
-    #     #     for transform in transforms:
-    #     #         img = transform(img)
-    #     #     processed_images.append(img)
-    #     processed_images = reduce(lambda x, f: [*map(f, x)], transforms, images)
-
-    #     # wrap the result in a BatchFeature
-    #     data = {"pixel_values": processed_images}
-    #     return BatchFeature(data=data, tensor_type=return_tensors)
-
     def preprocess(self, images, return_tensors):
         if isinstance(images, Image.Image):
             images = [images]
@@ -110,9 +75,6 @@ class SigLipImageProcessor(BaseImageProcessor):
             partial(to_channel_dimension_format, channel_dim=self.data_format, input_channel_dim=self.data_format),
         ]
 
-        # images = reduce(lambda x, f: [*map(f, x)], transforms, images)
-        # data = {"pixel_values": images}
-
         processed_images=[]
         for image in images:
             img = image
@@ -362,20 +324,6 @@ class SigLipEncoderLayer(nn.Module):
         return outputs
 
 
-# class SigLipPreTrainedModel(PreTrainedModel):
-#     """
-#     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
-#     models.
-#     """
-
-#     config_class = SigLipVisionConfig
-#     base_model_prefix = "siglip"
-#     supports_gradient_checkpointing = True
-
-#     def _init_weights(self, module):
-#         """Initialize the weights"""
-#         pass
-
 
 # Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->SigLip
 class SigLipEncoder(nn.Module):
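The deleted comments in hunks 1 and 2 were an alternative one-liner for the per-image transform pipeline that the kept preprocess implements with an explicit loop. The two are equivalent; a self-contained illustration with stand-in transforms:

from functools import reduce

transforms = [str.strip, str.upper]  # stand-ins for resize/rescale/normalize
images = ["  cat ", " dog  "]

# Deleted style: fold each transform over the whole image list.
a = reduce(lambda x, f: [*map(f, x)], transforms, images)

# Kept style: apply every transform to one image at a time.
b = []
for image in images:
    img = image
    for transform in transforms:
        img = transform(img)
    b.append(img)

assert a == b  # ['CAT', 'DOG'] either way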
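For completeness, a hypothetical round trip through the processor's kept preprocess path (the import path, constructor defaults, and resulting shape are all assumptions based on the file shown; values below are illustrative):

from PIL import Image
from siglip_encoder import SigLipImageProcessor  # assuming the module is importable by file name

processor = SigLipImageProcessor()                         # defaults as configured in siglip_encoder.py
image = Image.new("RGB", (640, 480))                       # dummy input
batch = processor.preprocess(image, return_tensors="pt")
print(batch["pixel_values"].shape)                         # e.g. (1, 3, 384, 384), depending on self.size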