clean modeling + fix config double loading

Files changed (4) hide show

config.json +4 -6
configuration_modernvbert.py +545 -0
configuration_vbert.py +0 -233
modeling_vbert.py → modeling_modernvbert.py +86 -266

config.json CHANGED Viewed

@@ -1,12 +1,12 @@
 {
   "additional_vocab_size": 40,
   "architectures": [
-    "VBertForMaskedLM"
   ],
   "auto_map": {
-    "AutoConfig": "configuration_vbert.VBertConfig",
-    "AutoModel": "modeling_vbert.VBertModel",
-    "AutoModelForMaskedLM": "modeling_vbert.VBertForMaskedLM"
   },
   "freeze_config": {
     "freeze_lm_head": true,
@@ -27,7 +27,6 @@
     "hidden_size": 768,
     "intermediate_size": 1152,
     "mlp_bias": false,
-    "model_type": "vbert",
     "num_hidden_layers": 22,
     "text_model_name": "jhu-clsp/ettin-encoder-150m",
     "vocab_size": 50368
@@ -41,7 +40,6 @@
     "embed_dim": 768,
     "image_size": 512,
     "intermediate_size": 3072,
-    "model_type": "vbert",
     "num_hidden_layers": 12,
     "patch_size": 16,
     "vision_model_name": "google/siglip2-base-patch16-512"

 {
   "additional_vocab_size": 40,
   "architectures": [
+    "ModernVBertForMaskedLM"
   ],
   "auto_map": {
+    "AutoConfig": "configuration_modernvbert.ModernVBertConfig",
+    "AutoModel": "modeling_modernvbert.ModernVBertModel",
+    "AutoModelForMaskedLM": "modeling_modernvbert.ModernVBertForMaskedLM"
   },
   "freeze_config": {
     "freeze_lm_head": true,
     "hidden_size": 768,
     "intermediate_size": 1152,
     "mlp_bias": false,
     "num_hidden_layers": 22,
     "text_model_name": "jhu-clsp/ettin-encoder-150m",
     "vocab_size": 50368
     "embed_dim": 768,
     "image_size": 512,
     "intermediate_size": 3072,
     "num_hidden_layers": 12,
     "patch_size": 16,
     "vision_model_name": "google/siglip2-base-patch16-512"

configuration_modernvbert.py ADDED Viewed

	@@ -0,0 +1,545 @@

+import copy
+import os
+from typing import Any, Dict, Union
+from transformers import AutoConfig
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+DEFAULT_TEXT_MODEL_NAME = "jhu-clsp/ettin-encoder-150m"
+DEFAULT_VISION_MODEL_NAME = "google/siglip2-base-patch16-512"
+def collect_arg_in_candidates(config, candidates, default=None) -> Any:
+    """Return the value of the first candidate name found on ``config``.
+    Each name is looked up first as an attribute and then, when ``config`` supports the ``in``
+    operator, as a key.
+    Args:
+        config: Object or mapping to search.
+        candidates: Names to try, in priority order.
+        default: Value returned when no candidate matches. ``None`` means "no default".
+    Returns:
+        The first matching attribute or item value, or ``default`` when it is not ``None``.
+    Raises:
+        ValueError: If no candidate matches and ``default`` is ``None``.
+    """
+    for c in candidates:
+        if hasattr(config, c):
+            return getattr(config, c)
+        # A config object that does not support ``in`` raises TypeError here; treat that as
+        # "not present" so the remaining candidates (and the default) are still considered.
+        try:
+            present = c in config
+        except TypeError:
+            present = False
+        if present:
+            return config[c]
+    if default is not None:
+        return default
+    raise ValueError(
+        f"No matching arguments found in candidates. Candidates: {candidates}, Config: {config}"
+    )
+class ModernVBertTextConfig(PretrainedConfig):
+    r"""
+    Configuration of the text encoder used inside ModernVBERT. The signature defaults describe the
+    [jhu-clsp/ettin-encoder-150m](https://huggingface.co/jhu-clsp/ettin-encoder-150m) backbone.
+    All fields are forwarded unchanged to [`PretrainedConfig`]; see its documentation for the
+    options shared by every configuration class.
+    Args:
+        text_model_name: Identifier of the text backbone this config describes.
+        hidden_size: Hidden size of the text encoder.
+        num_hidden_layers: Number of encoder layers.
+        intermediate_size: Size of the MLP intermediate projection.
+        mlp_bias: Whether the MLP layers use a bias term.
+        vocab_size: Vocabulary size of the text encoder.
+    """
+    model_type = "modernvbert_text"
+    def __init__(
+        self,
+        text_model_name=DEFAULT_TEXT_MODEL_NAME,
+        hidden_size=768,
+        num_hidden_layers=22,
+        intermediate_size=1152,
+        mlp_bias=False,
+        vocab_size=50368,
+        **kwargs,
+    ):
+        # Gather the explicit fields once, then hand everything to the base initializer.
+        text_fields = {
+            "text_model_name": text_model_name,
+            "hidden_size": hidden_size,
+            "num_hidden_layers": num_hidden_layers,
+            "intermediate_size": intermediate_size,
+            "mlp_bias": mlp_bias,
+            "vocab_size": vocab_size,
+        }
+        super().__init__(**text_fields, **kwargs)
+    @classmethod
+    def from_base_model(
+        cls,
+        text_model_name=DEFAULT_TEXT_MODEL_NAME,
+        **kwargs,
+    ):
+        """Build a text config by reading the fields of an existing backbone's config.
+        The backbone config is loaded with ``AutoConfig.from_pretrained`` (remote code trusted);
+        when it exposes a nested ``text_config``, that nested config is the one inspected.
+        """
+        base_config = AutoConfig.from_pretrained(text_model_name, trust_remote_code=True)
+        base_config = getattr(base_config, "text_config", base_config)
+        # Each field accepts several spellings, tried in order.
+        resolved = {
+            "hidden_size": collect_arg_in_candidates(base_config, ["hidden_size", "embed_dim"]),
+            "num_hidden_layers": collect_arg_in_candidates(
+                base_config, ["num_hidden_layers", "num_hidden_blocks"]
+            ),
+            "intermediate_size": collect_arg_in_candidates(base_config, ["intermediate_size", "mlp_dim"]),
+            "mlp_bias": collect_arg_in_candidates(base_config, ["mlp_bias", "mlp_hidden_bias"], default=False),
+            "vocab_size": collect_arg_in_candidates(base_config, ["vocab_size"]),
+        }
+        return cls(text_model_name=text_model_name, **resolved, **kwargs)
+class ModernVBertVisionConfig(PretrainedConfig):
+    r"""
+    Configuration of the vision encoder used inside ModernVBERT. The signature defaults name the
+    `google/siglip2-base-patch16-512` backbone.
+    All fields are forwarded unchanged to [`PretrainedConfig`]; see its documentation for the
+    options shared by every configuration class. ``hidden_size`` is mapped to ``embed_dim``
+    through ``attribute_map``.
+    Args:
+        vision_model_name: Identifier of the vision backbone this config describes.
+        embed_dim: Embedding dimension of the vision encoder.
+        image_size: Input image resolution.
+        patch_size: Patch size.
+        num_hidden_layers: Number of encoder layers.
+        intermediate_size: Size of the MLP intermediate projection.
+    """
+    model_type = "modernvbert_vision"
+    attribute_map = {
+        "hidden_size": "embed_dim",
+    }
+    def __init__(
+        self,
+        vision_model_name=DEFAULT_VISION_MODEL_NAME,
+        embed_dim=768,
+        image_size=512,
+        patch_size=16,
+        num_hidden_layers=12,
+        intermediate_size=3072,
+        **kwargs,
+    ):
+        # Gather the explicit fields once, then hand everything to the base initializer.
+        vision_fields = {
+            "vision_model_name": vision_model_name,
+            "embed_dim": embed_dim,
+            "image_size": image_size,
+            "patch_size": patch_size,
+            "num_hidden_layers": num_hidden_layers,
+            "intermediate_size": intermediate_size,
+        }
+        super().__init__(**vision_fields, **kwargs)
+    @classmethod
+    def from_base_model(
+        cls,
+        vision_model_name=DEFAULT_VISION_MODEL_NAME,
+        **kwargs,
+    ):
+        """Build a vision config by reading the fields of an existing backbone's config.
+        The backbone config is loaded with ``AutoConfig.from_pretrained`` (remote code trusted);
+        when it exposes a nested ``vision_config``, that nested config is the one inspected.
+        """
+        base_config = AutoConfig.from_pretrained(vision_model_name, trust_remote_code=True)
+        base_config = getattr(base_config, "vision_config", base_config)
+        # Each field accepts several spellings, tried in order.
+        resolved = {
+            "embed_dim": collect_arg_in_candidates(base_config, ["embed_dim", "hidden_size"]),
+            "image_size": collect_arg_in_candidates(base_config, ["image_size", "img_size"]),
+            "patch_size": collect_arg_in_candidates(base_config, ["patch_size"]),
+            "num_hidden_layers": collect_arg_in_candidates(
+                base_config, ["num_hidden_layers", "num_hidden_blocks"]
+            ),
+            "intermediate_size": collect_arg_in_candidates(base_config, ["intermediate_size", "mlp_dim"]),
+        }
+        return cls(vision_model_name=vision_model_name, **resolved, **kwargs)
+class ModernVBertConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a `ModernVBert` model. It is used to
+    instantiate a ModernVBert model according to the specified arguments and defines the model architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs.
+    See the documentation for [`PretrainedConfig`] for more details.
+    Args:
+        text_config (`PretrainedConfig` or `dict`, optional):
+            Custom text config, or a dict of fields passed to `ModernVBertTextConfig.from_dict`. If `None`,
+            the config is derived from the backbone named by `DEFAULT_TEXT_MODEL_NAME`.
+        vision_config (`PretrainedConfig` or `dict`, optional):
+            Custom vision config, or a dict of fields passed to `ModernVBertVisionConfig.from_dict`. If
+            `None`, the config is derived from the backbone named by `DEFAULT_VISION_MODEL_NAME`.
+        image_token_id (`int`, optional, defaults to 128257):
+            Token id reserved for image tokens inserted into the text stream.
+        vocab_size (`int`, optional, defaults to 50368):
+            Vocabulary size used by the text embeddings.
+        use_cache (`bool`, optional, defaults to `True`):
+            Whether to cache key/value tensors for attention (relevant for decoder architectures).
+        tie_word_embeddings (`bool`, optional, defaults to `False`):
+            Whether to tie input token embeddings and output token embeddings.
+        pixel_shuffle_factor (`int`, optional, defaults to 4):
+            Scale factor used by any pixel-shuffle / upsampling operations in the vision head.
+        additional_vocab_size (`int`, optional, defaults to 0):
+            Number of extra tokens appended to the base vocabulary (useful for adapters / special tokens).
+        pad_token_id (`int`, optional):
+            Padding token id.
+        initializer_range (`float`, optional, defaults to 0.02):
+            Stddev used for weight initialization.
+        freeze_config (`Any`, optional):
+            Optional config describing which submodules to freeze during training.
+        use_resampler (`bool`, optional, defaults to `False`):
+            Whether to enable an additional resampler on visual features.
+        neftune_noise_alpha (`float`, optional, defaults to 0.0):
+            Alpha parameter for neftune noise injection.
+    Example:
+    ```python
+    >>> from modernvbert import ModernVBertConfig
+    >>> # Initializing configuration
+    >>> configuration = ModernVBertConfig()
+    >>> # Initializing a model from the configuration (model class is implemented in
+    >>> # `modernvbert.modeling_modernvbert`)
+    >>> # from modernvbert import ModernVBertModel
+    >>> # model = ModernVBertModel(configuration)
+    >>> # Accessing the model configuration
+    >>> # cfg = model.config
+    ```"""
+    model_type = "modernvbert"
+    is_composition = True
+    def __init__(
+        self,
+        text_config: Union[PretrainedConfig, Dict[str, Any]] = None,
+        vision_config: Union[PretrainedConfig, Dict[str, Any]] = None,
+        image_token_id: int = 128_257,
+        vocab_size=50368,
+        use_cache=True,
+        tie_word_embeddings=False,
+        freeze_config=None,
+        pad_token_id=None,
+        initializer_range=0.02,
+        pixel_shuffle_factor=4,
+        use_resampler=False,
+        additional_vocab_size=0,
+        neftune_noise_alpha=0.0,
+        **kwargs,
+    ):
+        self.image_token_id = image_token_id
+        self.use_cache = use_cache
+        self.tie_word_embeddings = tie_word_embeddings
+        # The pixel-shuffle factor is exposed under two names: `scale_factor` and `pixel_shuffle_factor`.
+        self.scale_factor = pixel_shuffle_factor
+        self.additional_vocab_size = additional_vocab_size
+        if text_config is None:
+            # The sub-config constructor takes `text_model_name` as its first positional parameter, so a
+            # loaded config object must not be passed to it directly; `from_base_model` loads the
+            # backbone config and copies its fields instead.
+            text_config = ModernVBertTextConfig.from_base_model(DEFAULT_TEXT_MODEL_NAME)
+        elif isinstance(text_config, dict):
+            text_config = ModernVBertTextConfig.from_dict(text_config)
+        self.text_config = text_config
+        if vision_config is None:
+            # Same reasoning as for the text sub-config above.
+            vision_config = ModernVBertVisionConfig.from_base_model(DEFAULT_VISION_MODEL_NAME)
+        elif isinstance(vision_config, dict):
+            vision_config = ModernVBertVisionConfig.from_dict(vision_config)
+        self.vision_config = vision_config
+        self.freeze_config = freeze_config
+        self.pixel_shuffle_factor = pixel_shuffle_factor
+        self.use_resampler = use_resampler
+        self.neftune_noise_alpha = neftune_noise_alpha
+        self.initializer_range = initializer_range
+        # An explicit `hidden_size` keyword wins; otherwise the text encoder's hidden size is used.
+        hidden_size = kwargs.pop("hidden_size", self.text_config.hidden_size)
+        super().__init__(
+            **kwargs,
+            pad_token_id=pad_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            vocab_size=vocab_size,
+            hidden_size=hidden_size,
+        )
+    def to_dict(self):
+        """Serialize this config to a dict.
+        Returns a deep copy of the instance attributes with `model_type` added and the text and
+        vision sub-configs replaced by the output of their own `to_dict`.
+        """
+        output = copy.deepcopy(self.__dict__)
+        output["model_type"] = self.__class__.model_type
+        output["vision_config"] = self.vision_config.to_dict()
+        output["text_config"] = self.text_config.to_dict()
+        return output
+    @classmethod
+    def from_pretrained_models(
+        cls,
+        text_model_name: Union[str, os.PathLike],
+        vision_model_name: Union[str, os.PathLike],
+        **kwargs,
+    ) -> "PretrainedConfig":
+        """Build a config whose sub-configs are derived from two existing backbones.
+        Args:
+            text_model_name: Text backbone passed to `ModernVBertTextConfig.from_base_model`.
+            vision_model_name: Vision backbone passed to `ModernVBertVisionConfig.from_base_model`.
+            **kwargs: Forwarded to the `ModernVBertConfig` constructor.
+        """
+        text_model_config = ModernVBertTextConfig.from_base_model(text_model_name)
+        vision_model_config = ModernVBertVisionConfig.from_base_model(vision_model_name)
+        return cls(
+            text_config=text_model_config,
+            vision_config=vision_model_config,
+            **kwargs,
+        )
+# NOTE(review): the lines that follow repeat this module's imports and definitions a second time,
+# re-defining the same names; the repeated copy looks like a paste error and should be removed.
+import copy
+import os
+from typing import Any, Dict, Union
+from transformers import AutoConfig
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+DEFAULT_TEXT_MODEL_NAME = "jhu-clsp/ettin-encoder-150m"
+DEFAULT_VISION_MODEL_NAME = "google/siglip2-base-patch16-512"
+def collect_arg_in_candidates(config, candidates, default=None) -> Any:
+    """Return the value of the first candidate name found on ``config``.
+    Each name is looked up first as an attribute and then, when ``config`` supports the ``in``
+    operator, as a key.
+    Args:
+        config: Object or mapping to search.
+        candidates: Names to try, in priority order.
+        default: Value returned when no candidate matches. ``None`` means "no default".
+    Returns:
+        The first matching attribute or item value, or ``default`` when it is not ``None``.
+    Raises:
+        ValueError: If no candidate matches and ``default`` is ``None``.
+    """
+    for c in candidates:
+        if hasattr(config, c):
+            return getattr(config, c)
+        # A config object that does not support ``in`` raises TypeError here; treat that as
+        # "not present" so the remaining candidates (and the default) are still considered.
+        try:
+            present = c in config
+        except TypeError:
+            present = False
+        if present:
+            return config[c]
+    if default is not None:
+        return default
+    raise ValueError(
+        f"No matching arguments found in candidates. Candidates: {candidates}, Config: {config}"
+    )
+class ModernVBertTextConfig(PretrainedConfig):
+    r"""
+    Configuration of the text encoder used inside ModernVBERT. The signature defaults describe the
+    [jhu-clsp/ettin-encoder-150m](https://huggingface.co/jhu-clsp/ettin-encoder-150m) backbone.
+    All fields are forwarded unchanged to [`PretrainedConfig`]; see its documentation for the
+    options shared by every configuration class.
+    Args:
+        text_model_name: Identifier of the text backbone this config describes.
+        hidden_size: Hidden size of the text encoder.
+        num_hidden_layers: Number of encoder layers.
+        intermediate_size: Size of the MLP intermediate projection.
+        mlp_bias: Whether the MLP layers use a bias term.
+        vocab_size: Vocabulary size of the text encoder.
+    """
+    model_type = "modernvbert_text"
+    def __init__(
+        self,
+        text_model_name=DEFAULT_TEXT_MODEL_NAME,
+        hidden_size=768,
+        num_hidden_layers=22,
+        intermediate_size=1152,
+        mlp_bias=False,
+        vocab_size=50368,
+        **kwargs,
+    ):
+        # Gather the explicit fields once, then hand everything to the base initializer.
+        text_fields = {
+            "text_model_name": text_model_name,
+            "hidden_size": hidden_size,
+            "num_hidden_layers": num_hidden_layers,
+            "intermediate_size": intermediate_size,
+            "mlp_bias": mlp_bias,
+            "vocab_size": vocab_size,
+        }
+        super().__init__(**text_fields, **kwargs)
+    @classmethod
+    def from_base_model(
+        cls,
+        text_model_name=DEFAULT_TEXT_MODEL_NAME,
+        **kwargs,
+    ):
+        """Build a text config by reading the fields of an existing backbone's config.
+        The backbone config is loaded with ``AutoConfig.from_pretrained`` (remote code trusted);
+        when it exposes a nested ``text_config``, that nested config is the one inspected.
+        """
+        base_config = AutoConfig.from_pretrained(text_model_name, trust_remote_code=True)
+        base_config = getattr(base_config, "text_config", base_config)
+        # Each field accepts several spellings, tried in order.
+        resolved = {
+            "hidden_size": collect_arg_in_candidates(base_config, ["hidden_size", "embed_dim"]),
+            "num_hidden_layers": collect_arg_in_candidates(
+                base_config, ["num_hidden_layers", "num_hidden_blocks"]
+            ),
+            "intermediate_size": collect_arg_in_candidates(base_config, ["intermediate_size", "mlp_dim"]),
+            "mlp_bias": collect_arg_in_candidates(base_config, ["mlp_bias", "mlp_hidden_bias"], default=False),
+            "vocab_size": collect_arg_in_candidates(base_config, ["vocab_size"]),
+        }
+        return cls(text_model_name=text_model_name, **resolved, **kwargs)
+class ModernVBertVisionConfig(PretrainedConfig):
+    r"""
+    Configuration of the vision encoder used inside ModernVBERT. The signature defaults name the
+    `google/siglip2-base-patch16-512` backbone.
+    All fields are forwarded unchanged to [`PretrainedConfig`]; see its documentation for the
+    options shared by every configuration class. ``hidden_size`` is mapped to ``embed_dim``
+    through ``attribute_map``.
+    Args:
+        vision_model_name: Identifier of the vision backbone this config describes.
+        embed_dim: Embedding dimension of the vision encoder.
+        image_size: Input image resolution.
+        patch_size: Patch size.
+        num_hidden_layers: Number of encoder layers.
+        intermediate_size: Size of the MLP intermediate projection.
+    """
+    model_type = "modernvbert_vision"
+    attribute_map = {
+        "hidden_size": "embed_dim",
+    }
+    def __init__(
+        self,
+        vision_model_name=DEFAULT_VISION_MODEL_NAME,
+        embed_dim=768,
+        image_size=512,
+        patch_size=16,
+        num_hidden_layers=12,
+        intermediate_size=3072,
+        **kwargs,
+    ):
+        # Gather the explicit fields once, then hand everything to the base initializer.
+        vision_fields = {
+            "vision_model_name": vision_model_name,
+            "embed_dim": embed_dim,
+            "image_size": image_size,
+            "patch_size": patch_size,
+            "num_hidden_layers": num_hidden_layers,
+            "intermediate_size": intermediate_size,
+        }
+        super().__init__(**vision_fields, **kwargs)
+    @classmethod
+    def from_base_model(
+        cls,
+        vision_model_name=DEFAULT_VISION_MODEL_NAME,
+        **kwargs,
+    ):
+        """Build a vision config by reading the fields of an existing backbone's config.
+        The backbone config is loaded with ``AutoConfig.from_pretrained`` (remote code trusted);
+        when it exposes a nested ``vision_config``, that nested config is the one inspected.
+        """
+        base_config = AutoConfig.from_pretrained(vision_model_name, trust_remote_code=True)
+        base_config = getattr(base_config, "vision_config", base_config)
+        # Each field accepts several spellings, tried in order.
+        resolved = {
+            "embed_dim": collect_arg_in_candidates(base_config, ["embed_dim", "hidden_size"]),
+            "image_size": collect_arg_in_candidates(base_config, ["image_size", "img_size"]),
+            "patch_size": collect_arg_in_candidates(base_config, ["patch_size"]),
+            "num_hidden_layers": collect_arg_in_candidates(
+                base_config, ["num_hidden_layers", "num_hidden_blocks"]
+            ),
+            "intermediate_size": collect_arg_in_candidates(base_config, ["intermediate_size", "mlp_dim"]),
+        }
+        return cls(vision_model_name=vision_model_name, **resolved, **kwargs)
+class ModernVBertConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a `ModernVBert` model. It is used to
+    instantiate a ModernVBert model according to the specified arguments and defines the model architecture.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs.
+    See the documentation for [`PretrainedConfig`] for more details.
+    Args:
+        text_config (`PretrainedConfig` or `dict`, optional):
+            Custom text config, or a dict of fields passed to `ModernVBertTextConfig.from_dict`. If `None`,
+            the config is derived from the backbone named by `DEFAULT_TEXT_MODEL_NAME`.
+        vision_config (`PretrainedConfig` or `dict`, optional):
+            Custom vision config, or a dict of fields passed to `ModernVBertVisionConfig.from_dict`. If
+            `None`, the config is derived from the backbone named by `DEFAULT_VISION_MODEL_NAME`.
+        image_token_id (`int`, optional, defaults to 128257):
+            Token id reserved for image tokens inserted into the text stream.
+        vocab_size (`int`, optional, defaults to 50368):
+            Vocabulary size used by the text embeddings.
+        use_cache (`bool`, optional, defaults to `True`):
+            Whether to cache key/value tensors for attention (relevant for decoder architectures).
+        tie_word_embeddings (`bool`, optional, defaults to `False`):
+            Whether to tie input token embeddings and output token embeddings.
+        pixel_shuffle_factor (`int`, optional, defaults to 4):
+            Scale factor used by any pixel-shuffle / upsampling operations in the vision head.
+        additional_vocab_size (`int`, optional, defaults to 0):
+            Number of extra tokens appended to the base vocabulary (useful for adapters / special tokens).
+        pad_token_id (`int`, optional):
+            Padding token id.
+        initializer_range (`float`, optional, defaults to 0.02):
+            Stddev used for weight initialization.
+        freeze_config (`Any`, optional):
+            Optional config describing which submodules to freeze during training.
+        use_resampler (`bool`, optional, defaults to `False`):
+            Whether to enable an additional resampler on visual features.
+        neftune_noise_alpha (`float`, optional, defaults to 0.0):
+            Alpha parameter for neftune noise injection.
+    Example:
+    ```python
+    >>> from modernvbert import ModernVBertConfig
+    >>> # Initializing configuration
+    >>> configuration = ModernVBertConfig()
+    >>> # Initializing a model from the configuration (model class is implemented in
+    >>> # `modernvbert.modeling_modernvbert`)
+    >>> # from modernvbert import ModernVBertModel
+    >>> # model = ModernVBertModel(configuration)
+    >>> # Accessing the model configuration
+    >>> # cfg = model.config
+    ```"""
+    model_type = "modernvbert"
+    is_composition = True
+    def __init__(
+        self,
+        text_config: Union[PretrainedConfig, Dict[str, Any]] = None,
+        vision_config: Union[PretrainedConfig, Dict[str, Any]] = None,
+        image_token_id: int = 128_257,
+        vocab_size=50368,
+        use_cache=True,
+        tie_word_embeddings=False,
+        freeze_config=None,
+        pad_token_id=None,
+        initializer_range=0.02,
+        pixel_shuffle_factor=4,
+        use_resampler=False,
+        additional_vocab_size=0,
+        neftune_noise_alpha=0.0,
+        **kwargs,
+    ):
+        self.image_token_id = image_token_id
+        self.use_cache = use_cache
+        self.tie_word_embeddings = tie_word_embeddings
+        # The pixel-shuffle factor is exposed under two names: `scale_factor` and `pixel_shuffle_factor`.
+        self.scale_factor = pixel_shuffle_factor
+        self.additional_vocab_size = additional_vocab_size
+        if text_config is None:
+            # The sub-config constructor takes `text_model_name` as its first positional parameter, so a
+            # loaded config object must not be passed to it directly; `from_base_model` loads the
+            # backbone config and copies its fields instead.
+            text_config = ModernVBertTextConfig.from_base_model(DEFAULT_TEXT_MODEL_NAME)
+        elif isinstance(text_config, dict):
+            text_config = ModernVBertTextConfig.from_dict(text_config)
+        self.text_config = text_config
+        if vision_config is None:
+            # Same reasoning as for the text sub-config above.
+            vision_config = ModernVBertVisionConfig.from_base_model(DEFAULT_VISION_MODEL_NAME)
+        elif isinstance(vision_config, dict):
+            vision_config = ModernVBertVisionConfig.from_dict(vision_config)
+        self.vision_config = vision_config
+        self.freeze_config = freeze_config
+        self.pixel_shuffle_factor = pixel_shuffle_factor
+        self.use_resampler = use_resampler
+        self.neftune_noise_alpha = neftune_noise_alpha
+        self.initializer_range = initializer_range
+        # An explicit `hidden_size` keyword wins; otherwise the text encoder's hidden size is used.
+        hidden_size = kwargs.pop("hidden_size", self.text_config.hidden_size)
+        super().__init__(
+            **kwargs,
+            pad_token_id=pad_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            vocab_size=vocab_size,
+            hidden_size=hidden_size,
+        )
+    def to_dict(self):
+        """Serialize this config to a dict.
+        Returns a deep copy of the instance attributes with `model_type` added and the text and
+        vision sub-configs replaced by the output of their own `to_dict`.
+        """
+        output = copy.deepcopy(self.__dict__)
+        output["model_type"] = self.__class__.model_type
+        output["vision_config"] = self.vision_config.to_dict()
+        output["text_config"] = self.text_config.to_dict()
+        return output
+    @classmethod
+    def from_pretrained_models(
+        cls,
+        text_model_name: Union[str, os.PathLike],
+        vision_model_name: Union[str, os.PathLike],
+        **kwargs,
+    ) -> "PretrainedConfig":
+        """Build a config whose sub-configs are derived from two existing backbones.
+        Args:
+            text_model_name: Text backbone passed to `ModernVBertTextConfig.from_base_model`.
+            vision_model_name: Vision backbone passed to `ModernVBertVisionConfig.from_base_model`.
+            **kwargs: Forwarded to the `ModernVBertConfig` constructor.
+        """
+        text_model_config = ModernVBertTextConfig.from_base_model(text_model_name)
+        vision_model_config = ModernVBertVisionConfig.from_base_model(vision_model_name)
+        return cls(
+            text_config=text_model_config,
+            vision_config=vision_model_config,
+            **kwargs,
+        )

configuration_vbert.py DELETED Viewed

@@ -1,233 +0,0 @@
-import copy
-import os
-from typing import Union, Any, Dict
-from transformers.configuration_utils import PretrainedConfig
-from transformers.utils import logging
-from transformers import CONFIG_MAPPING, AutoConfig
-logger = logging.get_logger(__name__)
-def collect_arg_in_candidates(config, candidates, default = None) -> Any:
-    """ Gets the argument in a config given a list of candidates """
-    for c in candidates:
-        if hasattr(config, c):
-            return getattr(config, c)
-        elif c in config:
-            return config[c]
-    if default is not None:
-        return default
-    raise ValueError("No matching arguments found in candidates. Candidates: {}, Config: {}".format(candidates, config))
-class VBertTextConfig(PretrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
-    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
-    defaults will yield a similar configuration to that of the LLaMA-7B.
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-    Args:
-        embed_dim (`int`, *optional*, defaults to 1152):
-            Dimensionality of the encoder layers and the pooler layer. (elsewhere referred to as `embed_dim`)
-        image_size (`int`, *optional*, defaults to 384):
-            The size (resolution) of each image.
-    """
-    model_type = "vbert"
-    def __init__(
-        self,
-        # Case for when vllama3 is from the hub with no vision_model_name
-        text_model_name="EuroBERT/EuroBERT-210m",
-        **kwargs,
-    ):
-        self.text_model_name = text_model_name
-        text_config = AutoConfig.from_pretrained(text_model_name, trust_remote_code=True)
-        if hasattr(text_config, "text_config"):
-            text_config = text_config.text_config
-        self.hidden_size = collect_arg_in_candidates(text_config, ["hidden_size", "embed_dim"])
-        self.num_hidden_layers = collect_arg_in_candidates(text_config, ["num_hidden_layers", "num_hidden_blocks"])
-        self.intermediate_size = collect_arg_in_candidates(text_config, ["intermediate_size", "mlp_dim"])
-        self.mlp_bias = collect_arg_in_candidates(text_config, ["mlp_bias", "mlp_hidden_bias"], default = False)
-        self.vocab_size = collect_arg_in_candidates(text_config, ["vocab_size"])
-        super().__init__(text_model_name=text_model_name, **kwargs)
-class VBertVisionConfig(PretrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA
-    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
-    defaults will yield a similar configuration to that of the LLaMA-7B.
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-    Args:
-        embed_dim (`int`, *optional*, defaults to 1152):
-            Dimensionality of the encoder layers and the pooler layer. (elsewhere referred to as `embed_dim`)
-        image_size (`int`, *optional*, defaults to 384):
-            The size (resolution) of each image.
-    """
-    model_type = "vbert"
-    attribute_map = {
-        "hidden_size": "embed_dim",
-    }
-    def __init__(
-        self,
-        # Case for when vllama3 is from the hub with no vision_model_name
-        vision_model_name="google/siglip2-base-patch16-512",
-        **kwargs,
-    ):
-        self.vision_model_name = vision_model_name
-        vision_config = AutoConfig.from_pretrained(vision_model_name, trust_remote_code=True)
-        if hasattr(vision_config, "vision_config"):
-            vision_config = vision_config.vision_config
-        self.embed_dim = collect_arg_in_candidates(vision_config, ["embed_dim", "hidden_size"])
-        self.image_size = collect_arg_in_candidates(vision_config, ["image_size", "img_size"])
-        self.patch_size = collect_arg_in_candidates(vision_config, ["patch_size"])
-        self.num_hidden_layers = collect_arg_in_candidates(vision_config, ["num_hidden_layers", "num_hidden_blocks"])
-        self.intermediate_size = collect_arg_in_candidates(vision_config, ["intermediate_size", "mlp_dim"])
-        super().__init__(vision_model_name=vision_model_name, **kwargs)
-class VBertConfig(PretrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of a [`SmolVLMModel`]. It is used to instantiate a
-    SmolVLM model according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the model of the SmolVLM
-    [HuggingFaceTB/SmolVLM2-2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct) architecture.
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-    Args:
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should cache the key/value pairs of the attention mechanism. Only
-            relevant if `config.is_decoder=True`.
-        image_token_id (`int`, *optional*, defaults to 128257):
-            The id of the "image" token.
-        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
-            Whether or not to tie the word embeddings with the token embeddings.
-        vision_config (`IdeficsVisionConfig` or `dict`, *optional*, defaults to `IdeficsVisionConfig`):
-            Custom vision config or dict for the vision tower
-        text_config (`PretrainedConfig` or `dict`, *optional*, defaults to `LlamaConfig`):
-            Custom text config or dict for the text model
-        scale_factor (`int`, *optional*, defaults to 2):
-            The scale factor for the image encoder.
-        pad_token_id (`int`, *optional*, defaults to 128002):
-            The id of the padding token.
-    Example:
-    ```python
-    >>> from transformers import SmolVLMModel, SmolVLMConfig
-    >>> # Initializing configuration
-    >>> configuration = SmolVLMConfig()
-    >>> # Initializing a model from the configuration
-    >>> model = SmolVLMModel(configuration)
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    ```"""
-    model_type = "vbert"
-    is_composition = True
-    # sub_configs = {"text_config": VBertTextConfig, "vision_config": VBertVisionConfig}
-    DEFAULT_TEXT_MODEL_NAME = "EuroBERT/EuroBERT-210m"
-    DEFAULT_VISION_MODEL_NAME = "google/siglip2-base-patch16-512"
-    def __init__(
-        self,
-        text_config: Union[PretrainedConfig, Dict[str, Any]] = None,
-        vision_config: Union[PretrainedConfig, Dict[str, Any]] = None,
-        image_token_id: int = 128_257,
-        vocab_size=128_256,
-        use_cache = True,
-        tie_word_embeddings = False,
-        freeze_config = None,
-        pad_token_id = None,
-        initializer_range = 0.02,
-        pixel_shuffle_factor = 4,
-        use_resampler = False,
-        additional_vocab_size = 0,
-        neftune_noise_alpha = 0.0,
-        **kwargs,
-    ):
-        self.image_token_id = image_token_id
-        self.use_cache = use_cache
-        self.tie_word_embeddings = tie_word_embeddings
-        self.scale_factor = pixel_shuffle_factor
-        self.additional_vocab_size = additional_vocab_size
-        if text_config is None:
-            text_config = AutoConfig.from_pretrained(self.DEFAULT_TEXT_MODEL_NAME, trust_remote_code=True)
-        elif isinstance(text_config, dict):
-            text_config = VBertTextConfig(text_config["text_model_name"])
-        self.text_config = text_config
-        if vision_config is None:
-            vision_config = AutoConfig.from_pretrained(self.DEFAULT_VISION_MODEL_NAME, trust_remote_code=True)
-        elif isinstance(vision_config, dict):
-            vision_config = VBertVisionConfig(vision_config["vision_model_name"])
-        self.vision_config = vision_config
-        self.freeze_config = freeze_config
-        # Pixel shuffle factor
-        self.pixel_shuffle_factor = pixel_shuffle_factor
-        self.use_resampler = use_resampler
-        self.neftune_noise_alpha = neftune_noise_alpha
-        self.initializer_range = initializer_range
-        hidden_size = kwargs.pop("hidden_size", self.text_config.hidden_size)
-        super().__init__(
-            **kwargs,
-            pad_token_id=pad_token_id,
-            tie_word_embeddings=tie_word_embeddings,
-            vocab_size=vocab_size,
-            hidden_size=hidden_size,
-        )
-    def to_dict(self):
-        """
-        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
-        Returns:
-            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
-        """
-        output = copy.deepcopy(self.__dict__)
-        output["model_type"] = self.__class__.model_type
-        output["vision_config"] = self.vision_config.to_dict()
-        output["text_config"] = self.text_config.to_dict()
-        # output["freeze_config"] = self.freeze_config.to_dict()
-        return output
-    # @classmethod
-    # def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
-    #     outputs = super(VBertConfig, cls).from_pretrained(pretrained_model_name_or_path, **kwargs)
-    #     return outputs
-    @classmethod
-    def from_pretrained_models(
-        cls,
-        text_model_name: Union[str, os.PathLike],
-        vision_model_name: Union[str, os.PathLike],
-        **kwargs
-    ) -> "PretrainedConfig":
-        # text_model_config = AutoConfig.from_pretrained(text_model_name, trust_remote_code=True)
-        # vision_model_config = AutoConfig.from_pretrained(vision_model_name, trust_remote_code=True)
-        text_model_config = VBertTextConfig(text_model_name)
-        vision_model_config = VBertVisionConfig(vision_model_name)
-        return cls(
-            text_config=text_model_config,
-            vision_config=vision_model_config,
-            **kwargs
-        )

modeling_vbert.py → modeling_modernvbert.py RENAMED Viewed

@@ -1,25 +1,15 @@
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn import CrossEntropyLoss
-from typing import Optional, Tuple, Union, List
-from transformers.cache_utils import DynamicCache
-from .configuration_vbert import VBertConfig
-from transformers import AutoModel, AutoConfig, AutoModelForMaskedLM, PreTrainedModel
 from transformers.modeling_outputs import BaseModelOutput
 from transformers.models.bert.modeling_bert import BaseModelOutputWithPoolingAndCrossAttentions, MaskedLMOutput
-from typing import List, Optional, Tuple, Union
-import torch
-import torch.utils.checkpoint
-from dataclasses import dataclass
-from transformers import logging
 logger = logging.get_logger(__name__)
@@ -51,6 +41,7 @@ class DecoupledEmbedding(nn.Embedding):
         """
         if padding_idx is not None and padding_idx > num_embeddings:
             raise ValueError(f"padding_idx must be within num_embeddings. Got {padding_idx} and {num_embeddings}")
         super().__init__(
             num_embeddings=num_embeddings,
             embedding_dim=embedding_dim,
@@ -60,7 +51,6 @@ class DecoupledEmbedding(nn.Embedding):
             **kwargs,
         )
         self.num_embeddings = num_embeddings
-        self.padding_idx = padding_idx
         self.num_additional_embeddings = num_additional_embeddings
         self.partially_freeze = partially_freeze
@@ -69,7 +59,7 @@ class DecoupledEmbedding(nn.Embedding):
         if self.num_additional_embeddings > 0:
             self.additional_embedding = nn.Embedding(
-                num_embeddings=self.num_additional_embeddings,
                 embedding_dim=embedding_dim,
                 device=device,
                 dtype=dtype,
@@ -97,9 +87,8 @@ class DecoupledEmbedding(nn.Embedding):
         """
         if self.num_additional_embeddings == 0:
-            return self.additional_embedding(input_ids)
-        # Clone so that we don't modify the original input_ids later on
         input_ids = input_ids.clone()
         additional_vocab_indices = torch.where(input_ids >= self.num_embeddings)
         input_ids_additional_vocab = input_ids[additional_vocab_indices]
@@ -108,37 +97,19 @@ class DecoupledEmbedding(nn.Embedding):
         # for successful lookup replace input_ids with 0, the results of these will be discarded anyway
         input_ids[additional_vocab_indices] = 0
         full_vector = F.embedding(input_ids, self.weight)
-        # overwrite the records with high indices
-        full_vector[additional_vocab_indices] = additional_embeddings
         return full_vector
-    def extra_repr(self) -> str:
-        return "num_embeddings={}, num_additional_embeddings={}, embedding_dim={}, partially_freeze={}".format(
-            self.num_embeddings,
-            self.num_additional_embeddings,
-            self.embedding_dim,
-            self.partially_freeze,
-        )
 @dataclass
-class VBertBaseModelOutput(BaseModelOutput):
     """
-    Base class for SmolVLM model's outputs that may also contain a past key/values (to speed up sequential decoding).
     Args:
         last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
             Sequence of hidden-states at the output of the last layer of the model.
             If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
             hidden_size)` is output.
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
-            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
-            encoder_sequence_length, embed_size_per_head)`.
-            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
-            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
-            input) to speed up sequential decoding.
         hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
             Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
             one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
@@ -153,16 +124,16 @@ class VBertBaseModelOutput(BaseModelOutput):
             sequence_length, hidden_size)`.
             image_hidden_states of the model produced by the vision encoder
     """
     last_hidden_state: torch.FloatTensor = None
     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     attentions: Optional[Tuple[torch.FloatTensor]] = None
     image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
 @dataclass
-class VBertMaskedLMOutput(MaskedLMOutput):
     """
-    Base class for SmolVLM model's outputs that may also contain a past key/values (to speed up sequential decoding).
     Args:
         loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
             Masked language modeling (MLM) loss.
@@ -188,7 +159,9 @@ class VBertMaskedLMOutput(MaskedLMOutput):
     attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
     image_hidden_states: Optional[torch.FloatTensor] = None
-class VBertSimpleMLP(nn.Module):
     def __init__(self, input_size, output_size):
         super().__init__()
         self.proj = nn.Linear(input_size, output_size, bias=False)
@@ -196,13 +169,18 @@ class VBertSimpleMLP(nn.Module):
     def forward(self, x):
         return self.proj(x)
-class VBertConnector(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.scale_factor = config.pixel_shuffle_factor
-        self.modality_projection = VBertSimpleMLP(
             input_size=config.vision_config.hidden_size * (config.scale_factor**2),
-            output_size=config.text_config.hidden_size
         )
     def pixel_shuffle(self, x, scale_factor):
@@ -213,36 +191,25 @@ class VBertConnector(nn.Module):
         x = x.permute(0, 2, 1, 3)
         x = x.reshape(bsz, int(width / scale_factor), int(height / scale_factor), embed_dim * (scale_factor**2))
         x = x.permute(0, 2, 1, 3)
-        x = x.reshape(bsz, int(seq / (scale_factor**2)), embed_dim * (scale_factor**2))
-        return x
     def forward(self, image_hidden_states):
         image_hidden_states = self.pixel_shuffle(image_hidden_states, self.scale_factor)
-        image_hidden_states = self.modality_projection(image_hidden_states)
-        return image_hidden_states
-class VBertPreTrainedModel(PreTrainedModel):
-    config_class = VBertConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["VBertDecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
     _supports_sdpa = True
     _supports_cache_class = True
     def _init_weights(self, module):
-        """Initialize the weights."""
-        std = (
-            self.config.initializer_range
-            if hasattr(self.config, "initializer_range")
-            else self.config.text_config.initializer_range
-        )
-        if hasattr(module, "class_embedding"):
-            module.class_embedding.data.normal_(mean=0.0, std=std)
         if isinstance(module, (nn.Linear, nn.Conv2d)):
             module.weight.data.normal_(mean=0.0, std=std)
             if module.bias is not None:
@@ -252,53 +219,41 @@ class VBertPreTrainedModel(PreTrainedModel):
             if module.padding_idx is not None:
                 module.weight.data[module.padding_idx].zero_()
-class VBertModel(VBertPreTrainedModel):
-    """
-    A subclass of Idefics3Model. We do *not* remove or block the call to inputs_merger
-    in forward. Instead, we override inputs_merger here with custom logic.
-    """
-    def __init__(self, config: VBertConfig, **kwargs):
         super().__init__(config)
-        self.vision_model = VBertModel.init_vision_model(config, **kwargs)
-        self.connector = VBertConnector(config)
-        self.text_model = VBertModel.init_language_model(config, **kwargs)
         self.image_seq_len = int(
             ((config.vision_config.image_size // config.vision_config.patch_size) ** 2) / (config.scale_factor**2)
         )
-        self.image_token_id = self.config.image_token_id
         self.post_init()
     @staticmethod
-    def init_vision_model(config: VBertConfig, **kwargs):
         vision_model_config = AutoConfig.from_pretrained(
             config.vision_config.vision_model_name,
-            trust_remote_code=True,
             **kwargs,
         )
         vision_model = AutoModel.from_config(vision_model_config, trust_remote_code=True, **kwargs)
-        if hasattr(vision_model, "vision_model"):
-            # If the model has a vision_model attribute, it means it's a wrapper around another model
-            vision_model = vision_model.vision_model
-        return vision_model
     @staticmethod
-    def init_language_model(config: VBertConfig, **kwargs):
         text_model_config = AutoConfig.from_pretrained(
             config.text_config.text_model_name,
             trust_remote_code=True,
             **kwargs,
         )
         text_model = AutoModel.from_config(text_model_config, trust_remote_code=True, **kwargs)
-        # extractor = regex_lookup(language_model_name, language_model_name2model)
         embed_layer = DecoupledEmbedding(
             num_embeddings=text_model_config.vocab_size,
             num_additional_embeddings=config.additional_vocab_size,
@@ -306,11 +261,9 @@ class VBertModel(VBertPreTrainedModel):
             partially_freeze=config.freeze_config["freeze_text_layers"],
             padding_idx=config.pad_token_id,
         )
         text_model.set_input_embeddings(embed_layer)
         return text_model
     def enable_input_require_grads(self):
         """
         Enables the gradients for the input embeddings.
@@ -337,20 +290,15 @@ class VBertModel(VBertPreTrainedModel):
             make_inputs_require_grads
         )
-    def disable_input_require_grads(self):
-        self._text_require_grads_hook.remove()
-        self._vision_require_grads_hook.remove()
     def get_input_embeddings(self):
         return self.text_model.get_input_embeddings()
     def set_input_embeddings(self, value):
         self.text_model.set_input_embeddings(value)
-    def inputs_merger(
-        self, input_ids: torch.LongTensor, inputs_embeds: torch.Tensor, image_hidden_states: torch.Tensor
-    ):
-        """
         This method aims at merging the token embeddings with the image hidden states into one single sequence of vectors that are fed to the transformer LM.
         The merging happens as follows:
         - The text token sequence is: `tok_1 tok_2 tok_3 <fake_token_around_image> <image> <image> ... <image> <fake_token_around_image> tok_4`.
@@ -359,135 +307,57 @@ class VBertModel(VBertPreTrainedModel):
         - The merging happens so that we obtain the following sequence: `vector_tok_1 vector_tok_2 vector_tok_3 vector_fake_tok_around_image {sequence of image_seq_len image hidden states} vector_fake_toke_around_image vector_tok_4`. That sequence is fed to the LM.
         - To fit the format of that sequence, `input_ids`, `input_embeds`, `attention_mask` are all 3 adapted to insert the image hidden states.
         """
-        _, patch_size, _ = image_hidden_states.shape
         image_mask = input_ids == self.image_token_id
         num_image_tokens = image_mask.sum(dim=1)
         if not torch.all(num_image_tokens % patch_size == 0):
-            raise ValueError("At least one sample has <image> tokens not divisible by patch_size.")
         blocks_per_sample = num_image_tokens // patch_size
         offsets = torch.nn.functional.pad(blocks_per_sample.cumsum(dim=0), (1, 0), value=0)
         block_offset = offsets[:-1]
         row_cum = image_mask.cumsum(dim=-1)
         chunk_idx = (row_cum - 1) // patch_size
         local_idx = (row_cum - 1) % patch_size
         block_idx = block_offset.unsqueeze(1) + chunk_idx
         image_embeds = torch.zeros_like(inputs_embeds)
         image_embeds[image_mask] = image_hidden_states[block_idx[image_mask], local_idx[image_mask], :]
-        merged_embeds = torch.where(image_mask.unsqueeze(-1), image_embeds, inputs_embeds)
-        return merged_embeds
     def forward(
         self,
         input_ids: torch.LongTensor = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         pixel_values: Optional[torch.FloatTensor] = None,
         pixel_attention_mask: Optional[torch.BoolTensor] = None,
         image_hidden_states: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
-        cache_position: Optional[torch.LongTensor] = None,
-    ) -> Union[Tuple, BaseModelOutputWithPoolingAndCrossAttentions]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        if self.training and self.text_model.gradient_checkpointing and use_cache:
-            logger.warning_once(
-                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-            )
-            use_cache = False
-        # retrieve input_ids and inputs_embeds
-        if input_ids is not None:
-            batch_size, seq_length = input_ids.shape
-        elif inputs_embeds is not None:
-            batch_size, seq_length, _ = inputs_embeds.shape
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-        past_seen_tokens = 0
-        if use_cache:
-            if past_key_values is None:
-                past_key_values = DynamicCache()
-            past_seen_tokens = past_key_values.get_seq_length()
-        if inputs_embeds is not None and input_ids is None and past_seen_tokens == 0:
-            raise ValueError("When first calling the model, if input_embeds are passed, input_ids should not be None.")
         if inputs_embeds is None:
             inputs_embeds = self.text_model.get_input_embeddings()(input_ids).to(input_ids.device)
-        # START VISUAL INPUTS INTEGRATION
-        if pixel_values is not None and image_hidden_states is not None:
-            raise ValueError("You cannot specify both pixel_values and image_hidden_states at the same time")
-        elif pixel_values is not None:
-            batch_size, num_images, num_channels, height, width = pixel_values.shape
-            pixel_values = pixel_values
             pixel_values = pixel_values.view(batch_size * num_images, *pixel_values.shape[2:])
-            # Remove padding images - padding images are full 0.
             nb_values_per_image = pixel_values.shape[1:].numel()
             real_images_inds = (pixel_values == 0.0).sum(dim=(-1, -2, -3)) != nb_values_per_image
             if not any(real_images_inds):
-                # no images, leave one empty image.
                 real_images_inds[0] = True
             pixel_values = pixel_values[real_images_inds].contiguous()
-            # Handle the vision attention mask
-            if pixel_attention_mask is None:
-                pixel_attention_mask = torch.ones(
-                    size=[pixel_values.shape[i] for i in (0, 2, 3)],
-                    dtype=torch.bool,
-                    device=pixel_values.device,
-                )
-            else:
-                # Remove padding images from the mask
-                pixel_attention_mask = pixel_attention_mask.view(
-                    batch_size * num_images, *pixel_attention_mask.shape[2:]
-                )
-                pixel_attention_mask = pixel_attention_mask[real_images_inds].contiguous()
-            # patch_size = self.config.vision_config.patch_size
-            # patches_subgrid = pixel_attention_mask.unfold(dimension=1, size=patch_size, step=patch_size)
-            # patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size)
-            # patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
-            # Get sequence from the vision encoder
-            image_hidden_states = self.vision_model(
-                pixel_values=pixel_values,
-                # patch_attention_mask=patch_attention_mask,
-            ).last_hidden_state
-            # Modality projection & resampling
             image_hidden_states = self.connector(image_hidden_states)
         elif image_hidden_states is not None:
             image_hidden_states = image_hidden_states.to(dtype=self.dtype, device=input_ids.device)
         if inputs_embeds is not None and image_hidden_states is not None:
-            # When we embed, we don't want to replace the potential image_token_id that we generated by images
-            # that simply don't exist
-            inputs_embeds = self.inputs_merger(
-                input_ids=input_ids,
-                inputs_embeds=inputs_embeds,
-                image_hidden_states=image_hidden_states,
-            )
         outputs = self.text_model(
             inputs_embeds=inputs_embeds,
             attention_mask=attention_mask,
@@ -495,138 +365,88 @@ class VBertModel(VBertPreTrainedModel):
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
-            # past_key_values=past_key_values,
-            # use_cache=use_cache,
-            # cache_position=cache_position,
         )
         if not return_dict:
             return tuple(v for v in [*outputs, image_hidden_states] if v is not None)
-        return VBertBaseModelOutput(
             last_hidden_state=outputs.last_hidden_state,
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
             image_hidden_states=image_hidden_states,
         )
-class VBertLMHead(nn.Module):
     def __init__(self, config, **kwargs):
         super().__init__()
-        pretrained_config = AutoConfig.from_pretrained(
-            config.text_config.text_model_name,
-            trust_remote_code=True,
-            **kwargs,
-        )
         pretrained_model = AutoModelForMaskedLM.from_config(pretrained_config, trust_remote_code=True, **kwargs)
         self.head = pretrained_model.head
         self.decoder = pretrained_model.decoder
     def forward(self, hidden_states):
-        hidden_states = self.head(hidden_states)
-        hidden_states = self.decoder(hidden_states)
-        return hidden_states
-class VBertForMaskedLM(VBertPreTrainedModel):
-    # _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"]
     def __init__(self, config, **kwargs):
         super().__init__(config)
         self.image_token_id = config.image_token_id
         self.in_features = config.hidden_size
         self.out_additional_features = config.additional_vocab_size
         self.vocab_size = config.vocab_size
-        if config.is_decoder:
-            logger.warning(
-                "If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for "
-                "bi-directional self-attention."
-            )
-        self.model = VBertModel(config, **kwargs)
-        self.lm_head = VBertLMHead(config, **kwargs)
         if self.out_additional_features > 0:
-            self.additional_fc = nn.Linear(
-                in_features=self.in_features,
-                out_features=self.out_additional_features,
-                bias=False,
-            )
-        # Initialize weights and apply final processing
         self.post_init()
     def forward(
-            self,
-            input_ids: torch.LongTensor = None,
-            attention_mask: Optional[torch.Tensor] = None,
-            position_ids: Optional[torch.LongTensor] = None,
-            past_key_values: Optional[List[torch.FloatTensor]] = None,
-            inputs_embeds: Optional[torch.FloatTensor] = None,
-            pixel_values: Optional[torch.FloatTensor] = None,
-            pixel_attention_mask: Optional[torch.BoolTensor] = None,
-            image_hidden_states: Optional[torch.FloatTensor] = None,
-            labels: Optional[torch.LongTensor] = None,
-            use_cache: Optional[bool] = None,
-            output_attentions: Optional[bool] = None,
-            output_hidden_states: Optional[bool] = None,
-            return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, VBertMaskedLMOutput]:
-        r"""
-        Args:
-            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-                config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `Idefics3ForConditionalGeneration`).
-                Tokens with indices set to `model.image_token_id` are ignored (masked), the loss is only
-                computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        # Pass the inputs to VBertModel
         outputs = self.model(
             input_ids=input_ids,
             attention_mask=attention_mask,
             position_ids=position_ids,
-            past_key_values=past_key_values,
             inputs_embeds=inputs_embeds,
             pixel_values=pixel_values,
             pixel_attention_mask=pixel_attention_mask,
             image_hidden_states=image_hidden_states,
-            use_cache=use_cache,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
         )
-        # Pass the outputs to the MLM head
         hidden_states = outputs[0]
         logits = self.lm_head(hidden_states)
         if self.out_additional_features > 0:
             proj_states = self.lm_head.head(hidden_states)
             additional_features = self.additional_fc(proj_states)
             logits = torch.cat((logits, additional_features), -1)
-        logits = logits.float()
-        masked_lm_loss = None
         if labels is not None:
-            # print the ratio of not ignored tokens
-            loss_fct = CrossEntropyLoss()
-            masked_lm_loss = loss_fct(logits.view(-1, self.vocab_size + self.out_additional_features), labels.view(-1))
         if not return_dict:
             output = (logits,) + outputs[2:]
-            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
-        return VBertMaskedLMOutput(
-            loss=masked_lm_loss,
-            logits=logits,
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
             image_hidden_states=outputs.image_hidden_states,

+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.nn import CrossEntropyLoss
+from transformers import AutoConfig, AutoModel, AutoModelForMaskedLM, PreTrainedModel, logging
 from transformers.modeling_outputs import BaseModelOutput
 from transformers.models.bert.modeling_bert import BaseModelOutputWithPoolingAndCrossAttentions, MaskedLMOutput
+from .configuration_modernvbert import ModernVBertConfig
 logger = logging.get_logger(__name__)
         """
         if padding_idx is not None and padding_idx > num_embeddings:
             raise ValueError(f"padding_idx must be within num_embeddings. Got {padding_idx} and {num_embeddings}")
         super().__init__(
             num_embeddings=num_embeddings,
             embedding_dim=embedding_dim,
             **kwargs,
         )
         self.num_embeddings = num_embeddings
         self.num_additional_embeddings = num_additional_embeddings
         self.partially_freeze = partially_freeze
         if self.num_additional_embeddings > 0:
             self.additional_embedding = nn.Embedding(
+                num_embeddings=num_additional_embeddings,
                 embedding_dim=embedding_dim,
                 device=device,
                 dtype=dtype,
         """
         if self.num_additional_embeddings == 0:
+            return super().forward(input_ids)
         input_ids = input_ids.clone()
         additional_vocab_indices = torch.where(input_ids >= self.num_embeddings)
         input_ids_additional_vocab = input_ids[additional_vocab_indices]
         # for successful lookup replace input_ids with 0, the results of these will be discarded anyway
         input_ids[additional_vocab_indices] = 0
         full_vector = F.embedding(input_ids, self.weight)
+        full_vector[additional_vocab_indices] = additional_embeddings      # overwrite the records with high indices
         return full_vector
 @dataclass
+class ModernVBertBaseModelOutput(BaseModelOutput):
     """
+    Base class for ModernVBERT model's outputs, which may also contain the image hidden states produced by the vision encoder.
     Args:
         last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
             Sequence of hidden-states at the output of the last layer of the model.
             If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
             hidden_size)` is output.
         hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
             Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
             one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
             sequence_length, hidden_size)`.
             image_hidden_states of the model produced by the vision encoder
     """
     last_hidden_state: torch.FloatTensor = None
     hidden_states: Optional[Tuple[torch.FloatTensor]] = None
     attentions: Optional[Tuple[torch.FloatTensor]] = None
     image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
 @dataclass
+class ModernVBertMaskedLMOutput(MaskedLMOutput):
     """
+    Output class for ModernVBERT masked language modeling, which may also contain the image hidden states.
     Args:
         loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
             Masked language modeling (MLM) loss.
     attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
     image_hidden_states: Optional[torch.FloatTensor] = None
+class ModernVBertSimpleMLP(nn.Module):
+    """A simple linear projection layer to project the vision hidden states to the text hidden states."""
     def __init__(self, input_size, output_size):
         super().__init__()
         self.proj = nn.Linear(input_size, output_size, bias=False)
     def forward(self, x):
         return self.proj(x)
+class ModernVBertConnector(nn.Module):
+    """
+    Connector module for ModernVBERT. It performs a pixel shuffle operation followed by a linear projection to match the text model's hidden size.
+    Based on https://pytorch.org/docs/stable/generated/torch.nn.PixelShuffle.html
+    """
     def __init__(self, config):
         super().__init__()
         self.scale_factor = config.pixel_shuffle_factor
+        self.modality_projection = ModernVBertSimpleMLP(
             input_size=config.vision_config.hidden_size * (config.scale_factor**2),
+            output_size=config.text_config.hidden_size,
         )
     def pixel_shuffle(self, x, scale_factor):
         x = x.permute(0, 2, 1, 3)
         x = x.reshape(bsz, int(width / scale_factor), int(height / scale_factor), embed_dim * (scale_factor**2))
         x = x.permute(0, 2, 1, 3)
+        return x.reshape(bsz, int(seq / (scale_factor**2)), embed_dim * (scale_factor**2))
     def forward(self, image_hidden_states):
         image_hidden_states = self.pixel_shuffle(image_hidden_states, self.scale_factor)
+        return self.modality_projection(image_hidden_states)
+class ModernVBertPreTrainedModel(PreTrainedModel):
+    config_class = ModernVBertConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
+    _no_split_modules = ["ModernVBertDecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
     _supports_sdpa = True
     _supports_cache_class = True
     def _init_weights(self, module):
+        std = getattr(self.config, "initializer_range", 0.02)
         if isinstance(module, (nn.Linear, nn.Conv2d)):
             module.weight.data.normal_(mean=0.0, std=std)
             if module.bias is not None:
             if module.padding_idx is not None:
                 module.weight.data[module.padding_idx].zero_()
+class ModernVBertModel(ModernVBertPreTrainedModel):
+    def __init__(self, config: ModernVBertConfig, **kwargs):
         super().__init__(config)
+        self.vision_model = ModernVBertModel.init_vision_model(config, **kwargs)
+        self.connector = ModernVBertConnector(config)
+        self.text_model = ModernVBertModel.init_language_model(config, **kwargs)
         self.image_seq_len = int(
             ((config.vision_config.image_size // config.vision_config.patch_size) ** 2) / (config.scale_factor**2)
         )
+        self.image_token_id = config.image_token_id
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
         self.post_init()
     @staticmethod
+    def init_vision_model(config: ModernVBertConfig, **kwargs):
         vision_model_config = AutoConfig.from_pretrained(
             config.vision_config.vision_model_name,
+            _attn_implementation=config._attn_implementation,
+            dtype=config.torch_dtype,
             **kwargs,
         )
         vision_model = AutoModel.from_config(vision_model_config, trust_remote_code=True, **kwargs)
+        return getattr(vision_model, "vision_model", vision_model)
     @staticmethod
     def init_language_model(config: ModernVBertConfig, **kwargs):
         """Instantiate the text backbone and replace its input embeddings.

         The backbone configuration is fetched with `AutoConfig.from_pretrained`
         for `config.text_config.text_model_name`, the model is built from it with
         `AutoModel.from_config`, and its input embedding layer is then replaced
         by a `DecoupledEmbedding` that adds `config.additional_vocab_size` extra
         entries on top of the backbone vocabulary.

         Args:
             config: Composite model configuration.
             **kwargs: Forwarded to both `AutoConfig.from_pretrained` and
                 `AutoModel.from_config`.

         Returns:
             The text model with the decoupled embedding layer installed.
         """
         encoder_config = AutoConfig.from_pretrained(
             config.text_config.text_model_name,
             _attn_implementation=config._attn_implementation,
             dtype=config.torch_dtype,
             trust_remote_code=True,
             **kwargs,
         )
         encoder = AutoModel.from_config(encoder_config, trust_remote_code=True, **kwargs)

         # Built after the encoder so module construction order is unchanged.
         decoupled_embeddings = DecoupledEmbedding(
             num_embeddings=encoder_config.vocab_size,
             num_additional_embeddings=config.additional_vocab_size,
             partially_freeze=config.freeze_config["freeze_text_layers"],
             padding_idx=config.pad_token_id,
         )
         encoder.set_input_embeddings(decoupled_embeddings)

         return encoder
     def enable_input_require_grads(self):
         """
         Enables the gradients for the input embeddings.
             make_inputs_require_grads
         )
     def get_input_embeddings(self):
         """Return the input embedding module of the wrapped text model."""
         text_backbone = self.text_model
         return text_backbone.get_input_embeddings()
     def set_input_embeddings(self, value):
         """Install `value` as the input embedding module of the wrapped text model."""
         text_backbone = self.text_model
         text_backbone.set_input_embeddings(value)
+    def inputs_merger(self, input_ids, inputs_embeds, image_hidden_states):
+        """Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/smolvlm/modeling_smolvlm.py
         This method aims at merging the token embeddings with the image hidden states into one single sequence of vectors that are fed to the transformer LM.
         The merging happens as follows:
         - The text token sequence is: `tok_1 tok_2 tok_3 <fake_token_around_image> <image> <image> ... <image> <fake_token_around_image> tok_4`.
         - The merging happens so that we obtain the following sequence: `vector_tok_1 vector_tok_2 vector_tok_3 vector_fake_tok_around_image {sequence of image_seq_len image hidden states} vector_fake_toke_around_image vector_tok_4`. That sequence is fed to the LM.
         - To fit the format of that sequence, `input_ids`, `input_embeds`, `attention_mask` are all 3 adapted to insert the image hidden states.
         """
+        _, patch_size, _ = image_hidden_states.shape
         image_mask = input_ids == self.image_token_id
         num_image_tokens = image_mask.sum(dim=1)
         if not torch.all(num_image_tokens % patch_size == 0):
+            raise ValueError("Number of <image> tokens not divisible by patch_size.")
         blocks_per_sample = num_image_tokens // patch_size
         offsets = torch.nn.functional.pad(blocks_per_sample.cumsum(dim=0), (1, 0), value=0)
         block_offset = offsets[:-1]
         row_cum = image_mask.cumsum(dim=-1)
         chunk_idx = (row_cum - 1) // patch_size
         local_idx = (row_cum - 1) % patch_size
         block_idx = block_offset.unsqueeze(1) + chunk_idx
         image_embeds = torch.zeros_like(inputs_embeds)
         image_embeds[image_mask] = image_hidden_states[block_idx[image_mask], local_idx[image_mask], :]
+        return torch.where(image_mask.unsqueeze(-1), image_embeds, inputs_embeds)
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         pixel_values: Optional[torch.FloatTensor] = None,
         pixel_attention_mask: Optional[torch.BoolTensor] = None,
         image_hidden_states: Optional[torch.FloatTensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple, ModernVBertBaseModelOutput]:
         """Encode text, with image features merged in at `<image>` positions.

         Args:
             input_ids: Token ids. Required unless `inputs_embeds` is given, and
                 always required when image inputs are supplied, because the
                 `<image>` positions are located through them.
             attention_mask: Passed through to the text model.
             position_ids: Accepted for interface compatibility; not used by
                 this method.
             inputs_embeds: Pre-computed text embeddings; when omitted they are
                 looked up from `input_ids`.
             pixel_values: Images with five dimensions, the first two being
                 (batch, images per sample). Images that are entirely zero are
                 treated as padding and dropped before the vision model runs.
             pixel_attention_mask: Accepted for interface compatibility; not
                 used by this method.
             image_hidden_states: Pre-computed image features, used only when
                 `pixel_values` is not given.
             output_attentions: Defaults to `config.output_attentions`.
             output_hidden_states: Defaults to `config.output_hidden_states`.
             return_dict: Defaults to `config.use_return_dict`.

         Returns:
             A `ModernVBertBaseModelOutput`, or when `return_dict` is false a
             tuple of the text model outputs followed by the image hidden
             states, with `None` entries removed.

         Raises:
             ValueError: If neither `input_ids` nor `inputs_embeds` is given, or
                 if image inputs are given without `input_ids`.
         """
         if output_attentions is None:
             output_attentions = self.config.output_attentions
         if output_hidden_states is None:
             output_hidden_states = self.config.output_hidden_states
         if return_dict is None:
             return_dict = self.config.use_return_dict

         # Validate up front instead of failing later on a missing `input_ids`.
         if input_ids is None and inputs_embeds is None:
             raise ValueError("You have to specify either input_ids or inputs_embeds.")
         has_image_inputs = pixel_values is not None or image_hidden_states is not None
         if has_image_inputs and input_ids is None:
             raise ValueError(
                 "input_ids are required to locate <image> tokens when pixel_values "
                 "or image_hidden_states are provided."
             )

         if inputs_embeds is None:
             inputs_embeds = self.text_model.get_input_embeddings()(input_ids).to(input_ids.device)

         if pixel_values is not None:
             # Fold (batch, images per sample) into one leading image dimension.
             batch_size, num_images, _, _, _ = pixel_values.shape
             pixel_values = pixel_values.view(batch_size * num_images, *pixel_values.shape[2:])

             # An image whose values are all zero is padding.
             nb_values_per_image = pixel_values.shape[1:].numel()
             real_images_inds = (pixel_values == 0.0).sum(dim=(-1, -2, -3)) != nb_values_per_image
             if not real_images_inds.any():
                 # Keep one image so the vision model never receives an empty batch.
                 real_images_inds[0] = True
             pixel_values = pixel_values[real_images_inds].contiguous()

             image_hidden_states = self.vision_model(pixel_values=pixel_values).last_hidden_state
             image_hidden_states = self.connector(image_hidden_states)
         elif image_hidden_states is not None:
             image_hidden_states = image_hidden_states.to(dtype=self.dtype, device=input_ids.device)

         if image_hidden_states is not None:
             inputs_embeds = self.inputs_merger(input_ids, inputs_embeds, image_hidden_states)

         outputs = self.text_model(
             inputs_embeds=inputs_embeds,
             attention_mask=attention_mask,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
         )

         if not return_dict:
             return tuple(v for v in [*outputs, image_hidden_states] if v is not None)

         return ModernVBertBaseModelOutput(
             last_hidden_state=outputs.last_hidden_state,
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
             image_hidden_states=image_hidden_states,
         )
class ModernVBertLMHead(nn.Module):
    """Masked-LM prediction head taken from the text backbone's masked-LM model.

    A masked-LM model is instantiated from the configuration of
    `config.text_config.text_model_name`, and only its `head` and `decoder`
    sub-modules are kept.
    """

    def __init__(self, config, **kwargs):
        """Build the donor masked-LM model and keep its `head` and `decoder`.

        Args:
            config: Model configuration; `text_config.text_model_name` is read.
            **kwargs: Forwarded to `AutoConfig.from_pretrained` and
                `AutoModelForMaskedLM.from_config`.
        """
        super().__init__()
        donor_config = AutoConfig.from_pretrained(
            config.text_config.text_model_name,
            trust_remote_code=True,
            **kwargs,
        )
        donor_model = AutoModelForMaskedLM.from_config(donor_config, trust_remote_code=True, **kwargs)
        # Registered in the same order as before: `head` first, then `decoder`.
        self.head, self.decoder = donor_model.head, donor_model.decoder

    def forward(self, hidden_states):
        """Apply `head` and then `decoder` to `hidden_states`."""
        projected = self.head(hidden_states)
        return self.decoder(projected)
+class ModernVBertForMaskedLM(ModernVBertPreTrainedModel):
     def __init__(self, config, **kwargs):
         """Build the backbone, the LM head and the optional extra-vocabulary projection.

         Args:
             config: Model configuration; `image_token_id`, `hidden_size`,
                 `additional_vocab_size` and `vocab_size` are read here.
             **kwargs: Forwarded to `ModernVBertModel` and `ModernVBertLMHead`.
         """
         super().__init__(config)

         # Plain attributes copied from the configuration.
         self.image_token_id = config.image_token_id
         self.vocab_size = config.vocab_size
         self.in_features = config.hidden_size
         self.out_additional_features = config.additional_vocab_size

         # Sub-modules are created in this order: backbone, then LM head.
         self.model = ModernVBertModel(config, **kwargs)
         self.lm_head = ModernVBertLMHead(config, **kwargs)

         # Bias-free projection for the additional vocabulary, created only when it is non-empty.
         extra_vocab = self.out_additional_features
         if extra_vocab > 0:
             self.additional_fc = nn.Linear(self.in_features, extra_vocab, bias=False)

         self.post_init()
     def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        pixel_attention_mask: Optional[torch.BoolTensor] = None,
+        image_hidden_states: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[torch.LongTensor] = None,
+    ) -> Union[Tuple, ModernVBertMaskedLMOutput]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
         )
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         outputs = self.model(
             input_ids=input_ids,
             attention_mask=attention_mask,
             position_ids=position_ids,
             inputs_embeds=inputs_embeds,
             pixel_values=pixel_values,
             pixel_attention_mask=pixel_attention_mask,
             image_hidden_states=image_hidden_states,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
         )
         hidden_states = outputs[0]
         logits = self.lm_head(hidden_states)
         if self.out_additional_features > 0:
             proj_states = self.lm_head.head(hidden_states)
             additional_features = self.additional_fc(proj_states)
             logits = torch.cat((logits, additional_features), -1)
+        loss = None
         if labels is not None:
+            loss = CrossEntropyLoss()(logits.view(-1, self.vocab_size + self.out_additional_features), labels.view(-1))
         if not return_dict:
             output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+        return ModernVBertMaskedLMOutput(
+            loss=loss,
+            logits=logits.float(),
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
             image_hidden_states=outputs.image_hidden_states,