Commit ·
1f31f46
1
Parent(s): 8d33f67
trying the autoconfig
Browse files- config.json +1 -1
- configuration_img2html.py +12 -14
- modeling_img2html.py +8 -8
config.json
CHANGED
|
@@ -10,7 +10,7 @@
|
|
| 10 |
],
|
| 11 |
"attention_dropout": 0.0,
|
| 12 |
"auto_map": {
|
| 13 |
-
"
|
| 14 |
"AutoModelForCausalLM": "modeling_img2html.Img2HTMLForVisionText2Text"
|
| 15 |
},
|
| 16 |
"bos_token_id": 1,
|
|
|
|
| 10 |
],
|
| 11 |
"attention_dropout": 0.0,
|
| 12 |
"auto_map": {
|
| 13 |
+
"AutoConfig": "configuration_img2html.Img2HTMLConfig",
|
| 14 |
"AutoModelForCausalLM": "modeling_img2html.Img2HTMLForVisionText2Text"
|
| 15 |
},
|
| 16 |
"bos_token_id": 1,
|
configuration_img2html.py
CHANGED
|
@@ -24,16 +24,15 @@ MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
|
| 24 |
}
|
| 25 |
|
| 26 |
|
| 27 |
-
class
|
| 28 |
r"""
|
| 29 |
"""
|
| 30 |
-
model_type = "
|
| 31 |
|
| 32 |
def __init__(
|
| 33 |
self,
|
| 34 |
hidden_size=768,
|
| 35 |
intermediate_size=3072,
|
| 36 |
-
projection_dim=512,
|
| 37 |
num_hidden_layers=12,
|
| 38 |
num_attention_heads=12,
|
| 39 |
num_channels=3,
|
|
@@ -51,7 +50,6 @@ class VMistralVisionConfig(PretrainedConfig):
|
|
| 51 |
|
| 52 |
self.hidden_size = hidden_size
|
| 53 |
self.intermediate_size = intermediate_size
|
| 54 |
-
self.projection_dim = projection_dim
|
| 55 |
self.num_hidden_layers = num_hidden_layers
|
| 56 |
self.num_attention_heads = num_attention_heads
|
| 57 |
self.num_channels = num_channels
|
|
@@ -65,7 +63,7 @@ class VMistralVisionConfig(PretrainedConfig):
|
|
| 65 |
self._flash_attn_2_enabled = _flash_attn_2_enabled
|
| 66 |
|
| 67 |
|
| 68 |
-
class
|
| 69 |
r"""
|
| 70 |
This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate an
|
| 71 |
Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration
|
|
@@ -91,7 +89,7 @@ class VMistralPerceiverConfig(PretrainedConfig):
|
|
| 91 |
qk_layer_norms_perceiver (`bool`, *optional*, defaults to `False`):
|
| 92 |
Whether or not to use qk layer norms in perceiver
|
| 93 |
"""
|
| 94 |
-
model_type = "
|
| 95 |
|
| 96 |
def __init__(
|
| 97 |
self,
|
|
@@ -111,7 +109,7 @@ class VMistralPerceiverConfig(PretrainedConfig):
|
|
| 111 |
super().__init__(**kwargs)
|
| 112 |
|
| 113 |
|
| 114 |
-
class
|
| 115 |
r"""
|
| 116 |
This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate an
|
| 117 |
Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration
|
|
@@ -203,7 +201,7 @@ class VMistralConfig(PretrainedConfig):
|
|
| 203 |
>>> # Accessing the model configuration
|
| 204 |
>>> configuration = model.config
|
| 205 |
```"""
|
| 206 |
-
model_type = "
|
| 207 |
is_composition = False
|
| 208 |
|
| 209 |
def __init__(
|
|
@@ -282,17 +280,17 @@ class VMistralConfig(PretrainedConfig):
|
|
| 282 |
self.attention_dropout = attention_dropout
|
| 283 |
|
| 284 |
if perceiver_config is None:
|
| 285 |
-
self.perceiver_config =
|
| 286 |
elif isinstance(perceiver_config, dict):
|
| 287 |
-
self.perceiver_config =
|
| 288 |
-
elif isinstance(perceiver_config,
|
| 289 |
self.perceiver_config = perceiver_config
|
| 290 |
|
| 291 |
if vision_config is None:
|
| 292 |
-
self.vision_config =
|
| 293 |
elif isinstance(vision_config, dict):
|
| 294 |
-
self.vision_config =
|
| 295 |
-
elif isinstance(vision_config,
|
| 296 |
self.vision_config = vision_config
|
| 297 |
|
| 298 |
super().__init__(
|
|
|
|
| 24 |
}
|
| 25 |
|
| 26 |
|
| 27 |
+
class Img2HTMLVisionConfig(PretrainedConfig):
|
| 28 |
r"""
|
| 29 |
"""
|
| 30 |
+
model_type = "img2html"
|
| 31 |
|
| 32 |
def __init__(
|
| 33 |
self,
|
| 34 |
hidden_size=768,
|
| 35 |
intermediate_size=3072,
|
|
|
|
| 36 |
num_hidden_layers=12,
|
| 37 |
num_attention_heads=12,
|
| 38 |
num_channels=3,
|
|
|
|
| 50 |
|
| 51 |
self.hidden_size = hidden_size
|
| 52 |
self.intermediate_size = intermediate_size
|
|
|
|
| 53 |
self.num_hidden_layers = num_hidden_layers
|
| 54 |
self.num_attention_heads = num_attention_heads
|
| 55 |
self.num_channels = num_channels
|
|
|
|
| 63 |
self._flash_attn_2_enabled = _flash_attn_2_enabled
|
| 64 |
|
| 65 |
|
| 66 |
+
class Img2HTMLPerceiverConfig(PretrainedConfig):
|
| 67 |
r"""
|
| 68 |
This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate an
|
| 69 |
Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration
|
|
|
|
| 89 |
qk_layer_norms_perceiver (`bool`, *optional*, defaults to `False`):
|
| 90 |
Whether or not to use qk layer norms in perceiver
|
| 91 |
"""
|
| 92 |
+
model_type = "img2html"
|
| 93 |
|
| 94 |
def __init__(
|
| 95 |
self,
|
|
|
|
| 109 |
super().__init__(**kwargs)
|
| 110 |
|
| 111 |
|
| 112 |
+
class Img2HTMLConfig(PretrainedConfig):
|
| 113 |
r"""
|
| 114 |
This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate an
|
| 115 |
Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration
|
|
|
|
| 201 |
>>> # Accessing the model configuration
|
| 202 |
>>> configuration = model.config
|
| 203 |
```"""
|
| 204 |
+
model_type = "img2html"
|
| 205 |
is_composition = False
|
| 206 |
|
| 207 |
def __init__(
|
|
|
|
| 280 |
self.attention_dropout = attention_dropout
|
| 281 |
|
| 282 |
if perceiver_config is None:
|
| 283 |
+
self.perceiver_config = Img2HTMLPerceiverConfig()
|
| 284 |
elif isinstance(perceiver_config, dict):
|
| 285 |
+
self.perceiver_config = Img2HTMLPerceiverConfig(**perceiver_config)
|
| 286 |
+
elif isinstance(perceiver_config, Img2HTMLPerceiverConfig):
|
| 287 |
self.perceiver_config = perceiver_config
|
| 288 |
|
| 289 |
if vision_config is None:
|
| 290 |
+
self.vision_config = Img2HTMLVisionConfig()
|
| 291 |
elif isinstance(vision_config, dict):
|
| 292 |
+
self.vision_config = Img2HTMLVisionConfig(**vision_config)
|
| 293 |
+
elif isinstance(vision_config, Img2HTMLVisionConfig):
|
| 294 |
self.vision_config = vision_config
|
| 295 |
|
| 296 |
super().__init__(
|
modeling_img2html.py
CHANGED
|
@@ -43,7 +43,7 @@ from transformers import PreTrainedModel
|
|
| 43 |
from transformers.utils import logging
|
| 44 |
from transformers.modeling_outputs import ModelOutput
|
| 45 |
|
| 46 |
-
from .configuration_img2html import
|
| 47 |
from .vision import SiglipVisionModel
|
| 48 |
|
| 49 |
|
|
@@ -55,7 +55,7 @@ if is_flash_attn_2_available():
|
|
| 55 |
|
| 56 |
logger = logging.get_logger(__name__)
|
| 57 |
|
| 58 |
-
_CONFIG_FOR_DOC = "
|
| 59 |
|
| 60 |
IMG2HTML_PRETRAINED_MODEL_ARCHIVE_LIST = [
|
| 61 |
"HuggingFaceM4/Img2HTML"
|
|
@@ -698,7 +698,7 @@ class MistralAttention(nn.Module):
|
|
| 698 |
and "Generating Long Sequences with Sparse Transformers".
|
| 699 |
"""
|
| 700 |
|
| 701 |
-
def __init__(self, config:
|
| 702 |
super().__init__()
|
| 703 |
self.config = config
|
| 704 |
self.hidden_size = config.hidden_size
|
|
@@ -1093,7 +1093,7 @@ class MistralFlashAttention2(MistralAttention):
|
|
| 1093 |
|
| 1094 |
|
| 1095 |
class MistralDecoderLayer(nn.Module):
|
| 1096 |
-
def __init__(self, config:
|
| 1097 |
super().__init__()
|
| 1098 |
self.hidden_size = config.hidden_size
|
| 1099 |
self.self_attn = (
|
|
@@ -1176,7 +1176,7 @@ MISTRAL_START_DOCSTRING = r"""
|
|
| 1176 |
and behavior.
|
| 1177 |
|
| 1178 |
Parameters:
|
| 1179 |
-
config ([`
|
| 1180 |
Model configuration class with all the parameters of the model. Initializing with a config file does not
|
| 1181 |
load the weights associated with the model, only the configuration. Check out the
|
| 1182 |
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
|
|
@@ -1188,7 +1188,7 @@ MISTRAL_START_DOCSTRING = r"""
|
|
| 1188 |
MISTRAL_START_DOCSTRING,
|
| 1189 |
)
|
| 1190 |
class VMistralPreTrainedModel(PreTrainedModel):
|
| 1191 |
-
config_class =
|
| 1192 |
base_model_prefix = "model"
|
| 1193 |
supports_gradient_checkpointing = True
|
| 1194 |
_no_split_modules = ["MistralDecoderLayer"]
|
|
@@ -1290,10 +1290,10 @@ class VMistralModel(VMistralPreTrainedModel):
|
|
| 1290 |
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`]
|
| 1291 |
|
| 1292 |
Args:
|
| 1293 |
-
config:
|
| 1294 |
"""
|
| 1295 |
|
| 1296 |
-
def __init__(self, config:
|
| 1297 |
super().__init__(config)
|
| 1298 |
self.config = config
|
| 1299 |
self.padding_idx = config.pad_token_id
|
|
|
|
| 43 |
from transformers.utils import logging
|
| 44 |
from transformers.modeling_outputs import ModelOutput
|
| 45 |
|
| 46 |
+
from .configuration_img2html import Img2HTMLConfig
|
| 47 |
from .vision import SiglipVisionModel
|
| 48 |
|
| 49 |
|
|
|
|
| 55 |
|
| 56 |
logger = logging.get_logger(__name__)
|
| 57 |
|
| 58 |
+
_CONFIG_FOR_DOC = "Img2HTMLConfig"
|
| 59 |
|
| 60 |
IMG2HTML_PRETRAINED_MODEL_ARCHIVE_LIST = [
|
| 61 |
"HuggingFaceM4/Img2HTML"
|
|
|
|
| 698 |
and "Generating Long Sequences with Sparse Transformers".
|
| 699 |
"""
|
| 700 |
|
| 701 |
+
def __init__(self, config: Img2HTMLConfig, qk_layer_norms: bool = False):
|
| 702 |
super().__init__()
|
| 703 |
self.config = config
|
| 704 |
self.hidden_size = config.hidden_size
|
|
|
|
| 1093 |
|
| 1094 |
|
| 1095 |
class MistralDecoderLayer(nn.Module):
|
| 1096 |
+
def __init__(self, config: Img2HTMLConfig):
|
| 1097 |
super().__init__()
|
| 1098 |
self.hidden_size = config.hidden_size
|
| 1099 |
self.self_attn = (
|
|
|
|
| 1176 |
and behavior.
|
| 1177 |
|
| 1178 |
Parameters:
|
| 1179 |
+
config ([`Img2HTMLConfig`]):
|
| 1180 |
Model configuration class with all the parameters of the model. Initializing with a config file does not
|
| 1181 |
load the weights associated with the model, only the configuration. Check out the
|
| 1182 |
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
|
|
|
|
| 1188 |
MISTRAL_START_DOCSTRING,
|
| 1189 |
)
|
| 1190 |
class VMistralPreTrainedModel(PreTrainedModel):
|
| 1191 |
+
config_class = Img2HTMLConfig
|
| 1192 |
base_model_prefix = "model"
|
| 1193 |
supports_gradient_checkpointing = True
|
| 1194 |
_no_split_modules = ["MistralDecoderLayer"]
|
|
|
|
| 1290 |
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MistralDecoderLayer`]
|
| 1291 |
|
| 1292 |
Args:
|
| 1293 |
+
config: Img2HTMLConfig
|
| 1294 |
"""
|
| 1295 |
|
| 1296 |
+
def __init__(self, config: Img2HTMLConfig, vision_model=None):
|
| 1297 |
super().__init__(config)
|
| 1298 |
self.config = config
|
| 1299 |
self.padding_idx = config.pad_token_id
|