ankanmbz committed on
Commit
eb65dcf
·
verified ·
1 Parent(s): 6a50894

Upload jdeval checkpoint-1000

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. jdeval-checkpoint-1000/chat_template.jinja +5 -0
  2. jdeval-checkpoint-1000/config.json +68 -0
  3. jdeval-checkpoint-1000/configuration_llada.py +175 -0
  4. jdeval-checkpoint-1000/latest +1 -0
  5. jdeval-checkpoint-1000/merges.txt +0 -0
  6. jdeval-checkpoint-1000/model-00001-of-00004.safetensors +3 -0
  7. jdeval-checkpoint-1000/model-00002-of-00004.safetensors +3 -0
  8. jdeval-checkpoint-1000/model-00003-of-00004.safetensors +3 -0
  9. jdeval-checkpoint-1000/model-00004-of-00004.safetensors +3 -0
  10. jdeval-checkpoint-1000/model.safetensors +3 -0
  11. jdeval-checkpoint-1000/model.safetensors.index.json +725 -0
  12. jdeval-checkpoint-1000/modeling_dhara.py +1111 -0
  13. jdeval-checkpoint-1000/rng_state.pth +3 -0
  14. jdeval-checkpoint-1000/rng_state_0.pth +3 -0
  15. jdeval-checkpoint-1000/rng_state_1.pth +3 -0
  16. jdeval-checkpoint-1000/rng_state_10.pth +3 -0
  17. jdeval-checkpoint-1000/rng_state_11.pth +3 -0
  18. jdeval-checkpoint-1000/rng_state_12.pth +3 -0
  19. jdeval-checkpoint-1000/rng_state_13.pth +3 -0
  20. jdeval-checkpoint-1000/rng_state_14.pth +3 -0
  21. jdeval-checkpoint-1000/rng_state_15.pth +3 -0
  22. jdeval-checkpoint-1000/rng_state_16.pth +3 -0
  23. jdeval-checkpoint-1000/rng_state_17.pth +3 -0
  24. jdeval-checkpoint-1000/rng_state_18.pth +3 -0
  25. jdeval-checkpoint-1000/rng_state_19.pth +3 -0
  26. jdeval-checkpoint-1000/rng_state_2.pth +3 -0
  27. jdeval-checkpoint-1000/rng_state_20.pth +3 -0
  28. jdeval-checkpoint-1000/rng_state_21.pth +3 -0
  29. jdeval-checkpoint-1000/rng_state_22.pth +3 -0
  30. jdeval-checkpoint-1000/rng_state_23.pth +3 -0
  31. jdeval-checkpoint-1000/rng_state_24.pth +3 -0
  32. jdeval-checkpoint-1000/rng_state_25.pth +3 -0
  33. jdeval-checkpoint-1000/rng_state_26.pth +3 -0
  34. jdeval-checkpoint-1000/rng_state_27.pth +3 -0
  35. jdeval-checkpoint-1000/rng_state_28.pth +3 -0
  36. jdeval-checkpoint-1000/rng_state_29.pth +3 -0
  37. jdeval-checkpoint-1000/rng_state_3.pth +3 -0
  38. jdeval-checkpoint-1000/rng_state_30.pth +3 -0
  39. jdeval-checkpoint-1000/rng_state_31.pth +3 -0
  40. jdeval-checkpoint-1000/rng_state_32.pth +3 -0
  41. jdeval-checkpoint-1000/rng_state_33.pth +3 -0
  42. jdeval-checkpoint-1000/rng_state_34.pth +3 -0
  43. jdeval-checkpoint-1000/rng_state_35.pth +3 -0
  44. jdeval-checkpoint-1000/rng_state_36.pth +3 -0
  45. jdeval-checkpoint-1000/rng_state_37.pth +3 -0
  46. jdeval-checkpoint-1000/rng_state_38.pth +3 -0
  47. jdeval-checkpoint-1000/rng_state_39.pth +3 -0
  48. jdeval-checkpoint-1000/rng_state_4.pth +3 -0
  49. jdeval-checkpoint-1000/rng_state_40.pth +3 -0
  50. jdeval-checkpoint-1000/rng_state_41.pth +3 -0
jdeval-checkpoint-1000/chat_template.jinja ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>
2
+
3
+ '+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>
4
+
5
+ ' }}
jdeval-checkpoint-1000/config.json ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_faster_video": false,
3
+ "add_time_instruction": false,
4
+ "architectures": [
5
+ "LlavaLLaDAModelLM"
6
+ ],
7
+ "attention_bias": false,
8
+ "attention_dropout": 0.0,
9
+ "auto_map": {
10
+ "AutoConfig": "configuration_llada.LLaDAConfig",
11
+ "AutoModel": "modeling_llada.LLaDAModelLM",
12
+ "AutoModelForCausalLM": "modeling_llada.LLaDAModelLM"
13
+ },
14
+ "bos_token_id": 126080,
15
+ "dtype": "bfloat16",
16
+ "embedding_loss_lambda": 0.5,
17
+ "embedding_loss_type": "infonce_learnable",
18
+ "embedding_pool_strategy": "response_tokens",
19
+ "eos_token_id": 126081,
20
+ "faster_token_stride": 10,
21
+ "force_sample": false,
22
+ "hidden_act": "silu",
23
+ "hidden_size": 4096,
24
+ "image_aspect_ratio": "square",
25
+ "image_crop_resolution": null,
26
+ "image_grid_pinpoints": null,
27
+ "image_split_resolution": null,
28
+ "initializer_range": 0.02,
29
+ "intermediate_size": 12288,
30
+ "max_position_embeddings": 16384,
31
+ "mm_hidden_size": 1152,
32
+ "mm_newline_position": "grid",
33
+ "mm_patch_merge_type": "flat",
34
+ "mm_projector_lr": null,
35
+ "mm_projector_type": "mlp2x_gelu",
36
+ "mm_resampler_type": null,
37
+ "mm_spatial_pool_mode": "bilinear",
38
+ "mm_spatial_pool_stride": null,
39
+ "mm_tunable_parts": "mm_mlp_adapter,mm_language_model",
40
+ "mm_use_im_patch_token": false,
41
+ "mm_use_im_start_end": false,
42
+ "mm_vision_select_feature": "patch",
43
+ "mm_vision_select_layer": -2,
44
+ "mm_vision_tower": "google/siglip2-so400m-patch14-384",
45
+ "mm_vision_tower_lr": null,
46
+ "model_type": "llada",
47
+ "num_attention_heads": 32,
48
+ "num_hidden_layers": 32,
49
+ "num_key_value_heads": 32,
50
+ "pad_token_id": 126081,
51
+ "pos_skipping_range": 4096,
52
+ "pretraining_tp": 1,
53
+ "rms_norm_eps": 1e-05,
54
+ "rope_scaling": null,
55
+ "rope_theta": 500000.0,
56
+ "tie_word_embeddings": false,
57
+ "tokenizer_model_max_length": 2048,
58
+ "tokenizer_padding_side": "right",
59
+ "transformers_version": "4.57.1",
60
+ "use_auxiliary_embedding_loss": false,
61
+ "use_cache": false,
62
+ "use_mm_proj": true,
63
+ "use_pos_skipping": false,
64
+ "vision_tower_pretrained": null,
65
+ "vocab_size": 126349,
66
+ "y_encoder_dim": 1024,
67
+ "y_encoder_name": "abhinand/MedEmbed-large-v0.1"
68
+ }
jdeval-checkpoint-1000/configuration_llada.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """ LLaDA model configuration"""
21
+
22
+ from transformers.configuration_utils import PretrainedConfig
23
+ from transformers.utils import logging
24
+
25
+
26
+ logger = logging.get_logger(__name__)
27
+
28
+ LLaDA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
29
+
30
+
31
+ class LLaDAConfig(PretrainedConfig):
32
+ r"""
33
+ This is the configuration class to store the configuration of a [`LLaDAModel`]. It is used to instantiate an LLaDA
34
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
35
+ defaults will yield a similar configuration to that of the LLaDA-8B.
36
+
37
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
38
+ documentation from [`PretrainedConfig`] for more information.
39
+
40
+
41
+ Args:
42
+ vocab_size (`int`, *optional*, defaults to 32000):
43
+ Vocabulary size of the LLaDA model. Defines the number of different tokens that can be represented by the
44
+ `input_ids` passed when calling [`LLaDAModel`].
45
+ hidden_size (`int`, *optional*, defaults to 4096):
46
+ Dimension of the hidden representations.
47
+ intermediate_size (`int`, *optional*, defaults to 11008):
48
+ Dimension of the MLP representations.
49
+ num_hidden_layers (`int`, *optional*, defaults to 32):
50
+ Number of hidden layers in the Transformer decoder.
51
+ num_attention_heads (`int`, *optional*, defaults to 32):
52
+ Number of attention heads for each attention layer in the Transformer decoder.
53
+ num_key_value_heads (`int`, *optional*):
54
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
55
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
56
+ `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
57
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
58
+ by mean-pooling all the original heads within that group. For more details, check out [this
59
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
60
+ `num_attention_heads`.
61
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
62
+ The non-linear activation function (function or string) in the decoder.
63
+ max_position_embeddings (`int`, *optional*, defaults to 2048):
64
+ The maximum sequence length that this model might ever be used with.
65
+ initializer_range (`float`, *optional*, defaults to 0.02):
66
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
67
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
68
+ The epsilon used by the rms normalization layers.
69
+ use_cache (`bool`, *optional*, defaults to `True`):
70
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
71
+ relevant if `config.is_decoder=True`.
72
+ pad_token_id (`int`, *optional*):
73
+ Padding token id.
74
+ bos_token_id (`int`, *optional*, defaults to 1):
75
+ Beginning of stream token id.
76
+ eos_token_id (`int`, *optional*, defaults to 2):
77
+ End of stream token id.
78
+ pretraining_tp (`int`, *optional*, defaults to 1):
79
+ Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
80
+ document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to understand more about it. This value is
81
+ necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
82
+ issue](https://github.com/pytorch/pytorch/issues/76232).
83
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
84
+ Whether to tie weight embeddings
85
+ rope_theta (`float`, *optional*, defaults to 10000.0):
86
+ The base period of the RoPE embeddings.
87
+ rope_scaling (`Dict`, *optional*):
88
+ Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
89
+ strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
90
+ `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
91
+ `max_position_embeddings` to the expected new maximum.
92
+ attention_bias (`bool`, *optional*, defaults to `False`):
93
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
94
+ attention_dropout (`float`, *optional*, defaults to 0.0):
95
+ The dropout ratio for the attention probabilities.
96
+ """
97
+
98
+ model_type = "llada"
99
+ keys_to_ignore_at_inference = ["past_key_values"]
100
+
101
+ def __init__(
102
+ self,
103
+ vocab_size=32000,
104
+ hidden_size=4096,
105
+ intermediate_size=11008,
106
+ num_hidden_layers=32,
107
+ num_attention_heads=32,
108
+ num_key_value_heads=None,
109
+ hidden_act="silu",
110
+ max_position_embeddings=2048,
111
+ initializer_range=0.02,
112
+ rms_norm_eps=1e-6,
113
+ use_cache=True,
114
+ pad_token_id=None,
115
+ bos_token_id=1,
116
+ eos_token_id=2,
117
+ pretraining_tp=1,
118
+ tie_word_embeddings=False,
119
+ rope_theta=10000.0,
120
+ rope_scaling=None,
121
+ attention_bias=False,
122
+ attention_dropout=0.0,
123
+ **kwargs,
124
+ ):
125
+ self.vocab_size = vocab_size
126
+ self.max_position_embeddings = max_position_embeddings
127
+ self.hidden_size = hidden_size
128
+ self.intermediate_size = intermediate_size
129
+ self.num_hidden_layers = num_hidden_layers
130
+ self.num_attention_heads = num_attention_heads
131
+
132
+ # for backward compatibility
133
+ if num_key_value_heads is None:
134
+ num_key_value_heads = num_attention_heads
135
+
136
+ self.num_key_value_heads = num_key_value_heads
137
+ self.hidden_act = hidden_act
138
+ self.initializer_range = initializer_range
139
+ self.rms_norm_eps = rms_norm_eps
140
+ self.pretraining_tp = pretraining_tp
141
+ self.use_cache = use_cache
142
+ self.rope_theta = rope_theta
143
+ self.rope_scaling = rope_scaling
144
+ self._rope_scaling_validation()
145
+ self.attention_bias = attention_bias
146
+ self.attention_dropout = attention_dropout
147
+
148
+ super().__init__(
149
+ pad_token_id=pad_token_id,
150
+ bos_token_id=bos_token_id,
151
+ eos_token_id=eos_token_id,
152
+ tie_word_embeddings=tie_word_embeddings,
153
+ **kwargs,
154
+ )
155
+
156
+ def _rope_scaling_validation(self):
157
+ """
158
+ Validate the `rope_scaling` configuration.
159
+ """
160
+ if self.rope_scaling is None:
161
+ return
162
+
163
+ if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
164
+ raise ValueError(
165
+ "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
166
+ f"got {self.rope_scaling}"
167
+ )
168
+ rope_scaling_type = self.rope_scaling.get("type", None)
169
+ rope_scaling_factor = self.rope_scaling.get("factor", None)
170
+ if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
171
+ raise ValueError(
172
+ f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
173
+ )
174
+ if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
175
+ raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
jdeval-checkpoint-1000/latest ADDED
@@ -0,0 +1 @@
 
 
1
+ global_step200
jdeval-checkpoint-1000/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
jdeval-checkpoint-1000/model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ca6a58ce8e8e70217fe1541ec713d50828992943cc43cbe047630cd0ac7633f
3
+ size 4994639360
jdeval-checkpoint-1000/model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7e275933d65285301fe41fce816681ccd423b6d33e370e680b1c8e9c91c6f343
3
+ size 4999802600
jdeval-checkpoint-1000/model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f75de052127aec097e6c8cc3cac4648be758b932919f19e7f23e8a3ee4da6a98
3
+ size 4999827272
jdeval-checkpoint-1000/model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42ef28e0d099f372c077d84a9135fb631de2d28095931c630722aef3530b08db
3
+ size 1873621192
jdeval-checkpoint-1000/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fff5993a4dc4d9190f5ae48e875d413575861198b10ac815947846c5f3abc39e
3
+ size 977900912
jdeval-checkpoint-1000/model.safetensors.index.json ADDED
@@ -0,0 +1,725 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 663456,
4
+ "total_size": 16867789888
5
+ },
6
+ "weight_map": {
7
+ "lm_head.weight": "model-00004-of-00004.safetensors",
8
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
9
+ "model.image_newline": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
18
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
19
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
20
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
26
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
27
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
28
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
29
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
30
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
31
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
32
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
33
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
34
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
35
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
36
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
38
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
39
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
41
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
42
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
43
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
44
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
50
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
51
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
53
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
54
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
55
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
56
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
62
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
63
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
65
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
66
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
67
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
68
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
74
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
75
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
77
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
78
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
79
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
80
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
81
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
86
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
87
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
89
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
90
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
91
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
92
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
93
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
95
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
96
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
97
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
98
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
99
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
101
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
102
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
103
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
104
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
105
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
106
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
107
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
108
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
109
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
110
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
111
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
112
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
113
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
114
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
115
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
116
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
117
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
118
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
119
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
120
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
121
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
122
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
123
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
124
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
125
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
126
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
127
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
128
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
129
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
130
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
131
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
132
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
133
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
134
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
135
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
136
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
137
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
138
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
139
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
140
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
141
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
142
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
143
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
144
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
145
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
146
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
147
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
148
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
149
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
150
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
151
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
152
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
153
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
154
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
155
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
156
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
157
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
158
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
159
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
160
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
161
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
162
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
163
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
164
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
165
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
166
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
167
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
168
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
169
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
170
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
171
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
172
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
173
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
174
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
175
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
176
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
177
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
178
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
179
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
180
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
181
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
182
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
183
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
184
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
185
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
186
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
187
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
188
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
189
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
190
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
191
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
193
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
194
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
195
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
197
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
198
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
199
+ "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
200
+ "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
201
+ "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
202
+ "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
204
+ "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
205
+ "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
206
+ "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
207
+ "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
208
+ "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
209
+ "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
210
+ "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
211
+ "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
212
+ "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
213
+ "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
214
+ "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
215
+ "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
216
+ "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
217
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
218
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
219
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
220
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
221
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
222
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
223
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
224
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
225
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
226
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
227
+ "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
228
+ "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
229
+ "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
230
+ "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
231
+ "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
232
+ "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
233
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
234
+ "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
235
+ "model.layers.31.input_layernorm.weight": "model-00003-of-00004.safetensors",
236
+ "model.layers.31.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
237
+ "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
238
+ "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
239
+ "model.layers.31.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
240
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
241
+ "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
242
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
243
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
244
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
245
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
246
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
247
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
248
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
249
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
250
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
251
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
252
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
253
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
254
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
255
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
256
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
257
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
258
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
259
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
260
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
261
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
262
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
263
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
264
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
265
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
266
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
267
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
268
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
269
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
270
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
271
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
272
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
273
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
274
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
275
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
276
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
277
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
278
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
279
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
280
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
281
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
282
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
283
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
284
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
285
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
286
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
287
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
288
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
289
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
290
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
291
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
292
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
293
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
294
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
295
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
296
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
297
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
298
+ "model.mm_projector.0.bias": "model-00004-of-00004.safetensors",
299
+ "model.mm_projector.0.weight": "model-00004-of-00004.safetensors",
300
+ "model.mm_projector.2.bias": "model-00004-of-00004.safetensors",
301
+ "model.mm_projector.2.weight": "model-00004-of-00004.safetensors",
302
+ "model.norm.weight": "model-00003-of-00004.safetensors",
303
+ "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00004-of-00004.safetensors",
304
+ "model.vision_tower.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00004-of-00004.safetensors",
305
+ "model.vision_tower.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00004-of-00004.safetensors",
306
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00004-of-00004.safetensors",
307
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00004-of-00004.safetensors",
308
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00004-of-00004.safetensors",
309
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00004-of-00004.safetensors",
310
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00004-of-00004.safetensors",
311
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00004-of-00004.safetensors",
312
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00004-of-00004.safetensors",
313
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00004-of-00004.safetensors",
314
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
315
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
316
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
317
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
318
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
319
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
320
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
321
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
322
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00004-of-00004.safetensors",
323
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00004-of-00004.safetensors",
324
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00004-of-00004.safetensors",
325
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00004-of-00004.safetensors",
326
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00004-of-00004.safetensors",
327
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00004-of-00004.safetensors",
328
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00004-of-00004.safetensors",
329
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00004-of-00004.safetensors",
330
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
331
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
332
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
333
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
334
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
335
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
336
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
337
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
338
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00004-of-00004.safetensors",
339
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00004-of-00004.safetensors",
340
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00004-of-00004.safetensors",
341
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00004-of-00004.safetensors",
342
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00004-of-00004.safetensors",
343
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00004-of-00004.safetensors",
344
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00004-of-00004.safetensors",
345
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00004-of-00004.safetensors",
346
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
347
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
348
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
349
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
350
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
351
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
352
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
353
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
354
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00004-of-00004.safetensors",
355
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00004-of-00004.safetensors",
356
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00004-of-00004.safetensors",
357
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00004-of-00004.safetensors",
358
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00004-of-00004.safetensors",
359
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00004-of-00004.safetensors",
360
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00004-of-00004.safetensors",
361
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00004-of-00004.safetensors",
362
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
363
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
364
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
365
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
366
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
367
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
368
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
369
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
370
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00004-of-00004.safetensors",
371
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00004-of-00004.safetensors",
372
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00004-of-00004.safetensors",
373
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00004-of-00004.safetensors",
374
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00004-of-00004.safetensors",
375
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00004-of-00004.safetensors",
376
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00004-of-00004.safetensors",
377
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00004-of-00004.safetensors",
378
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
379
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
380
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
381
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
382
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
383
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
384
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
385
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
386
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00004-of-00004.safetensors",
387
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00004-of-00004.safetensors",
388
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00004-of-00004.safetensors",
389
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00004-of-00004.safetensors",
390
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00004-of-00004.safetensors",
391
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00004-of-00004.safetensors",
392
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00004-of-00004.safetensors",
393
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00004-of-00004.safetensors",
394
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
395
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
396
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
397
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
398
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
399
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
400
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
401
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
402
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00004-of-00004.safetensors",
403
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00004-of-00004.safetensors",
404
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00004-of-00004.safetensors",
405
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00004-of-00004.safetensors",
406
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00004-of-00004.safetensors",
407
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00004-of-00004.safetensors",
408
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00004-of-00004.safetensors",
409
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00004-of-00004.safetensors",
410
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
411
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
412
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
413
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
414
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
415
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
416
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
417
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
418
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00004-of-00004.safetensors",
419
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00004-of-00004.safetensors",
420
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00004-of-00004.safetensors",
421
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00004-of-00004.safetensors",
422
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00004-of-00004.safetensors",
423
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00004-of-00004.safetensors",
424
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00004-of-00004.safetensors",
425
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00004-of-00004.safetensors",
426
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
427
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
428
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
429
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
430
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
431
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
432
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
433
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
434
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00004-of-00004.safetensors",
435
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00004-of-00004.safetensors",
436
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00004-of-00004.safetensors",
437
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00004-of-00004.safetensors",
438
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00004-of-00004.safetensors",
439
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00004-of-00004.safetensors",
440
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00004-of-00004.safetensors",
441
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00004-of-00004.safetensors",
442
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
443
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
444
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
445
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
446
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
447
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
448
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
449
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
450
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00004-of-00004.safetensors",
451
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00004-of-00004.safetensors",
452
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00004-of-00004.safetensors",
453
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00004-of-00004.safetensors",
454
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00004-of-00004.safetensors",
455
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00004-of-00004.safetensors",
456
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00004-of-00004.safetensors",
457
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00004-of-00004.safetensors",
458
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
459
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
460
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
461
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
462
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
463
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
464
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
465
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
466
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00004-of-00004.safetensors",
467
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00004-of-00004.safetensors",
468
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00004-of-00004.safetensors",
469
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00004-of-00004.safetensors",
470
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00004-of-00004.safetensors",
471
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00004-of-00004.safetensors",
472
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00004-of-00004.safetensors",
473
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00004-of-00004.safetensors",
474
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
475
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
476
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
477
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
478
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
479
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
480
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
481
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
482
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00004-of-00004.safetensors",
483
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00004-of-00004.safetensors",
484
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00004-of-00004.safetensors",
485
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00004-of-00004.safetensors",
486
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00004-of-00004.safetensors",
487
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00004-of-00004.safetensors",
488
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00004-of-00004.safetensors",
489
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00004-of-00004.safetensors",
490
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
491
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
492
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
493
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
494
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
495
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
496
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
497
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
498
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00004-of-00004.safetensors",
499
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00004-of-00004.safetensors",
500
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00004-of-00004.safetensors",
501
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00004-of-00004.safetensors",
502
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00004-of-00004.safetensors",
503
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00004-of-00004.safetensors",
504
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00004-of-00004.safetensors",
505
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00004-of-00004.safetensors",
506
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
507
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
508
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
509
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
510
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
511
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
512
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
513
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
514
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00004-of-00004.safetensors",
515
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00004-of-00004.safetensors",
516
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00004-of-00004.safetensors",
517
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00004-of-00004.safetensors",
518
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00004-of-00004.safetensors",
519
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00004-of-00004.safetensors",
520
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00004-of-00004.safetensors",
521
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00004-of-00004.safetensors",
522
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
523
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
524
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
525
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
526
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
527
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
528
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
529
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
530
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00004-of-00004.safetensors",
531
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00004-of-00004.safetensors",
532
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00004-of-00004.safetensors",
533
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00004-of-00004.safetensors",
534
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00004-of-00004.safetensors",
535
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00004-of-00004.safetensors",
536
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00004-of-00004.safetensors",
537
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00004-of-00004.safetensors",
538
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
539
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
540
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
541
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
542
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
543
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
544
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
545
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
546
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00004-of-00004.safetensors",
547
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00004-of-00004.safetensors",
548
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00004-of-00004.safetensors",
549
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00004-of-00004.safetensors",
550
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00004-of-00004.safetensors",
551
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00004-of-00004.safetensors",
552
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00004-of-00004.safetensors",
553
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00004-of-00004.safetensors",
554
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
555
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
556
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
557
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
558
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
559
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
560
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
561
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
562
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00004-of-00004.safetensors",
563
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00004-of-00004.safetensors",
564
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00004-of-00004.safetensors",
565
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00004-of-00004.safetensors",
566
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00004-of-00004.safetensors",
567
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00004-of-00004.safetensors",
568
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00004-of-00004.safetensors",
569
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00004-of-00004.safetensors",
570
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
571
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
572
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
573
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
574
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
575
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
576
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
577
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
578
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00004-of-00004.safetensors",
579
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00004-of-00004.safetensors",
580
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00004-of-00004.safetensors",
581
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00004-of-00004.safetensors",
582
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00004-of-00004.safetensors",
583
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00004-of-00004.safetensors",
584
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00004-of-00004.safetensors",
585
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00004-of-00004.safetensors",
586
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
587
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
588
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
589
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
590
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
591
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
592
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
593
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
594
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00004-of-00004.safetensors",
595
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00004-of-00004.safetensors",
596
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00004-of-00004.safetensors",
597
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00004-of-00004.safetensors",
598
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00004-of-00004.safetensors",
599
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00004-of-00004.safetensors",
600
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00004-of-00004.safetensors",
601
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00004-of-00004.safetensors",
602
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
603
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
604
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
605
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
606
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
607
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
608
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
609
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
610
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00004-of-00004.safetensors",
611
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00004-of-00004.safetensors",
612
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00004-of-00004.safetensors",
613
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00004-of-00004.safetensors",
614
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00004-of-00004.safetensors",
615
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00004-of-00004.safetensors",
616
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00004-of-00004.safetensors",
617
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00004-of-00004.safetensors",
618
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
619
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
620
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
621
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
622
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
623
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
624
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
625
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
626
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00004-of-00004.safetensors",
627
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00004-of-00004.safetensors",
628
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00004-of-00004.safetensors",
629
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00004-of-00004.safetensors",
630
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00004-of-00004.safetensors",
631
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00004-of-00004.safetensors",
632
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00004-of-00004.safetensors",
633
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00004-of-00004.safetensors",
634
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
635
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
636
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
637
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
638
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
639
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
640
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
641
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
642
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00004-of-00004.safetensors",
643
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00004-of-00004.safetensors",
644
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00004-of-00004.safetensors",
645
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00004-of-00004.safetensors",
646
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00004-of-00004.safetensors",
647
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00004-of-00004.safetensors",
648
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00004-of-00004.safetensors",
649
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00004-of-00004.safetensors",
650
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
651
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
652
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
653
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
654
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
655
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
656
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
657
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
658
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00004-of-00004.safetensors",
659
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00004-of-00004.safetensors",
660
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00004-of-00004.safetensors",
661
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00004-of-00004.safetensors",
662
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00004-of-00004.safetensors",
663
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00004-of-00004.safetensors",
664
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00004-of-00004.safetensors",
665
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00004-of-00004.safetensors",
666
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
667
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
668
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
669
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
670
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
671
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
672
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
673
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
674
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00004-of-00004.safetensors",
675
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00004-of-00004.safetensors",
676
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00004-of-00004.safetensors",
677
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00004-of-00004.safetensors",
678
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00004-of-00004.safetensors",
679
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00004-of-00004.safetensors",
680
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00004-of-00004.safetensors",
681
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00004-of-00004.safetensors",
682
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
683
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
684
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
685
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
686
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
687
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
688
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
689
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
690
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00004-of-00004.safetensors",
691
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00004-of-00004.safetensors",
692
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00004-of-00004.safetensors",
693
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00004-of-00004.safetensors",
694
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00004-of-00004.safetensors",
695
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00004-of-00004.safetensors",
696
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00004-of-00004.safetensors",
697
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00004-of-00004.safetensors",
698
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
699
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
700
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
701
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
702
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
703
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
704
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
705
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
706
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00004-of-00004.safetensors",
707
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00004-of-00004.safetensors",
708
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00004-of-00004.safetensors",
709
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00004-of-00004.safetensors",
710
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00004-of-00004.safetensors",
711
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00004-of-00004.safetensors",
712
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00004-of-00004.safetensors",
713
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00004-of-00004.safetensors",
714
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
715
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
716
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00004-of-00004.safetensors",
717
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00004-of-00004.safetensors",
718
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
719
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
720
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
721
+ "model.vision_tower.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
722
+ "model.vision_tower.vision_tower.vision_model.post_layernorm.bias": "model-00004-of-00004.safetensors",
723
+ "model.vision_tower.vision_tower.vision_model.post_layernorm.weight": "model-00004-of-00004.safetensors"
724
+ }
725
+ }
jdeval-checkpoint-1000/modeling_dhara.py ADDED
@@ -0,0 +1,1111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Dhara: Diffusion LLM with Canon Layers
4
+
5
+ Combines:
6
+ 1. Dhara's masked diffusion training (bidirectional attention, high throughput)
7
+ 2. Canon layers (local context mixing via causal depthwise convolutions)
8
+
9
+ Canon layers from "Physics of Language Models: Part 4.1" by Zeyuan Allen-Zhu:
10
+ - Position A: After input LayerNorm, before attention
11
+ - Position C: After post-attention LayerNorm, before MLP
12
+ - kernel_size=4, residual=True, activation=False (default)
13
+
14
+ Expected benefits:
15
+ - ~280-290 tok/s throughput (Dhara's parallel generation)
16
+ - +0.25-0.5% accuracy improvement (Canon's local context mixing)
17
+ """
18
+
19
+ import math
20
+ import warnings
21
+ from typing import Optional, Tuple, Union, List
22
+
23
+ import torch
24
+ import torch.nn as nn
25
+ import torch.nn.functional as F
26
+ from torch.nn import CrossEntropyLoss
27
+
28
+ from transformers import PreTrainedModel
29
+ from transformers.generation import GenerationMixin
30
+ from transformers.modeling_outputs import BaseModelOutputWithPast, MaskedLMOutput
31
+ from transformers.utils import logging
32
+ from transformers.cache_utils import Cache, DynamicCache
33
+ from transformers import PretrainedConfig
34
+
35
+ logger = logging.get_logger(__name__)
36
+
37
+ # Optional performance imports
38
+ try:
39
+ from flash_attn import flash_attn_func
40
+ FLASH_ATTN_AVAILABLE = True
41
+ except ImportError:
42
+ FLASH_ATTN_AVAILABLE = False
43
+
44
+ try:
45
+ import xformers.ops as xops
46
+ XFORMERS_AVAILABLE = True
47
+ except ImportError:
48
+ XFORMERS_AVAILABLE = False
49
+
50
+
51
class DharaConfig(PretrainedConfig):
    """
    Configuration object for the Dhara model.

    Bundles three groups of settings: the transformer backbone, the Canon
    layers, and the masked-diffusion objective.

    Derived defaults:
        - ``head_dim`` falls back to ``hidden_size // num_attention_heads``
          when it is not supplied (or is 0).
        - ``mask_token_id`` falls back to ``vocab_size - 1`` when it is None.
    """

    model_type = "dhara"

    def __init__(
        self,
        # --- backbone ---
        vocab_size: int = 50304,
        hidden_size: int = 384,
        num_hidden_layers: int = 32,
        num_attention_heads: int = 8,
        num_key_value_heads: int = 4,
        intermediate_size: int = 1024,
        head_dim: int = None,
        max_position_embeddings: int = 2048,
        # --- model specifics ---
        hidden_act: str = "silu",
        rms_norm_eps: float = 1e-6,
        rope_theta: float = 10000.0,
        initializer_range: float = 0.02,
        tie_word_embeddings: bool = True,
        attention_dropout: float = 0.0,
        # --- Canon layers ---
        canon_set: str = "AC",  # "A" = before attention, "C" = before MLP
        canon_kernel: int = 4,  # convolution kernel size (2-4)
        canon_residual: bool = True,  # highly recommended
        canon_activation: bool = False,  # not recommended for transformers
        canon_bias: bool = False,
        # --- diffusion ---
        mask_token_id: int = None,  # expected to be set from the tokenizer
        mask_epsilon: float = 0.001,  # minimum mask probability
        num_diffusion_steps: int = 1000,
        # --- special tokens ---
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        pad_token_id: int = 0,
        # --- performance flags ---
        use_cache: bool = False,
        use_flash_attention: bool = True,
        use_xformers: bool = False,
        **kwargs
    ):
        # The base class consumes the special-token ids and the tying flag.
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs
        )

        # Backbone
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.intermediate_size = intermediate_size
        # Any falsy head_dim (None or 0) selects the derived per-head width.
        self.head_dim = head_dim if head_dim else hidden_size // num_attention_heads
        self.max_position_embeddings = max_position_embeddings

        # Model specifics
        self.hidden_act = hidden_act
        self.rms_norm_eps = rms_norm_eps
        self.rope_theta = rope_theta
        self.initializer_range = initializer_range
        self.tie_word_embeddings = tie_word_embeddings
        self.attention_dropout = attention_dropout

        # Canon layers
        self.canon_set = canon_set
        self.canon_kernel = canon_kernel
        self.canon_residual = canon_residual
        self.canon_activation = canon_activation
        self.canon_bias = canon_bias

        # Diffusion: default the mask token to the last vocabulary slot.
        if mask_token_id is None:
            mask_token_id = vocab_size - 1
        self.mask_token_id = mask_token_id
        self.mask_epsilon = mask_epsilon
        self.num_diffusion_steps = num_diffusion_steps

        # Special tokens (re-assigned after the base-class constructor ran)
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.pad_token_id = pad_token_id

        # Performance flags
        self.use_cache = use_cache
        self.use_flash_attention = use_flash_attention
        self.use_xformers = use_xformers
151
+
152
+
153
class CanonLayer(nn.Module):
    """
    Canon layer: a causal, depthwise 1D convolution that mixes each position
    with the ``kernel_size - 1`` positions before it.

    Based on "Physics of Language Models: Part 4.1" by Zeyuan Allen-Zhu.

    Args:
        hidden_size: Number of channels; also the number of conv groups, so
            every channel is filtered independently.
        kernel_size: Width of the convolution window.
        use_residual: If True, the input is added to the convolution output.
        use_activation: If True, SiLU is applied to the convolution output.
        use_bias: If True, the convolution has a (zero-initialised) bias.
    """

    def __init__(
        self,
        hidden_size: int,
        kernel_size: int = 4,
        use_residual: bool = True,
        use_activation: bool = False,
        use_bias: bool = False,
    ):
        super().__init__()
        self.hidden_size = hidden_size
        self.kernel_size = kernel_size
        self.use_residual = use_residual
        self.use_activation = use_activation

        # groups == channels makes the convolution depthwise. Conv1d pads both
        # sides by kernel_size - 1; forward() drops the trailing outputs so
        # only the left padding takes effect.
        self.conv = nn.Conv1d(
            hidden_size,
            hidden_size,
            kernel_size,
            padding=kernel_size - 1,
            groups=hidden_size,
            bias=use_bias,
        )

        # Small-variance init so the layer starts close to the identity
        # (when the residual is enabled).
        nn.init.normal_(self.conv.weight, mean=0.0, std=0.02)
        if use_bias:
            nn.init.zeros_(self.conv.bias)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """
        Args:
            hidden_states: [batch_size, seq_len, hidden_size]
        Returns:
            Tensor of shape [batch_size, seq_len, hidden_size]
        """
        _, seq_len, _ = hidden_states.shape

        # Conv1d wants channels first: [B, H, L]. Keeping only the first
        # seq_len outputs discards the positions that would see the future.
        mixed = self.conv(hidden_states.transpose(1, 2))[:, :, :seq_len]

        if self.use_activation:
            mixed = F.silu(mixed)

        # Back to [B, L, H]
        mixed = mixed.transpose(1, 2)

        if self.use_residual:
            return hidden_states + mixed
        return mixed
219
+
220
+
221
class RMSNorm(nn.Module):
    """Root-mean-square layer normalisation with a learned per-channel scale."""

    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # The statistics are computed in float32; the normalised values are
        # cast back to the caller's dtype before the scale is applied.
        orig_dtype = hidden_states.dtype
        x32 = hidden_states.to(torch.float32)
        mean_square = x32.pow(2).mean(-1, keepdim=True)
        normed = x32 * torch.rsqrt(mean_square + self.variance_epsilon)
        return self.weight * normed.to(orig_dtype)
235
+
236
+
237
class RotaryEmbedding(nn.Module):
    """
    Rotary Position Embeddings (RoPE).

    Precomputes cos/sin tables of shape [max_seq_len_cached, dim] and grows
    them on demand when a longer sequence is requested.

    Args:
        dim: Rotary dimension (the tables have ``dim`` columns).
        max_position_embeddings: Number of positions precomputed at init.
        base: Base of the geometric frequency progression.
        device: Device on which the inverse frequencies are created.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base

        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
        # Non-persistent: recomputed at construction, never stored in checkpoints.
        self.register_buffer("inv_freq", inv_freq, persistent=False)

        self._set_cos_sin_cache(
            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
        )

    def _set_cos_sin_cache(self, seq_len, device, dtype):
        """(Re)build the cos/sin tables for positions ``0 .. seq_len - 1``."""
        self.max_seq_len_cached = seq_len
        # Move the frequencies to the target device so torch.outer never mixes
        # devices when the cache is regrown from forward().
        inv_freq = self.inv_freq.to(device)
        t = torch.arange(self.max_seq_len_cached, device=device, dtype=inv_freq.dtype)

        freqs = torch.outer(t, inv_freq)
        # Duplicate the frequencies so the table spans the full rotary dim.
        emb = torch.cat((freqs, freqs), dim=-1)
        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)

    def forward(self, x, seq_len=None):
        """
        Return the ``(cos, sin)`` tables for the first ``seq_len`` positions.

        Args:
            x: Tensor used for its dtype and device; when ``seq_len`` is
                omitted its second-to-last dimension is taken as the sequence
                length (i.e. ``x`` is laid out as [..., seq_len, head_dim]).
            seq_len: Number of positions to return. Defaults to ``x.shape[-2]``.

        Returns:
            Tuple ``(cos, sin)``, each of shape [seq_len, dim] in ``x.dtype``.
        """
        if seq_len is None:
            # The signature advertises None as the default; previously this
            # fell through to `None > int` and raised TypeError.
            seq_len = x.shape[-2]

        if seq_len > self.max_seq_len_cached:
            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)

        return (
            self.cos_cached[:seq_len].to(dtype=x.dtype),
            self.sin_cached[:seq_len].to(dtype=x.dtype),
        )
270
+
271
+
272
def rotate_half(x):
    """Swap the two halves of the last dimension, negating the second half.

    For a last dimension ``[a, b]`` split at ``size // 2`` the result is
    ``[-b, a]``.
    """
    half = x.shape[-1] // 2
    front, back = x[..., :half], x[..., half:]
    return torch.cat((-back, front), dim=-1)
277
+
278
+
279
def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
    """Apply rotary position embeddings to the query and key tensors.

    Args:
        q: Query tensor.
        k: Key tensor.
        cos: Cosine table, indexed by position along its first dimension.
        sin: Sine table, indexed by position along its first dimension.
        position_ids: Index into ``cos``/``sin`` selecting the rows to use.
        unsqueeze_dim: Dimension inserted into the selected rows so they
            broadcast against ``q`` and ``k``.

    Returns:
        Tuple ``(q_embed, k_embed)`` of the rotated query and key.
    """
    # Gather the rows for the requested positions, add the broadcast
    # dimension, and match the query dtype (used for the key as well).
    cos_sel = cos[position_ids].unsqueeze(unsqueeze_dim).to(q.dtype)
    sin_sel = sin[position_ids].unsqueeze(unsqueeze_dim).to(q.dtype)

    def _rotate(t):
        return (t * cos_sel) + (rotate_half(t) * sin_sel)

    return _rotate(q), _rotate(k)
289
+
290
+
291
class DharaMLP(nn.Module):
    """Gated feed-forward block (SwiGLU): ``down(silu(gate(x)) * up(x))``.

    Reads ``hidden_size`` and ``intermediate_size`` from ``config``; all three
    projections are bias-free.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size

        d_model, d_ff = self.hidden_size, self.intermediate_size
        self.gate_proj = nn.Linear(d_model, d_ff, bias=False)
        self.up_proj = nn.Linear(d_model, d_ff, bias=False)
        self.down_proj = nn.Linear(d_ff, d_model, bias=False)

        self.act_fn = nn.SiLU()

    def forward(self, x):
        gate = self.act_fn(self.gate_proj(x))
        return self.down_proj(gate * self.up_proj(x))
308
+
309
+
310
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """Repeat each key/value head ``n_rep`` times for grouped-query attention.

    Takes [batch, num_key_value_heads, seq_len, head_dim] and returns
    [batch, num_key_value_heads * n_rep, seq_len, head_dim], with the copies
    of each head adjacent to one another. The input is returned untouched
    when ``n_rep`` is 1.
    """
    batch, kv_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    # Insert a repeat axis after the head axis, broadcast it, then fold it
    # into the head axis.
    tiled = hidden_states.unsqueeze(2).expand(batch, kv_heads, n_rep, slen, head_dim)
    return tiled.reshape(batch, kv_heads * n_rep, slen, head_dim)
317
+
318
+
319
class DharaAttention(nn.Module):
    """
    Multi-head bidirectional attention with grouped-query (GQA) support.

    Queries use ``num_attention_heads`` heads; keys and values use
    ``num_key_value_heads`` heads that are repeated to match. Rotary position
    embeddings are applied to queries and keys. No causal mask is ever
    applied (``is_causal`` is False).

    Two execution paths exist:
        - flash-attn, when the package is importable and
          ``config.use_flash_attention`` is set;
        - an eager matmul/softmax path otherwise.
    """

    def __init__(self, config: DharaConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = config.head_dim
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.rope_theta = config.rope_theta
        self.is_causal = False  # CRITICAL: Dhara uses bidirectional attention

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )

        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

        self.rotary_emb = RotaryEmbedding(
            self.head_dim,
            max_position_embeddings=self.max_position_embeddings,
            base=self.rope_theta,
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value=None,
        output_attentions: bool = False,
        use_cache: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Args:
            hidden_states: [batch_size, seq_len, hidden_size].
            attention_mask: Additive mask added to the attention scores on the
                eager path. It is not consumed on the flash-attn path.
            position_ids: Positions used to index the rotary tables.
            past_key_value: Optional cache object; when given, this layer's
                keys/values are merged into it via ``update``.
            output_attentions: Return the attention probabilities. Only the
                eager path materialises them; the flash path returns None.
            use_cache: Accepted for interface compatibility; not read here.

        Returns:
            ``(attn_output, attn_weights, past_key_value)``.

        Raises:
            ValueError: If a cache is passed but the layer has no ``layer_idx``.
        """
        bsz, q_len, _ = hidden_states.size()

        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

        # [B, L, heads * D] -> [B, heads, L, D]
        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)

        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        # Expand the KV heads so every query head has a matching key/value head.
        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        # Bound up front: the flash path never produces attention weights, and
        # leaving the name unbound raised UnboundLocalError at the return
        # whenever output_attentions=True.
        attn_weights = None

        if FLASH_ATTN_AVAILABLE and self.config.use_flash_attention:
            # flash_attn_func expects [B, L, heads, D].
            query_states = query_states.transpose(1, 2).contiguous()
            key_states = key_states.transpose(1, 2).contiguous()
            value_states = value_states.transpose(1, 2).contiguous()

            # Remember the projection dtype so the result can be cast back;
            # otherwise a float32 o_proj would be fed bfloat16 activations.
            proj_dtype = query_states.dtype
            if proj_dtype not in [torch.float16, torch.bfloat16]:
                query_states = query_states.to(torch.bfloat16)
                key_states = key_states.to(torch.bfloat16)
                value_states = value_states.to(torch.bfloat16)

            # NOTE(review): no mask is passed here, so padded positions are
            # attended to on this path even when attention_mask is given.
            attn_output = flash_attn_func(
                query_states,
                key_states,
                value_states,
                # Match the eager path: dropout only while training.
                dropout_p=self.attention_dropout if self.training else 0.0,
                causal=False,  # Bidirectional for diffusion
            )

            attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
            if attn_output.dtype != proj_dtype:
                attn_output = attn_output.to(proj_dtype)

        else:
            # Eager scaled dot-product attention.
            attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)

            if attention_mask is not None:
                attn_weights = attn_weights + attention_mask

            # Softmax in float32 for numerical stability, then back to the query dtype.
            attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
            attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
            attn_output = torch.matmul(attn_weights, value_states)

            attn_output = attn_output.transpose(1, 2).contiguous()
            attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
434
+
435
+
436
class DharaDecoderLayer(nn.Module):
    """
    One Dhara transformer block with optional Canon layers.

    Data flow (bracketed steps exist only when the letter is in
    ``config.canon_set``):
        x -> RMSNorm -> [Canon-A] -> attention -> + x
        x -> RMSNorm -> [Canon-C] -> MLP       -> + x
    """

    def __init__(self, config: DharaConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.config = config

        def make_canon(position):
            # A Canon layer is built only for positions named in canon_set.
            if position not in config.canon_set:
                return None
            return CanonLayer(
                hidden_size=config.hidden_size,
                kernel_size=config.canon_kernel,
                use_residual=config.canon_residual,
                use_activation=config.canon_activation,
                use_bias=config.canon_bias,
            )

        # Attention sub-block: norm -> [Canon-A] -> attention
        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.canon_a = make_canon("A")
        self.self_attn = DharaAttention(config=config, layer_idx=layer_idx)

        # Feed-forward sub-block: norm -> [Canon-C] -> MLP
        self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.canon_c = make_canon("C")
        self.mlp = DharaMLP(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value=None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Run the block.

        Returns:
            A tuple starting with the new hidden states, followed by the
            attention weights when ``output_attentions`` is truthy, followed
            by the key/value cache when ``use_cache`` is truthy.
        """
        # ---- attention sub-block ----
        attn_input = self.input_layernorm(hidden_states)
        if self.canon_a is not None:
            attn_input = self.canon_a(attn_input)

        attn_output, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=attn_input,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
        )
        hidden_states = hidden_states + attn_output

        # ---- feed-forward sub-block ----
        mlp_input = self.post_attention_layernorm(hidden_states)
        if self.canon_c is not None:
            mlp_input = self.canon_c(mlp_input)
        hidden_states = hidden_states + self.mlp(mlp_input)

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)
        if use_cache:
            outputs += (present_key_value,)
        return outputs
533
+
534
+
535
class DharaPreTrainedModel(PreTrainedModel):
    """Shared base for Dhara models: class-level settings plus weight init."""

    config_class = DharaConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["DharaDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _supports_cache_class = True

    def _init_weights(self, module):
        """Initialise Linear and Embedding modules; other module types are left as they are.

        Weights are drawn from N(0, ``config.initializer_range``). Linear
        biases and the embedding row at ``padding_idx`` are zeroed.
        """
        init_std = self.config.initializer_range

        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=init_std)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=init_std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
554
+
555
+
556
class DharaModel(DharaPreTrainedModel):
    """
    Dhara base model: token embedding, a stack of ``DharaDecoderLayer``
    blocks with bidirectional attention and Canon layers, and a final RMSNorm.
    """

    def __init__(self, config: DharaConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        self.layers = nn.ModuleList(
            [DharaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )

        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.gradient_checkpointing = False

        self.config = config
        self.mask_token_id = config.mask_token_id
        # Flash attention is used only when requested AND the package imported.
        self._use_flash_attention_2 = config.use_flash_attention and FLASH_ATTN_AVAILABLE

        self.post_init()

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values=None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        """
        Encode a batch of token ids (or precomputed embeddings).

        Args:
            input_ids: Token ids; mutually exclusive with ``inputs_embeds``.
            attention_mask: On the eager path a 2D mask (0 = masked) is
                expanded into an additive 4D mask; any other rank is passed
                through unchanged. On the flash path the mask is forwarded
                only if it contains a 0, otherwise it is dropped.
            position_ids: Defaults to consecutive positions starting at the
                cached length.
            past_key_values: ``Cache`` instance or legacy tuple cache.
            inputs_embeds: Precomputed embeddings; skips ``embed_tokens``.
            use_cache, output_attentions, output_hidden_states, return_dict:
                Default to the corresponding config values when None.

        Returns:
            ``BaseModelOutputWithPast``, or a tuple of its non-None fields
            when ``return_dict`` is False.

        Raises:
            ValueError: If both or neither of ``input_ids`` and
                ``inputs_embeds`` are given.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape[:2]
        elif inputs_embeds is not None:
            batch_size, seq_length = inputs_embeds.shape[:2]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        past_key_values_length = 0
        if use_cache:
            # Legacy tuple caches are wrapped here and unwrapped before returning.
            use_legacy_cache = not isinstance(past_key_values, Cache)
            if use_legacy_cache:
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            past_key_values_length = past_key_values.get_usable_length(seq_length)

        if position_ids is None:
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            position_ids = torch.arange(
                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
            )
            position_ids = position_ids.unsqueeze(0)

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if self._use_flash_attention_2:
            # Keep the mask only when it actually masks something.
            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
        elif attention_mask is not None and attention_mask.dim() == 2:
            # Bidirectional (non-causal) padding mask: every query may attend
            # to every non-padded key. Expand [B, S] -> [B, 1, S, S] and turn
            # it into an additive bias.
            mask_batch, mask_len = attention_mask.shape
            keep = attention_mask[:, None, None, :].expand(mask_batch, 1, mask_len, mask_len)
            # The most negative finite value is used instead of -inf: a row
            # whose keys are all masked would otherwise softmax to NaN. For
            # every other row the resulting probabilities are unchanged.
            min_value = torch.finfo(inputs_embeds.dtype).min
            additive_mask = torch.zeros(keep.shape, dtype=inputs_embeds.dtype, device=keep.device)
            attention_mask = additive_mask.masked_fill(keep == 0, min_value)
        # Any other mask (None, or one that is not 2D) is forwarded unchanged.

        hidden_states = inputs_embeds

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                # The cache follows the attention weights when those are returned.
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = None
        if use_cache:
            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache

        if not return_dict:
            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

    def add_noise_to_tokens(self, input_ids: torch.LongTensor, t: torch.FloatTensor, eps: float = None):
        """
        MDM-style masking: replace tokens with the mask token at noise level ``t``.

        Each position of sample ``b`` is masked independently with probability
        ``(1 - eps) * t[b] + eps``.

        Args:
            input_ids: Input token IDs [batch_size, seq_len]
            t: Noise level in [0, 1] [batch_size]
            eps: Minimum mask probability; defaults to ``config.mask_epsilon``
                (0.001 if the config has no such attribute).

        Returns:
            Tuple of (noisy_input_ids, corruption_mask, p_mask), where
            ``corruption_mask`` is a boolean [batch_size, seq_len] tensor and
            ``p_mask`` is the per-position mask probability of the same shape.
        """
        batch_size, seq_len = input_ids.shape
        device = input_ids.device

        if eps is None:
            eps = getattr(self.config, 'mask_epsilon', 0.001)
        p_mask = (1 - eps) * t + eps

        # Broadcast the per-sample probability over the sequence.
        p_mask = p_mask.unsqueeze(-1).expand(batch_size, seq_len)

        corruption_mask = torch.rand(batch_size, seq_len, device=device) < p_mask

        # masked_fill keeps input_ids' dtype and accepts a plain Python int,
        # avoiding torch.where's scalar-argument handling.
        noisy_input_ids = input_ids.masked_fill(corruption_mask, self.mask_token_id)

        return noisy_input_ids, corruption_mask, p_mask
742
+
743
+
744
+ class DharaForMaskedDiffusion(DharaPreTrainedModel, GenerationMixin):
745
+ """Dhara Model with Masked Diffusion head for training"""
746
+ _tied_weights_keys = ["lm_head.weight"]
747
+
748
+ def __init__(self, config):
749
+ super().__init__(config)
750
+ self.model = DharaModel(config)
751
+ self.vocab_size = config.vocab_size
752
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
753
+
754
+ self.config = config
755
+ self.mask_token_id = config.mask_token_id
756
+
757
+ self.post_init()
758
+
759
+ def get_input_embeddings(self):
760
+ return self.model.embed_tokens
761
+
762
+ def set_input_embeddings(self, value):
763
+ self.model.embed_tokens = value
764
+
765
+ def get_output_embeddings(self):
766
+ return self.lm_head
767
+
768
+ def set_output_embeddings(self, new_embeddings):
769
+ self.lm_head = new_embeddings
770
+
771
+ def set_decoder(self, decoder):
772
+ self.model = decoder
773
+
774
+ def get_decoder(self):
775
+ return self.model
776
+
777
+ def forward(
778
+ self,
779
+ input_ids: torch.LongTensor = None,
780
+ attention_mask: Optional[torch.Tensor] = None,
781
+ position_ids: Optional[torch.LongTensor] = None,
782
+ past_key_values=None,
783
+ inputs_embeds: Optional[torch.FloatTensor] = None,
784
+ labels: Optional[torch.LongTensor] = None,
785
+ use_cache: Optional[bool] = None,
786
+ output_attentions: Optional[bool] = None,
787
+ output_hidden_states: Optional[bool] = None,
788
+ return_dict: Optional[bool] = None,
789
+ corruption_mask: Optional[torch.BoolTensor] = None,
790
+ p_mask: Optional[torch.Tensor] = None,
791
+ ) -> Union[Tuple, MaskedLMOutput]:
792
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
793
+ output_hidden_states = (
794
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
795
+ )
796
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
797
+
798
+ outputs = self.model(
799
+ input_ids=input_ids,
800
+ attention_mask=attention_mask,
801
+ position_ids=position_ids,
802
+ past_key_values=past_key_values,
803
+ inputs_embeds=inputs_embeds,
804
+ use_cache=use_cache,
805
+ output_attentions=output_attentions,
806
+ output_hidden_states=output_hidden_states,
807
+ return_dict=return_dict,
808
+ )
809
+
810
+ hidden_states = outputs[0]
811
+ if self.config.tie_word_embeddings:
812
+ logits = F.linear(hidden_states, self.model.embed_tokens.weight)
813
+ else:
814
+ logits = self.lm_head(hidden_states)
815
+ logits = logits.float()
816
+
817
+ loss = None
818
+ if labels is not None:
819
+ loss = self.compute_diffusion_loss(logits, labels, corruption_mask, p_mask)
820
+
821
+ if not return_dict:
822
+ output = (logits,) + outputs[1:]
823
+ return (loss,) + output if loss is not None else output
824
+
825
+ return MaskedLMOutput(
826
+ loss=loss,
827
+ logits=logits,
828
+ hidden_states=outputs.hidden_states,
829
+ attentions=outputs.attentions,
830
+ )
831
+
832
def compute_diffusion_loss(self, logits, labels, corruption_mask=None, p_mask=None):
    """
    Masked-diffusion (MDM) loss with p_mask importance weighting.

    Per-token cross entropy is computed for every position, only the
    positions selected by ``corruption_mask`` are kept, each kept loss is
    divided by the matching ``p_mask`` entry, and the sum is normalised by
    the total number of label positions (corrupted or not).

    Args:
        logits: Prediction scores whose last dimension is the class axis and
            whose leading dimensions flatten to one row per label.
        labels: Target token ids.
        corruption_mask: Mask with the same shape as ``labels`` selecting the
            positions that contribute to the loss. Non-bool masks are cast
            to bool.
        p_mask: Tensor with the same shape as ``labels`` holding the divisor
            applied to each selected position's loss.

    Returns:
        A scalar tensor with the weighted loss.

    Raises:
        ValueError: If ``corruption_mask`` or ``p_mask`` is missing.
    """
    if corruption_mask is None or p_mask is None:
        raise ValueError(
            "MDM requires both corruption_mask and p_mask for loss computation."
        )

    # reshape (not view) so sliced / transposed inputs are accepted, and take
    # the class count from the logits themselves instead of the config.
    per_token_loss = F.cross_entropy(
        logits.reshape(-1, logits.shape[-1]),
        labels.reshape(-1),
        reduction='none',
    ).view(labels.shape)

    # An integer 0/1 mask would otherwise be interpreted as indices.
    selected = corruption_mask.bool()

    weighted_losses = per_token_loss[selected] / p_mask[selected]

    # Normalise by every label position, not only the corrupted ones.
    return weighted_losses.sum() / labels.numel()
855
+
856
def add_noise_to_tokens(self, input_ids: torch.LongTensor, t: torch.FloatTensor, eps: float = None):
    """Forward the call unchanged to ``self.model.add_noise_to_tokens`` and return its result."""
    base_model = self.model
    return base_model.add_noise_to_tokens(input_ids, t, eps)
859
+
860
def prepare_inputs_for_generation(
    self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
):
    """
    Build the keyword arguments for one forward call during generation.

    When a cache is supplied, ``input_ids`` is trimmed to the tokens that
    are not yet covered by it and the attention mask is cropped if the
    cache has a maximum length that would be exceeded. If no
    ``position_ids`` are passed in ``kwargs`` they are derived from the
    attention mask.

    Returns:
        A dict with ``input_ids`` (or ``inputs_embeds`` when embeddings are
        given and there is no cache), ``position_ids``, ``past_key_values``,
        ``use_cache`` and ``attention_mask``.
    """
    has_mask = attention_mask is not None

    if past_key_values is not None:
        if isinstance(past_key_values, Cache):
            cache_length = past_key_values.get_seq_length()
            past_length = past_key_values.seen_tokens
            max_cache_length = past_key_values.get_max_length()
        else:
            # Legacy tuple cache: length is read from dim 2 of the first tensor.
            past_length = past_key_values[0][0].shape[2]
            cache_length = past_length
            max_cache_length = None

        # Keep only the tokens the cache has not processed yet.
        if has_mask and attention_mask.shape[1] > input_ids.shape[1]:
            unprocessed = attention_mask.shape[1] - past_length
            input_ids = input_ids[:, -unprocessed:]
        elif past_length < input_ids.shape[1]:
            input_ids = input_ids[:, past_length:]

        # Crop the mask when the cache would exceed its maximum length.
        would_overflow = (
            max_cache_length is not None
            and has_mask
            and cache_length + input_ids.shape[1] > max_cache_length
        )
        if would_overflow:
            attention_mask = attention_mask[:, -max_cache_length:]

    position_ids = kwargs.get("position_ids", None)
    if has_mask and position_ids is None:
        # Running count of attended tokens; masked-out slots get a dummy 1.
        position_ids = attention_mask.long().cumsum(-1) - 1
        position_ids.masked_fill_(attention_mask == 0, 1)
        if past_key_values:
            position_ids = position_ids[:, -input_ids.shape[1]:]

    # Embeddings are only used when there is no cache.
    if inputs_embeds is not None and past_key_values is None:
        model_inputs = {"inputs_embeds": inputs_embeds}
    else:
        model_inputs = {"input_ids": input_ids}

    model_inputs["position_ids"] = position_ids
    model_inputs["past_key_values"] = past_key_values
    model_inputs["use_cache"] = kwargs.get("use_cache")
    model_inputs["attention_mask"] = attention_mask
    return model_inputs
905
+
906
+ @staticmethod
907
+ def _reorder_cache(past_key_values, beam_idx):
908
+ reordered_past = ()
909
+ for layer_past in past_key_values:
910
+ reordered_past += (
911
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
912
+ )
913
+ return reordered_past
914
+
915
@torch.no_grad()
def generate(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    max_length: Optional[int] = None,
    max_new_tokens: Optional[int] = None,
    num_diffusion_steps: int = 10,
    temperature: float = 1.0,
    top_p: float = 0.9,
    top_k: int = 50,
    do_sample: bool = True,
    pad_token_id: Optional[int] = None,
    eos_token_id: Optional[int] = None,
    repetition_penalty: float = 1.2,
    **kwargs
) -> torch.LongTensor:
    """
    Generate text left-to-right, one token per forward pass.

    At every step a single mask token is appended to the sequence, the
    model is run on it, and the logits at that masked position are used to
    pick the next token (sampling or argmax).

    Args:
        input_ids: Input prompt token IDs [batch_size, prompt_len]. When
            omitted, generation starts from an empty prompt with batch size 1.
        max_length: Maximum total sequence length (prompt + generation).
        max_new_tokens: Number of new tokens to generate; takes precedence
            over ``max_length``. If neither is given, 50 tokens are generated.
        num_diffusion_steps: Accepted for API compatibility; this
            implementation does not use it.
        temperature: Sampling temperature (higher = more random). A value
            <= 0 disables sampling and falls back to argmax.
        top_p: Nucleus sampling threshold (only applied when < 1.0).
        top_k: Top-k sampling threshold (only applied when > 0; clamped to
            the vocabulary size).
        do_sample: Whether to sample or take argmax.
        pad_token_id: Token written to rows that have already produced EOS.
            Defaults to ``config.pad_token_id``, then to the EOS id.
        eos_token_id: Token ID for end of sequence. Defaults to
            ``config.eos_token_id``; ``None`` disables EOS handling.
        repetition_penalty: Penalty for tokens already present in the same
            row (>1 = less repetition).
        **kwargs: Ignored.

    Returns:
        Generated token IDs including the prompt, shape
        [batch_size, prompt_len + generated_len]. ``input_ids`` itself is
        returned when there is nothing to generate.
    """
    device = input_ids.device if input_ids is not None else next(self.parameters()).device

    # Determine prompt shape and generation length.
    if input_ids is None:
        batch_size, prompt_len = 1, 0
        input_ids = torch.empty(batch_size, 0, dtype=torch.long, device=device)
    else:
        batch_size, prompt_len = input_ids.shape

    if max_new_tokens is not None:
        gen_len = max_new_tokens
    elif max_length is not None:
        gen_len = max_length - prompt_len
    else:
        gen_len = 50  # Default generation length

    if gen_len <= 0:
        return input_ids

    # Resolve special token IDs.
    mask_token_id = self.config.mask_token_id
    if pad_token_id is None:
        pad_token_id = getattr(self.config, "pad_token_id", 0)
    if eos_token_id is None:
        eos_token_id = getattr(self.config, "eos_token_id", 2)
    # The config may define pad_token_id as None; fall back to EOS for filling.
    fill_token_id = pad_token_id if pad_token_id is not None else eos_token_id

    generated = input_ids.clone()
    # Rows that have not produced EOS yet.
    unfinished = torch.ones(batch_size, dtype=torch.bool, device=device)
    mask_column = torch.full((batch_size, 1), mask_token_id, dtype=torch.long, device=device)

    for _ in range(gen_len):
        # Append one mask token and read the prediction at that position.
        current_seq = torch.cat([generated, mask_column], dim=1)
        next_token_logits = self(input_ids=current_seq).logits[:, -1, :]  # [batch, vocab]
        vocab_size = next_token_logits.shape[-1]

        # Repetition penalty, per row: a penalised logit must always move
        # towards "less likely", so positive logits are divided and negative
        # logits are multiplied.
        if repetition_penalty != 1.0 and generated.shape[1] > 0:
            in_vocab = (generated >= 0) & (generated < vocab_size)
            rows, cols = in_vocab.nonzero(as_tuple=True)
            seen = torch.zeros_like(next_token_logits, dtype=torch.bool)
            seen[rows, generated[rows, cols]] = True
            penalized = torch.where(
                next_token_logits < 0,
                next_token_logits * repetition_penalty,
                next_token_logits / repetition_penalty,
            )
            next_token_logits = torch.where(seen, penalized, next_token_logits)

        if temperature != 1.0 and temperature > 0:
            next_token_logits = next_token_logits / temperature

        if do_sample and temperature > 0:
            # Top-k filtering; k is clamped so torch.topk never exceeds the vocab.
            if top_k > 0:
                k = min(top_k, vocab_size)
                kth_value = torch.topk(next_token_logits, k)[0][..., -1, None]
                next_token_logits = next_token_logits.masked_fill(
                    next_token_logits < kth_value, float('-inf')
                )

            # Top-p (nucleus) filtering.
            if top_p < 1.0:
                sorted_logits, sorted_indices = torch.sort(next_token_logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                sorted_remove = cumulative_probs > top_p
                # Shift right so the first token crossing the threshold is kept.
                sorted_remove[..., 1:] = sorted_remove[..., :-1].clone()
                sorted_remove[..., 0] = False

                # Map the removal mask back to vocabulary order.
                remove = sorted_remove.scatter(1, sorted_indices, sorted_remove)
                next_token_logits = next_token_logits.masked_fill(remove, float('-inf'))

            probs = F.softmax(next_token_logits, dim=-1)
            next_tokens = torch.multinomial(probs, num_samples=1).squeeze(-1)
        else:
            # Greedy decoding
            next_tokens = next_token_logits.argmax(dim=-1)

        if eos_token_id is not None:
            # Rows that already ended only receive padding from here on.
            next_tokens = torch.where(
                unfinished, next_tokens, torch.full_like(next_tokens, fill_token_id)
            )

        generated = torch.cat([generated, next_tokens.unsqueeze(-1)], dim=1)

        if eos_token_id is not None:
            unfinished = unfinished & (next_tokens != eos_token_id)
            if not unfinished.any():
                break

    return generated
1056
+
1057
def save_pretrained(self, save_directory, **kwargs):
    """Save via the parent implementation, defaulting ``safe_serialization`` to True unless the caller set it."""
    kwargs.setdefault('safe_serialization', True)
    return super().save_pretrained(save_directory, **kwargs)
1061
+
1062
+
1063
def count_parameters(model):
    """
    Return ``(total, canon)`` parameter element counts for ``model``.

    ``total`` sums ``numel()`` over ``model.parameters()``; ``canon`` sums it
    over the named parameters whose name contains "canon" (case-insensitive).
    """
    total = 0
    for param in model.parameters():
        total += param.numel()

    canon = 0
    for name, param in model.named_parameters():
        if 'canon' in name.lower():
            canon += param.numel()

    return total, canon
1068
+
1069
+
1070
if __name__ == "__main__":
    # Smoke test: build a small model, report parameter counts, then run one
    # noised forward pass with a loss.
    print("Testing Dhara model creation...")

    vocab_size = 50304
    config = DharaConfig(
        vocab_size=vocab_size,
        hidden_size=384,
        num_hidden_layers=32,
        num_attention_heads=8,
        num_key_value_heads=4,
        intermediate_size=1024,
        canon_set="AC",
        canon_kernel=4,
        canon_residual=True,
    )
    model = DharaForMaskedDiffusion(config)

    total, canon = count_parameters(model)
    print("Model created successfully!")
    print(f"Total params: {total:,} ({total/1e6:.2f}M)")
    print(f"Canon params: {canon:,} ({100*canon/total:.3f}%)")
    print(f"Base Dhara would be: {total - canon:,}")

    # Random token batch for the forward-pass check.
    batch_size, seq_len = 2, 64
    input_ids = torch.randint(0, vocab_size, (batch_size, seq_len))

    # Corrupt the batch with one random noise level per sequence.
    t = torch.rand(batch_size)
    noisy_ids, corruption_mask, p_mask = model.add_noise_to_tokens(input_ids, t)

    forward_kwargs = dict(
        input_ids=noisy_ids,
        labels=input_ids,
        corruption_mask=corruption_mask,
        p_mask=p_mask,
    )
    with torch.no_grad():
        outputs = model(**forward_kwargs)

    print(f"Forward pass: loss={outputs.loss.item():.4f}")
    print("Ready for training!")
jdeval-checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ecedb06a0bb26547b0284c4718df2ff4290fd82eaac41e99ee6071421bd5de3
3
+ size 16359
jdeval-checkpoint-1000/rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff0deccf6cc0327b6d15d25cabf7f2a0d356cf51d21c36baa146f79968217330
3
+ size 16389
jdeval-checkpoint-1000/rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94fd0c2f567d1fb6de10ac4e034312791f47de2f3677c14b2065bd3bb4caacd3
3
+ size 16389
jdeval-checkpoint-1000/rng_state_10.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0793e5bf841f943b9f71063e9093b28914addc2a623033eafb36e77448d9bf1d
3
+ size 16404
jdeval-checkpoint-1000/rng_state_11.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ccd0f7f3d813a3fa0429ca5243454f3dd2c132e5ad03139cf14bb7d5b461e99
3
+ size 16404
jdeval-checkpoint-1000/rng_state_12.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:332ed95b7d22fd2c487b024b2363b05e4894c93a5b5860a6da5bfc752429d7f1
3
+ size 16404
jdeval-checkpoint-1000/rng_state_13.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47b6580e8b30ea7ee4e1e0f9f9b9bf857ad03b820c5c80bf56e44fbb7994ad73
3
+ size 16404
jdeval-checkpoint-1000/rng_state_14.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c9b7074032903706b8b7bed650fc27332ab9c7f13a2144a5b9ddc32d6ca7278
3
+ size 16404
jdeval-checkpoint-1000/rng_state_15.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31c90ce3361b61ea147f7b84c8b2b3cbe73f7883841d7704d63699469820cfc8
3
+ size 16404
jdeval-checkpoint-1000/rng_state_16.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:015208a90007d88b5e024abc8b9bbe1b6ecd6559a24d91ab70537579b981b8c8
3
+ size 16404
jdeval-checkpoint-1000/rng_state_17.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d29653a983da2e011c8d534613be7b95e940008b5d14361eebb62119f757ece
3
+ size 16404
jdeval-checkpoint-1000/rng_state_18.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aee43527f2d891269202a40fcce4cead1b848b2dae5b43beb27ac807a4f5229f
3
+ size 16404
jdeval-checkpoint-1000/rng_state_19.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:001dd3a6db3831e3e1a1d2546544afadf5cddc2c5539ee440811e27246fba0ec
3
+ size 16404
jdeval-checkpoint-1000/rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:810b0678dcccd58dcbc2310d0812a99892c3674d3f1cf4f6b9df0e48fed60ca7
3
+ size 16389
jdeval-checkpoint-1000/rng_state_20.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f39208b864961c81599698a23757bb5487c8100887b787d04d69da99f96f9d54
3
+ size 16404
jdeval-checkpoint-1000/rng_state_21.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0e85727e586442fa93a1fbd79ab71434f108af95ca1c58146d3c7bd11d0e3c6
3
+ size 16404
jdeval-checkpoint-1000/rng_state_22.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:823b93d2af4444da5157372a456117020a16e0a16e4d141f3a138f5bc00f94f3
3
+ size 16404
jdeval-checkpoint-1000/rng_state_23.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f51eb1e822fe4b3720efb937f2a54b11ee3ff75a023b4accb9a60d7c144d0120
3
+ size 16404
jdeval-checkpoint-1000/rng_state_24.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8725aebd4b28eb949771fb1e167ed0394b738ec657d0ca2fcad82325fce080b
3
+ size 16404
jdeval-checkpoint-1000/rng_state_25.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50d2a4acb865cabf79cbb9229e43f1f1920075e96b96211dffef359302f0224a
3
+ size 16404
jdeval-checkpoint-1000/rng_state_26.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db7bdeceee0a9f8e37578d9b776e88ee3b7bb839894d3fa841dbb588255f9d2b
3
+ size 16404
jdeval-checkpoint-1000/rng_state_27.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc7acaee94b567e1b43706208367f6d5c6da7379cb887c882408591082b42ae9
3
+ size 16404
jdeval-checkpoint-1000/rng_state_28.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:235a45c1683df3afdb9e1887701925c581a5fb158d0843f42dff4272b3f25ab0
3
+ size 16404
jdeval-checkpoint-1000/rng_state_29.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8d29a22410ad613ef129912d1f27204551148484f3ded39a2b50e7b6145e480
3
+ size 16404
jdeval-checkpoint-1000/rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4713b7dc17b1f6ee679289a3c2fc1900ad0f45e2051bedf01d15882977e770b2
3
+ size 16389
jdeval-checkpoint-1000/rng_state_30.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea43b9568c84e149ca5d7660228a36cdec4132dcf840ae851d0ff81103ea7c89
3
+ size 16404
jdeval-checkpoint-1000/rng_state_31.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65c9976a1fd2b8d60d6c2b86e63ccc3e792a48359327231fd713aa5fb450019f
3
+ size 16404
jdeval-checkpoint-1000/rng_state_32.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3efc5702320d7b143675cce792fc6e07ff692c6fe38f589c56eca85f58da680c
3
+ size 16404
jdeval-checkpoint-1000/rng_state_33.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a10ea53e98a68f440999911d706f5b4c070a3feec011f8e24ddab2909a4b526
3
+ size 16404
jdeval-checkpoint-1000/rng_state_34.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1375738c05ecd38cfc6089c9d8520e3dc38719c70baa772f235f588356773b49
3
+ size 16404
jdeval-checkpoint-1000/rng_state_35.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f80abdea4401fdd600b18300d4748f10d1cb832724a0ce4047c1d9f10cbc9a0
3
+ size 16404
jdeval-checkpoint-1000/rng_state_36.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24ce27fc4717fedda7ea2c4d7926b1eece24bc3a47ccfc16ac7c6c4f4152ec02
3
+ size 16404
jdeval-checkpoint-1000/rng_state_37.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a76e059827323b0d96a3b6009647651a734fa9c5c168416d6e36af483e752d8
3
+ size 16404
jdeval-checkpoint-1000/rng_state_38.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5d90dc2ae4eb28688baf916fb86bea441fd1b6b7270d7b35f4efa573c2b79ac
3
+ size 16404
jdeval-checkpoint-1000/rng_state_39.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c3fb5b3e4842343c165b4e2d09399ca3d5eece3d53bcccccf9cce12e6ee8acd
3
+ size 16404
jdeval-checkpoint-1000/rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:144786ebc8bd1abd8302a42bbb786aa8df5d360d7e6a161381b8bc0afacd63bc
3
+ size 16389
jdeval-checkpoint-1000/rng_state_40.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59b60b3ea0c58cf3e724eedd4477d70a5bb0f80aef7d5635e327aad3468c6153
3
+ size 16404
jdeval-checkpoint-1000/rng_state_41.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:535e893aeddc30475ce8cd4a10a67a1ed0c54e1fbeb3aab22f75f3153679d41f
3
+ size 16404