Commit fad8e18 (verified) by nclgbd
Parent(s): b7bcd7a

Upload folder using huggingface_hub
README.md CHANGED
@@ -1,3 +1,84 @@
- ---
- license: cc-by-nc-nd-4.0
- ---
+ ---
+ library_name: transformers
+ license: cc-by-nc-4.0
+ ---
+
+ ```
+ python=3.10
+ torch==2.7.1 # may work with more recent versions
+ torchvision==0.22.1
+ transformers==4.40.0
+ opencv-python
+ albumentations
+ accelerate
+ Pillow
+ matplotlib
+ einops
+ pyarrow
+ sentencepiece
+ protobuf
+ ```
+
+ <!-- markdownlint-disable first-line-h1 -->
+ <!-- markdownlint-disable html -->
+
+ <div align="center">
+ <h1>
+ CheXagent
+ </h1>
+ </div>
+
+ <p align="center">
+ 📝 <a href="https://arxiv.org/abs/2401.12208" target="_blank">Paper</a> • 🤗 <a href="https://huggingface.co/StanfordAIMI/CheXagent-2-3b/" target="_blank">Hugging Face</a> • 🧩 <a href="https://github.com/Stanford-AIMI/CheXagent" target="_blank">GitHub</a> • 🪄 <a href="https://stanford-aimi.github.io/chexagent.html" target="_blank">Project</a>
+ </p>
+
+ <div align="center">
+ </div>
+
+ ## ✨ Latest News
+
+ - [04/29/2024]: Model released on [Hugging Face](https://huggingface.co/StanfordAIMI/CheXagent-2-3b/).
+
+ ## 🎬 Get Started
+
+ ```python
+ import io
+
+ import requests
+ import torch
+ from PIL import Image
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ # Step 1: Set up constants
+ model_name = "StanfordAIMI/CheXagent-2-3b"
+ dtype = torch.bfloat16
+ device = "cuda"
+
+ # Step 2: Load the tokenizer and model
+ tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+ model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", trust_remote_code=True)
+ model = model.to(dtype)
+ model.eval()
+
+ # Step 3: Inference (paths and prompt below are example inputs; replace them with your own)
+ paths = ["chest_xray.png"]
+ prompt = "Describe the findings of the chest X-ray."
+ query = tokenizer.from_list_format([*[{'image': path} for path in paths], {'text': prompt}])
+ conv = [{"from": "system", "value": "You are a helpful assistant."}, {"from": "human", "value": query}]
+ input_ids = tokenizer.apply_chat_template(conv, add_generation_prompt=True, return_tensors="pt")
+ output = model.generate(
+     input_ids.to(device), do_sample=False, num_beams=1, temperature=1., top_p=1., use_cache=True,
+     max_new_tokens=512
+ )[0]
+ response = tokenizer.decode(output[input_ids.size(1):-1])
+ ```
+
+ ## ✏️ Citation
+
+ ```
+ @article{chexagent-2024,
+   title={CheXagent: Towards a Foundation Model for Chest X-Ray Interpretation},
+   author={Chen, Zhihong and Varma, Maya and Delbrouck, Jean-Benoit and Paschali, Magdalini and Blankemeier, Louis and Veen, Dave Van and Valanarasu, Jeya Maria Jose and Youssef, Alaa and Cohen, Joseph Paul and Reis, Eduardo Pontes and Tsai, Emily B. and Johnston, Andrew and Olsen, Cameron and Abraham, Tanishq Mathew and Gatidis, Sergios and Chaudhari, Akshay S and Langlotz, Curtis},
+   journal={arXiv preprint arXiv:2401.12208},
+   url={https://arxiv.org/abs/2401.12208},
+   year={2024}
+ }
+ ```
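The Get Started snippet above imports `io`, `requests`, and `Pillow` but leaves image loading to the reader. Below is a minimal sketch of preparing the inputs; the URL and prompt are illustrative placeholders, not part of the released model card.

```python
import io

import requests
from PIL import Image

# Placeholder URL: point this at any chest X-ray you are permitted to use.
image_url = "https://example.com/frontal_view.png"
image = Image.open(io.BytesIO(requests.get(image_url, timeout=30).content)).convert("RGB")
image.save("frontal_view.png")

# The snippet accepts one or more image paths plus a free-text instruction.
paths = ["frontal_view.png"]
prompt = "Generate the findings section of the chest X-ray report."
```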
added_tokens.json ADDED
@@ -0,0 +1,113 @@
+ {
+   "\t\t": 50294,
+   "\t\t\t": 50293,
+   "\t\t\t\t": 50292,
+   "\t\t\t\t\t": 50291,
+   "\t\t\t\t\t\t": 50290,
+   "\t\t\t\t\t\t\t": 50289,
+   "\t\t\t\t\t\t\t\t": 50288,
+   "\t\t\t\t\t\t\t\t\t": 50287,
+   " ": 50286,
+   " ": 50285,
+   " ": 50284,
+   " ": 50283,
+   " ": 50282,
+   " ": 50281,
+   " ": 50280,
+   " ": 50279,
+   " ": 50278,
+   " ": 50277,
+   " ": 50276,
+   " ": 50275,
+   " ": 50274,
+   " ": 50273,
+   " ": 50272,
+   " ": 50271,
+   " ": 50270,
+   " ": 50269,
+   " ": 50268,
+   " ": 50267,
+   " ": 50266,
+   " ": 50265,
+   " ": 50264,
+   " ": 50263,
+   " ": 50262,
+   " ": 50261,
+   " ": 50260,
+   " ": 50259,
+   " ": 50258,
+   " ": 50257,
+   "<|/box|>": 50301,
+   "<|/img|>": 50296,
+   "<|/quad|>": 50303,
+   "<|/ref|>": 50299,
+   "<|box|>": 50300,
+   "<|coord_0|>": 50304,
+   "<|coord_1|>": 50305,
+   "<|coord_2|>": 50306,
+   "<|coord_3|>": 50307,
+   "<|coord_4|>": 50308,
+   "<|coord_5|>": 50309,
+   "<|coord_6|>": 50310,
+   "<|coord_7|>": 50311,
+   "<|coord_8|>": 50312,
+   "<|coord_9|>": 50313,
+   "<|extra_0|>": 50314,
+   "<|extra_10|>": 50324,
+   "<|extra_11|>": 50325,
+   "<|extra_12|>": 50326,
+   "<|extra_13|>": 50327,
+   "<|extra_14|>": 50328,
+   "<|extra_15|>": 50329,
+   "<|extra_16|>": 50330,
+   "<|extra_17|>": 50331,
+   "<|extra_18|>": 50332,
+   "<|extra_19|>": 50333,
+   "<|extra_1|>": 50315,
+   "<|extra_20|>": 50334,
+   "<|extra_21|>": 50335,
+   "<|extra_22|>": 50336,
+   "<|extra_23|>": 50337,
+   "<|extra_24|>": 50338,
+   "<|extra_25|>": 50339,
+   "<|extra_26|>": 50340,
+   "<|extra_27|>": 50341,
+   "<|extra_28|>": 50342,
+   "<|extra_29|>": 50343,
+   "<|extra_2|>": 50316,
+   "<|extra_30|>": 50344,
+   "<|extra_31|>": 50345,
+   "<|extra_32|>": 50346,
+   "<|extra_33|>": 50347,
+   "<|extra_34|>": 50348,
+   "<|extra_35|>": 50349,
+   "<|extra_36|>": 50350,
+   "<|extra_37|>": 50351,
+   "<|extra_38|>": 50352,
+   "<|extra_39|>": 50353,
+   "<|extra_3|>": 50317,
+   "<|extra_40|>": 50354,
+   "<|extra_41|>": 50355,
+   "<|extra_42|>": 50356,
+   "<|extra_43|>": 50357,
+   "<|extra_44|>": 50358,
+   "<|extra_45|>": 50359,
+   "<|extra_46|>": 50360,
+   "<|extra_47|>": 50361,
+   "<|extra_48|>": 50362,
+   "<|extra_49|>": 50363,
+   "<|extra_4|>": 50318,
+   "<|extra_50|>": 50364,
+   "<|extra_51|>": 50365,
+   "<|extra_52|>": 50366,
+   "<|extra_53|>": 50367,
+   "<|extra_5|>": 50319,
+   "<|extra_6|>": 50320,
+   "<|extra_7|>": 50321,
+   "<|extra_8|>": 50322,
+   "<|extra_9|>": 50323,
+   "<|imgpad|>": 50297,
+   "<|img|>": 50295,
+   "<|quad|>": 50302,
+   "<|ref|>": 50298
+ }
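Beyond the whitespace tokens inherited from the base Phi/CodeGen vocabulary, this file registers the multimodal control tokens (`<|img|>`, `<|imgpad|>`, `<|ref|>`, `<|box|>`, `<|quad|>`, the `<|coord_*|>` digits, and the `<|extra_*|>` placeholders). A small sanity-check sketch, assuming the tokenizer is fetched as in the Get Started snippet:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("StanfordAIMI/CheXagent-2-3b", trust_remote_code=True)

# Expected ids are taken from the added_tokens.json shown above.
for token, expected_id in [("<|img|>", 50295), ("<|/img|>", 50296), ("<|imgpad|>", 50297)]:
    assert tokenizer.convert_tokens_to_ids(token) == expected_id, token
```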
config.json ADDED
@@ -0,0 +1,39 @@
+ {
+   "_name_or_path": "StanfordAIMI/CheXagent-2-3b",
+   "architectures": [
+     "CheXagentForCausalLM"
+   ],
+   "attention_dropout": 0.0,
+   "bos_token_id": 50256,
+   "embd_pdrop": 0.0,
+   "eos_token_id": 50256,
+   "hidden_act": "gelu_new",
+   "hidden_size": 2560,
+   "initializer_range": 0.02,
+   "intermediate_size": 10240,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 2048,
+   "model_type": "phi",
+   "num_attention_heads": 32,
+   "num_hidden_layers": 32,
+   "num_key_value_heads": 32,
+   "partial_rotary_factor": 0.4,
+   "qk_layernorm": false,
+   "resid_pdrop": 0.1,
+   "rope_scaling": null,
+   "rope_theta": 10000.0,
+   "tie_word_embeddings": false,
+   "torch_dtype": "float32",
+   "transformers_version": "4.40.0",
+   "use_cache": false,
+   "visual": {
+     "image_size": 512,
+     "output_dim": 2560,
+     "vision_model_name_or_path": "StanfordAIMI/XraySigLIP__vit-l-16-siglip-384__webli"
+   },
+   "vocab_size": 51200,
+   "auto_map": {
+     "AutoModelForCausalLM": "modeling_chexagent.CheXagentForCausalLM",
+     "AutoConfig": "configuration_chexagent.CheXagentConfig"
+   }
+ }
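The `auto_map` block is what lets the generic `Auto*` classes resolve to the custom code shipped with this repository. A short sketch of inspecting the configuration; the printed values simply restate the JSON above:

```python
from transformers import AutoConfig

# trust_remote_code is required so the auto_map entry can load CheXagentConfig.
config = AutoConfig.from_pretrained("StanfordAIMI/CheXagent-2-3b", trust_remote_code=True)

print(config.model_type)   # phi
print(config.hidden_size)  # 2560
print(config.visual)       # nested vision settings, including the XraySigLIP checkpoint
```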
configuration_chexagent.py ADDED
@@ -0,0 +1,184 @@
+ # coding=utf-8
+ # Copyright 2023 Microsoft and the HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """ Phi model configuration"""
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+ logger = logging.get_logger(__name__)
+
+ PHI_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+     "microsoft/phi-2": "https://huggingface.co/microsoft/phi-2/resolve/main/config.json",
+ }
+
+
+ class CheXagentConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`PhiModel`]. It is used to instantiate a Phi
+     model according to the specified arguments, defining the model architecture. Instantiating a configuration with
+     the defaults will yield a configuration similar to that of the Phi
+     [microsoft/phi-1](https://huggingface.co/microsoft/phi-1).
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+     Args:
+         vocab_size (`int`, *optional*, defaults to 51200):
+             Vocabulary size of the Phi model. Defines the number of different tokens that can be represented by the
+             `inputs_ids` passed when calling [`PhiModel`].
+         hidden_size (`int`, *optional*, defaults to 2048):
+             Dimension of the hidden representations.
+         intermediate_size (`int`, *optional*, defaults to 8192):
+             Dimension of the MLP representations.
+         num_hidden_layers (`int`, *optional*, defaults to 24):
+             Number of hidden layers in the Transformer decoder.
+         num_attention_heads (`int`, *optional*, defaults to 32):
+             Number of attention heads for each attention layer in the Transformer decoder.
+         num_key_value_heads (`int`, *optional*):
+             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
+             `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
+             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be
+             constructed by mean-pooling all the original heads within that group. For more details, check out
+             [this paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, it will default to
+             `num_attention_heads`.
+         resid_pdrop (`float`, *optional*, defaults to 0.0):
+             Dropout probability for MLP outputs.
+         embd_pdrop (`int`, *optional*, defaults to 0.0):
+             The dropout ratio for the embeddings.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio after computing the attention scores.
+         hidden_act (`str` or `function`, *optional*, defaults to `"gelu_new"`):
+             The non-linear activation function (function or string) in the decoder.
+         max_position_embeddings (`int`, *optional*, defaults to 2048):
+             The maximum sequence length that this model might ever be used with. Phi-1 and Phi-1.5 support up to
+             2048 tokens.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+             The epsilon used by the layer normalization layers.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions (not used by all models). Only
+             relevant if `config.is_decoder=True`.
+         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+             Whether to tie weight embeddings.
+         rope_theta (`float`, *optional*, defaults to 10000.0):
+             The base period of the RoPE embeddings.
+         rope_scaling (`Dict`, *optional*):
+             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+             strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format
+             is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+             `max_position_embeddings` to the expected new maximum. See the following thread for more information on
+             how these scaling strategies behave:
+             https://www.reddit.com/r/LocalPersimmon/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This
+             is an experimental feature, subject to breaking API changes in future versions.
+         partial_rotary_factor (`float`, *optional*, defaults to 0.5):
+             Percentage of the query and keys which will have rotary embedding.
+         qk_layernorm (`bool`, *optional*, defaults to `False`):
+             Whether or not to normalize the Queries and Keys after projecting the hidden states.
+         bos_token_id (`int`, *optional*, defaults to 1):
+             Denotes beginning of sequences token id.
+         eos_token_id (`int`, *optional*, defaults to 2):
+             Denotes end of sequences token id.
+     Example:
+     ```python
+     >>> from transformers import PhiModel, PhiConfig
+     >>> # Initializing a Phi-1 style configuration
+     >>> configuration = PhiConfig.from_pretrained("microsoft/phi-1")
+     >>> # Initializing a model from the configuration
+     >>> model = PhiModel(configuration)
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "phi"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size=51200,
+         hidden_size=2048,
+         intermediate_size=8192,
+         num_hidden_layers=24,
+         num_attention_heads=32,
+         num_key_value_heads=None,
+         resid_pdrop=0.0,
+         embd_pdrop=0.0,
+         attention_dropout=0.0,
+         hidden_act="gelu_new",
+         max_position_embeddings=2048,
+         initializer_range=0.02,
+         layer_norm_eps=1e-5,
+         use_cache=True,
+         tie_word_embeddings=False,
+         rope_theta=10000.0,
+         rope_scaling=None,
+         partial_rotary_factor=0.5,
+         qk_layernorm=False,
+         bos_token_id=1,
+         eos_token_id=2,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+
+         if num_key_value_heads is None:
+             num_key_value_heads = num_attention_heads
+
+         self.num_key_value_heads = num_key_value_heads
+         self.resid_pdrop = resid_pdrop
+         self.embd_pdrop = embd_pdrop
+         self.attention_dropout = attention_dropout
+         self.hidden_act = hidden_act
+         self.max_position_embeddings = max_position_embeddings
+         self.initializer_range = initializer_range
+         self.layer_norm_eps = layer_norm_eps
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+         self.rope_scaling = rope_scaling
+         self.partial_rotary_factor = partial_rotary_factor
+         self.qk_layernorm = qk_layernorm
+         self._rope_scaling_validation()
+
+         super().__init__(
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
+
+     # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation
+     def _rope_scaling_validation(self):
+         """
+         Validate the `rope_scaling` configuration.
+         """
+         if self.rope_scaling is None:
+             return
+
+         if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+             raise ValueError(
+                 "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
+                 f"got {self.rope_scaling}"
+             )
+         rope_scaling_type = self.rope_scaling.get("type", None)
+         rope_scaling_factor = self.rope_scaling.get("factor", None)
+         if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+             raise ValueError(
+                 f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+             )
+         if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+             raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
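A quick sketch of the class above, assuming `configuration_chexagent.py` has been downloaded next to your script so it is importable. It shows the Phi-style defaults and the `rope_scaling` validation rejecting a factor that is not greater than 1:

```python
from configuration_chexagent import CheXagentConfig

config = CheXagentConfig()
print(config.hidden_size, config.num_hidden_layers)  # 2048 24

try:
    # Invalid on purpose: the factor field must be a float > 1.
    CheXagentConfig(rope_scaling={"type": "linear", "factor": 0.5})
except ValueError as err:
    print(err)
```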
generation_config.json ADDED
@@ -0,0 +1,6 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 50256,
+   "eos_token_id": 50256,
+   "transformers_version": "4.40.0"
+ }
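As with GPT-2-derived vocabularies, BOS and EOS share token id 50256. A sketch (hypothetical usage, not from the model card) that reads these defaults back:

```python
from transformers import GenerationConfig

gen_config = GenerationConfig.from_pretrained("StanfordAIMI/CheXagent-2-3b")
print(gen_config.bos_token_id, gen_config.eos_token_id)  # 50256 50256
```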
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6260448fbee0a4278ebfd7943f52096e733a105f730a28aace9732509cde3499
+ size 135
model-00002-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1628f60b3bc0cbab16b15a7019f7bafd9befce39d03aedad6f3d373a1999f5a3
+ size 135
model-00003-of-00003.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a48253caadf1548dbfb0c328c0d3988a63745d171c428252b0392bd1759c1faf
+ size 135
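The three diffs above are Git LFS pointer stubs, so the 135-byte sizes refer to the pointers rather than the weights; the index below puts the full sharded checkpoint at roughly 12.6 GB. A sketch for fetching the actual shards with `huggingface_hub` (loading the model through `from_pretrained`, as in the README, does this implicitly):

```python
from huggingface_hub import snapshot_download

# Download the sharded safetensors plus the config/tokenizer files.
local_dir = snapshot_download(
    "StanfordAIMI/CheXagent-2-3b",
    allow_patterns=["*.safetensors", "*.json", "*.py", "*.txt"],
)
print(local_dir)
```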
model.safetensors.index.json ADDED
@@ -0,0 +1,868 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 12562987008
4
+ },
5
+ "weight_map": {
6
+ "lm_head.bias": "model-00003-of-00003.safetensors",
7
+ "lm_head.weight": "model-00003-of-00003.safetensors",
8
+ "model.embed_tokens.weight": "model-00001-of-00003.safetensors",
9
+ "model.final_layernorm.bias": "model-00003-of-00003.safetensors",
10
+ "model.final_layernorm.weight": "model-00003-of-00003.safetensors",
11
+ "model.layers.0.input_layernorm.bias": "model-00001-of-00003.safetensors",
12
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors",
13
+ "model.layers.0.mlp.fc1.bias": "model-00001-of-00003.safetensors",
14
+ "model.layers.0.mlp.fc1.weight": "model-00001-of-00003.safetensors",
15
+ "model.layers.0.mlp.fc2.bias": "model-00001-of-00003.safetensors",
16
+ "model.layers.0.mlp.fc2.weight": "model-00001-of-00003.safetensors",
17
+ "model.layers.0.self_attn.dense.bias": "model-00001-of-00003.safetensors",
18
+ "model.layers.0.self_attn.dense.weight": "model-00001-of-00003.safetensors",
19
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
20
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
21
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
22
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
23
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
24
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
25
+ "model.layers.1.input_layernorm.bias": "model-00001-of-00003.safetensors",
26
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors",
27
+ "model.layers.1.mlp.fc1.bias": "model-00001-of-00003.safetensors",
28
+ "model.layers.1.mlp.fc1.weight": "model-00001-of-00003.safetensors",
29
+ "model.layers.1.mlp.fc2.bias": "model-00001-of-00003.safetensors",
30
+ "model.layers.1.mlp.fc2.weight": "model-00001-of-00003.safetensors",
31
+ "model.layers.1.self_attn.dense.bias": "model-00001-of-00003.safetensors",
32
+ "model.layers.1.self_attn.dense.weight": "model-00001-of-00003.safetensors",
33
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
34
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
35
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
36
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
37
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
38
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
39
+ "model.layers.10.input_layernorm.bias": "model-00001-of-00003.safetensors",
40
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors",
41
+ "model.layers.10.mlp.fc1.bias": "model-00001-of-00003.safetensors",
42
+ "model.layers.10.mlp.fc1.weight": "model-00001-of-00003.safetensors",
43
+ "model.layers.10.mlp.fc2.bias": "model-00001-of-00003.safetensors",
44
+ "model.layers.10.mlp.fc2.weight": "model-00001-of-00003.safetensors",
45
+ "model.layers.10.self_attn.dense.bias": "model-00001-of-00003.safetensors",
46
+ "model.layers.10.self_attn.dense.weight": "model-00001-of-00003.safetensors",
47
+ "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
48
+ "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
49
+ "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
50
+ "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
51
+ "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
52
+ "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
53
+ "model.layers.11.input_layernorm.bias": "model-00001-of-00003.safetensors",
54
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors",
55
+ "model.layers.11.mlp.fc1.bias": "model-00001-of-00003.safetensors",
56
+ "model.layers.11.mlp.fc1.weight": "model-00001-of-00003.safetensors",
57
+ "model.layers.11.mlp.fc2.bias": "model-00001-of-00003.safetensors",
58
+ "model.layers.11.mlp.fc2.weight": "model-00001-of-00003.safetensors",
59
+ "model.layers.11.self_attn.dense.bias": "model-00001-of-00003.safetensors",
60
+ "model.layers.11.self_attn.dense.weight": "model-00001-of-00003.safetensors",
61
+ "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
62
+ "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
63
+ "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
64
+ "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
65
+ "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
66
+ "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
67
+ "model.layers.12.input_layernorm.bias": "model-00001-of-00003.safetensors",
68
+ "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors",
69
+ "model.layers.12.mlp.fc1.bias": "model-00001-of-00003.safetensors",
70
+ "model.layers.12.mlp.fc1.weight": "model-00001-of-00003.safetensors",
71
+ "model.layers.12.mlp.fc2.bias": "model-00001-of-00003.safetensors",
72
+ "model.layers.12.mlp.fc2.weight": "model-00001-of-00003.safetensors",
73
+ "model.layers.12.self_attn.dense.bias": "model-00001-of-00003.safetensors",
74
+ "model.layers.12.self_attn.dense.weight": "model-00001-of-00003.safetensors",
75
+ "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
76
+ "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
77
+ "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
78
+ "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
79
+ "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
80
+ "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
81
+ "model.layers.13.input_layernorm.bias": "model-00001-of-00003.safetensors",
82
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors",
83
+ "model.layers.13.mlp.fc1.bias": "model-00001-of-00003.safetensors",
84
+ "model.layers.13.mlp.fc1.weight": "model-00001-of-00003.safetensors",
85
+ "model.layers.13.mlp.fc2.bias": "model-00001-of-00003.safetensors",
86
+ "model.layers.13.mlp.fc2.weight": "model-00001-of-00003.safetensors",
87
+ "model.layers.13.self_attn.dense.bias": "model-00001-of-00003.safetensors",
88
+ "model.layers.13.self_attn.dense.weight": "model-00001-of-00003.safetensors",
89
+ "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
90
+ "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
91
+ "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
92
+ "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
93
+ "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
94
+ "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
95
+ "model.layers.14.input_layernorm.bias": "model-00002-of-00003.safetensors",
96
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00003.safetensors",
97
+ "model.layers.14.mlp.fc1.bias": "model-00002-of-00003.safetensors",
98
+ "model.layers.14.mlp.fc1.weight": "model-00002-of-00003.safetensors",
99
+ "model.layers.14.mlp.fc2.bias": "model-00002-of-00003.safetensors",
100
+ "model.layers.14.mlp.fc2.weight": "model-00002-of-00003.safetensors",
101
+ "model.layers.14.self_attn.dense.bias": "model-00002-of-00003.safetensors",
102
+ "model.layers.14.self_attn.dense.weight": "model-00002-of-00003.safetensors",
103
+ "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
104
+ "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
105
+ "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
106
+ "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
107
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
108
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
109
+ "model.layers.15.input_layernorm.bias": "model-00002-of-00003.safetensors",
110
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00003.safetensors",
111
+ "model.layers.15.mlp.fc1.bias": "model-00002-of-00003.safetensors",
112
+ "model.layers.15.mlp.fc1.weight": "model-00002-of-00003.safetensors",
113
+ "model.layers.15.mlp.fc2.bias": "model-00002-of-00003.safetensors",
114
+ "model.layers.15.mlp.fc2.weight": "model-00002-of-00003.safetensors",
115
+ "model.layers.15.self_attn.dense.bias": "model-00002-of-00003.safetensors",
116
+ "model.layers.15.self_attn.dense.weight": "model-00002-of-00003.safetensors",
117
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
118
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
119
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
120
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
121
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
122
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
123
+ "model.layers.16.input_layernorm.bias": "model-00002-of-00003.safetensors",
124
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00003.safetensors",
125
+ "model.layers.16.mlp.fc1.bias": "model-00002-of-00003.safetensors",
126
+ "model.layers.16.mlp.fc1.weight": "model-00002-of-00003.safetensors",
127
+ "model.layers.16.mlp.fc2.bias": "model-00002-of-00003.safetensors",
128
+ "model.layers.16.mlp.fc2.weight": "model-00002-of-00003.safetensors",
129
+ "model.layers.16.self_attn.dense.bias": "model-00002-of-00003.safetensors",
130
+ "model.layers.16.self_attn.dense.weight": "model-00002-of-00003.safetensors",
131
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
132
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
133
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
134
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
135
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
136
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
137
+ "model.layers.17.input_layernorm.bias": "model-00002-of-00003.safetensors",
138
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00003.safetensors",
139
+ "model.layers.17.mlp.fc1.bias": "model-00002-of-00003.safetensors",
140
+ "model.layers.17.mlp.fc1.weight": "model-00002-of-00003.safetensors",
141
+ "model.layers.17.mlp.fc2.bias": "model-00002-of-00003.safetensors",
142
+ "model.layers.17.mlp.fc2.weight": "model-00002-of-00003.safetensors",
143
+ "model.layers.17.self_attn.dense.bias": "model-00002-of-00003.safetensors",
144
+ "model.layers.17.self_attn.dense.weight": "model-00002-of-00003.safetensors",
145
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
146
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
147
+ "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
148
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
149
+ "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
150
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
151
+ "model.layers.18.input_layernorm.bias": "model-00002-of-00003.safetensors",
152
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00003.safetensors",
153
+ "model.layers.18.mlp.fc1.bias": "model-00002-of-00003.safetensors",
154
+ "model.layers.18.mlp.fc1.weight": "model-00002-of-00003.safetensors",
155
+ "model.layers.18.mlp.fc2.bias": "model-00002-of-00003.safetensors",
156
+ "model.layers.18.mlp.fc2.weight": "model-00002-of-00003.safetensors",
157
+ "model.layers.18.self_attn.dense.bias": "model-00002-of-00003.safetensors",
158
+ "model.layers.18.self_attn.dense.weight": "model-00002-of-00003.safetensors",
159
+ "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
160
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
161
+ "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
162
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
163
+ "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
164
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
165
+ "model.layers.19.input_layernorm.bias": "model-00002-of-00003.safetensors",
166
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00003.safetensors",
167
+ "model.layers.19.mlp.fc1.bias": "model-00002-of-00003.safetensors",
168
+ "model.layers.19.mlp.fc1.weight": "model-00002-of-00003.safetensors",
169
+ "model.layers.19.mlp.fc2.bias": "model-00002-of-00003.safetensors",
170
+ "model.layers.19.mlp.fc2.weight": "model-00002-of-00003.safetensors",
171
+ "model.layers.19.self_attn.dense.bias": "model-00002-of-00003.safetensors",
172
+ "model.layers.19.self_attn.dense.weight": "model-00002-of-00003.safetensors",
173
+ "model.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
174
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
175
+ "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
176
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
177
+ "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
178
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
179
+ "model.layers.2.input_layernorm.bias": "model-00001-of-00003.safetensors",
180
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors",
181
+ "model.layers.2.mlp.fc1.bias": "model-00001-of-00003.safetensors",
182
+ "model.layers.2.mlp.fc1.weight": "model-00001-of-00003.safetensors",
183
+ "model.layers.2.mlp.fc2.bias": "model-00001-of-00003.safetensors",
184
+ "model.layers.2.mlp.fc2.weight": "model-00001-of-00003.safetensors",
185
+ "model.layers.2.self_attn.dense.bias": "model-00001-of-00003.safetensors",
186
+ "model.layers.2.self_attn.dense.weight": "model-00001-of-00003.safetensors",
187
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
188
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
189
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
190
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
191
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
192
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
193
+ "model.layers.20.input_layernorm.bias": "model-00002-of-00003.safetensors",
194
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00003.safetensors",
195
+ "model.layers.20.mlp.fc1.bias": "model-00002-of-00003.safetensors",
196
+ "model.layers.20.mlp.fc1.weight": "model-00002-of-00003.safetensors",
197
+ "model.layers.20.mlp.fc2.bias": "model-00002-of-00003.safetensors",
198
+ "model.layers.20.mlp.fc2.weight": "model-00002-of-00003.safetensors",
199
+ "model.layers.20.self_attn.dense.bias": "model-00002-of-00003.safetensors",
200
+ "model.layers.20.self_attn.dense.weight": "model-00002-of-00003.safetensors",
201
+ "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
202
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
203
+ "model.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
204
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
205
+ "model.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
206
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
207
+ "model.layers.21.input_layernorm.bias": "model-00002-of-00003.safetensors",
208
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors",
209
+ "model.layers.21.mlp.fc1.bias": "model-00002-of-00003.safetensors",
210
+ "model.layers.21.mlp.fc1.weight": "model-00002-of-00003.safetensors",
211
+ "model.layers.21.mlp.fc2.bias": "model-00002-of-00003.safetensors",
212
+ "model.layers.21.mlp.fc2.weight": "model-00002-of-00003.safetensors",
213
+ "model.layers.21.self_attn.dense.bias": "model-00002-of-00003.safetensors",
214
+ "model.layers.21.self_attn.dense.weight": "model-00002-of-00003.safetensors",
215
+ "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
216
+ "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
217
+ "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
218
+ "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
219
+ "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
220
+ "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
221
+ "model.layers.22.input_layernorm.bias": "model-00002-of-00003.safetensors",
222
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors",
223
+ "model.layers.22.mlp.fc1.bias": "model-00002-of-00003.safetensors",
224
+ "model.layers.22.mlp.fc1.weight": "model-00002-of-00003.safetensors",
225
+ "model.layers.22.mlp.fc2.bias": "model-00002-of-00003.safetensors",
226
+ "model.layers.22.mlp.fc2.weight": "model-00002-of-00003.safetensors",
227
+ "model.layers.22.self_attn.dense.bias": "model-00002-of-00003.safetensors",
228
+ "model.layers.22.self_attn.dense.weight": "model-00002-of-00003.safetensors",
229
+ "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
230
+ "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
231
+ "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
232
+ "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
233
+ "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
234
+ "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
235
+ "model.layers.23.input_layernorm.bias": "model-00002-of-00003.safetensors",
236
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors",
237
+ "model.layers.23.mlp.fc1.bias": "model-00002-of-00003.safetensors",
238
+ "model.layers.23.mlp.fc1.weight": "model-00002-of-00003.safetensors",
239
+ "model.layers.23.mlp.fc2.bias": "model-00002-of-00003.safetensors",
240
+ "model.layers.23.mlp.fc2.weight": "model-00002-of-00003.safetensors",
241
+ "model.layers.23.self_attn.dense.bias": "model-00002-of-00003.safetensors",
242
+ "model.layers.23.self_attn.dense.weight": "model-00002-of-00003.safetensors",
243
+ "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
244
+ "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
245
+ "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
246
+ "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
247
+ "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
248
+ "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
249
+ "model.layers.24.input_layernorm.bias": "model-00002-of-00003.safetensors",
250
+ "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors",
251
+ "model.layers.24.mlp.fc1.bias": "model-00002-of-00003.safetensors",
252
+ "model.layers.24.mlp.fc1.weight": "model-00002-of-00003.safetensors",
253
+ "model.layers.24.mlp.fc2.bias": "model-00002-of-00003.safetensors",
254
+ "model.layers.24.mlp.fc2.weight": "model-00002-of-00003.safetensors",
255
+ "model.layers.24.self_attn.dense.bias": "model-00002-of-00003.safetensors",
256
+ "model.layers.24.self_attn.dense.weight": "model-00002-of-00003.safetensors",
257
+ "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
258
+ "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
259
+ "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
260
+ "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
261
+ "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
262
+ "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
263
+ "model.layers.25.input_layernorm.bias": "model-00002-of-00003.safetensors",
264
+ "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors",
265
+ "model.layers.25.mlp.fc1.bias": "model-00002-of-00003.safetensors",
266
+ "model.layers.25.mlp.fc1.weight": "model-00002-of-00003.safetensors",
267
+ "model.layers.25.mlp.fc2.bias": "model-00002-of-00003.safetensors",
268
+ "model.layers.25.mlp.fc2.weight": "model-00002-of-00003.safetensors",
269
+ "model.layers.25.self_attn.dense.bias": "model-00002-of-00003.safetensors",
270
+ "model.layers.25.self_attn.dense.weight": "model-00002-of-00003.safetensors",
271
+ "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
272
+ "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
273
+ "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
274
+ "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
275
+ "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
276
+ "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
277
+ "model.layers.26.input_layernorm.bias": "model-00002-of-00003.safetensors",
278
+ "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors",
279
+ "model.layers.26.mlp.fc1.bias": "model-00002-of-00003.safetensors",
280
+ "model.layers.26.mlp.fc1.weight": "model-00002-of-00003.safetensors",
281
+ "model.layers.26.mlp.fc2.bias": "model-00002-of-00003.safetensors",
282
+ "model.layers.26.mlp.fc2.weight": "model-00002-of-00003.safetensors",
283
+ "model.layers.26.self_attn.dense.bias": "model-00002-of-00003.safetensors",
284
+ "model.layers.26.self_attn.dense.weight": "model-00002-of-00003.safetensors",
285
+ "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
286
+ "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
287
+ "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
288
+ "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
289
+ "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
290
+ "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
291
+ "model.layers.27.input_layernorm.bias": "model-00002-of-00003.safetensors",
292
+ "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors",
293
+ "model.layers.27.mlp.fc1.bias": "model-00002-of-00003.safetensors",
294
+ "model.layers.27.mlp.fc1.weight": "model-00002-of-00003.safetensors",
295
+ "model.layers.27.mlp.fc2.bias": "model-00002-of-00003.safetensors",
296
+ "model.layers.27.mlp.fc2.weight": "model-00002-of-00003.safetensors",
297
+ "model.layers.27.self_attn.dense.bias": "model-00002-of-00003.safetensors",
298
+ "model.layers.27.self_attn.dense.weight": "model-00002-of-00003.safetensors",
299
+ "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
300
+ "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
301
+ "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
302
+ "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
303
+ "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
304
+ "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
305
+ "model.layers.28.input_layernorm.bias": "model-00002-of-00003.safetensors",
306
+ "model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors",
307
+ "model.layers.28.mlp.fc1.bias": "model-00002-of-00003.safetensors",
308
+ "model.layers.28.mlp.fc1.weight": "model-00002-of-00003.safetensors",
309
+ "model.layers.28.mlp.fc2.bias": "model-00002-of-00003.safetensors",
310
+ "model.layers.28.mlp.fc2.weight": "model-00002-of-00003.safetensors",
311
+ "model.layers.28.self_attn.dense.bias": "model-00002-of-00003.safetensors",
312
+ "model.layers.28.self_attn.dense.weight": "model-00002-of-00003.safetensors",
313
+ "model.layers.28.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
314
+ "model.layers.28.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
315
+ "model.layers.28.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
316
+ "model.layers.28.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
317
+ "model.layers.28.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
318
+ "model.layers.28.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
319
+ "model.layers.29.input_layernorm.bias": "model-00002-of-00003.safetensors",
320
+ "model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors",
321
+ "model.layers.29.mlp.fc1.bias": "model-00002-of-00003.safetensors",
322
+ "model.layers.29.mlp.fc1.weight": "model-00002-of-00003.safetensors",
323
+ "model.layers.29.mlp.fc2.bias": "model-00002-of-00003.safetensors",
324
+ "model.layers.29.mlp.fc2.weight": "model-00002-of-00003.safetensors",
325
+ "model.layers.29.self_attn.dense.bias": "model-00002-of-00003.safetensors",
326
+ "model.layers.29.self_attn.dense.weight": "model-00002-of-00003.safetensors",
327
+ "model.layers.29.self_attn.k_proj.bias": "model-00002-of-00003.safetensors",
328
+ "model.layers.29.self_attn.k_proj.weight": "model-00002-of-00003.safetensors",
329
+ "model.layers.29.self_attn.q_proj.bias": "model-00002-of-00003.safetensors",
330
+ "model.layers.29.self_attn.q_proj.weight": "model-00002-of-00003.safetensors",
331
+ "model.layers.29.self_attn.v_proj.bias": "model-00002-of-00003.safetensors",
332
+ "model.layers.29.self_attn.v_proj.weight": "model-00002-of-00003.safetensors",
333
+ "model.layers.3.input_layernorm.bias": "model-00001-of-00003.safetensors",
334
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors",
335
+ "model.layers.3.mlp.fc1.bias": "model-00001-of-00003.safetensors",
336
+ "model.layers.3.mlp.fc1.weight": "model-00001-of-00003.safetensors",
337
+ "model.layers.3.mlp.fc2.bias": "model-00001-of-00003.safetensors",
338
+ "model.layers.3.mlp.fc2.weight": "model-00001-of-00003.safetensors",
339
+ "model.layers.3.self_attn.dense.bias": "model-00001-of-00003.safetensors",
340
+ "model.layers.3.self_attn.dense.weight": "model-00001-of-00003.safetensors",
341
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
342
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
343
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
344
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
345
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
346
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
347
+ "model.layers.30.input_layernorm.bias": "model-00003-of-00003.safetensors",
348
+ "model.layers.30.input_layernorm.weight": "model-00003-of-00003.safetensors",
349
+ "model.layers.30.mlp.fc1.bias": "model-00003-of-00003.safetensors",
350
+ "model.layers.30.mlp.fc1.weight": "model-00003-of-00003.safetensors",
351
+ "model.layers.30.mlp.fc2.bias": "model-00003-of-00003.safetensors",
352
+ "model.layers.30.mlp.fc2.weight": "model-00003-of-00003.safetensors",
353
+ "model.layers.30.self_attn.dense.bias": "model-00003-of-00003.safetensors",
354
+ "model.layers.30.self_attn.dense.weight": "model-00003-of-00003.safetensors",
355
+ "model.layers.30.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
356
+ "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
357
+ "model.layers.30.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
358
+ "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
359
+ "model.layers.30.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
360
+ "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
361
+ "model.layers.31.input_layernorm.bias": "model-00003-of-00003.safetensors",
362
+ "model.layers.31.input_layernorm.weight": "model-00003-of-00003.safetensors",
363
+ "model.layers.31.mlp.fc1.bias": "model-00003-of-00003.safetensors",
364
+ "model.layers.31.mlp.fc1.weight": "model-00003-of-00003.safetensors",
365
+ "model.layers.31.mlp.fc2.bias": "model-00003-of-00003.safetensors",
366
+ "model.layers.31.mlp.fc2.weight": "model-00003-of-00003.safetensors",
367
+ "model.layers.31.self_attn.dense.bias": "model-00003-of-00003.safetensors",
368
+ "model.layers.31.self_attn.dense.weight": "model-00003-of-00003.safetensors",
369
+ "model.layers.31.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
370
+ "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
371
+ "model.layers.31.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
372
+ "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
373
+ "model.layers.31.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
374
+ "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
375
+ "model.layers.4.input_layernorm.bias": "model-00001-of-00003.safetensors",
376
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors",
377
+ "model.layers.4.mlp.fc1.bias": "model-00001-of-00003.safetensors",
378
+ "model.layers.4.mlp.fc1.weight": "model-00001-of-00003.safetensors",
379
+ "model.layers.4.mlp.fc2.bias": "model-00001-of-00003.safetensors",
380
+ "model.layers.4.mlp.fc2.weight": "model-00001-of-00003.safetensors",
381
+ "model.layers.4.self_attn.dense.bias": "model-00001-of-00003.safetensors",
382
+ "model.layers.4.self_attn.dense.weight": "model-00001-of-00003.safetensors",
383
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
384
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
385
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
386
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
387
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
388
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
389
+ "model.layers.5.input_layernorm.bias": "model-00001-of-00003.safetensors",
390
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors",
391
+ "model.layers.5.mlp.fc1.bias": "model-00001-of-00003.safetensors",
392
+ "model.layers.5.mlp.fc1.weight": "model-00001-of-00003.safetensors",
393
+ "model.layers.5.mlp.fc2.bias": "model-00001-of-00003.safetensors",
394
+ "model.layers.5.mlp.fc2.weight": "model-00001-of-00003.safetensors",
395
+ "model.layers.5.self_attn.dense.bias": "model-00001-of-00003.safetensors",
396
+ "model.layers.5.self_attn.dense.weight": "model-00001-of-00003.safetensors",
397
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
398
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
399
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
400
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
401
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
402
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
403
+ "model.layers.6.input_layernorm.bias": "model-00001-of-00003.safetensors",
404
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors",
405
+ "model.layers.6.mlp.fc1.bias": "model-00001-of-00003.safetensors",
406
+ "model.layers.6.mlp.fc1.weight": "model-00001-of-00003.safetensors",
407
+ "model.layers.6.mlp.fc2.bias": "model-00001-of-00003.safetensors",
408
+ "model.layers.6.mlp.fc2.weight": "model-00001-of-00003.safetensors",
409
+ "model.layers.6.self_attn.dense.bias": "model-00001-of-00003.safetensors",
410
+ "model.layers.6.self_attn.dense.weight": "model-00001-of-00003.safetensors",
411
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
412
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
413
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
414
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
415
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
416
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
417
+ "model.layers.7.input_layernorm.bias": "model-00001-of-00003.safetensors",
418
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors",
419
+ "model.layers.7.mlp.fc1.bias": "model-00001-of-00003.safetensors",
420
+ "model.layers.7.mlp.fc1.weight": "model-00001-of-00003.safetensors",
421
+ "model.layers.7.mlp.fc2.bias": "model-00001-of-00003.safetensors",
422
+ "model.layers.7.mlp.fc2.weight": "model-00001-of-00003.safetensors",
423
+ "model.layers.7.self_attn.dense.bias": "model-00001-of-00003.safetensors",
424
+ "model.layers.7.self_attn.dense.weight": "model-00001-of-00003.safetensors",
425
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
426
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
427
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
428
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
429
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
430
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
431
+ "model.layers.8.input_layernorm.bias": "model-00001-of-00003.safetensors",
432
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors",
433
+ "model.layers.8.mlp.fc1.bias": "model-00001-of-00003.safetensors",
434
+ "model.layers.8.mlp.fc1.weight": "model-00001-of-00003.safetensors",
435
+ "model.layers.8.mlp.fc2.bias": "model-00001-of-00003.safetensors",
436
+ "model.layers.8.mlp.fc2.weight": "model-00001-of-00003.safetensors",
437
+ "model.layers.8.self_attn.dense.bias": "model-00001-of-00003.safetensors",
438
+ "model.layers.8.self_attn.dense.weight": "model-00001-of-00003.safetensors",
439
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
440
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
441
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
442
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
443
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
444
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
445
+ "model.layers.9.input_layernorm.bias": "model-00001-of-00003.safetensors",
446
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors",
447
+ "model.layers.9.mlp.fc1.bias": "model-00001-of-00003.safetensors",
448
+ "model.layers.9.mlp.fc1.weight": "model-00001-of-00003.safetensors",
449
+ "model.layers.9.mlp.fc2.bias": "model-00001-of-00003.safetensors",
450
+ "model.layers.9.mlp.fc2.weight": "model-00001-of-00003.safetensors",
451
+ "model.layers.9.self_attn.dense.bias": "model-00001-of-00003.safetensors",
452
+ "model.layers.9.self_attn.dense.weight": "model-00001-of-00003.safetensors",
453
+ "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00003.safetensors",
454
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00003.safetensors",
455
+ "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00003.safetensors",
456
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00003.safetensors",
457
+ "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00003.safetensors",
458
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00003.safetensors",
459
+ "model.visual.attn_pool.0.bias": "model-00003-of-00003.safetensors",
460
+ "model.visual.attn_pool.0.weight": "model-00003-of-00003.safetensors",
461
+ "model.visual.attn_pool.2.bias": "model-00003-of-00003.safetensors",
462
+ "model.visual.attn_pool.2.weight": "model-00003-of-00003.safetensors",
463
+ "model.visual.ln_post.bias": "model-00003-of-00003.safetensors",
464
+ "model.visual.ln_post.weight": "model-00003-of-00003.safetensors",
465
+ "model.visual.model.embeddings.patch_embedding.bias": "model-00003-of-00003.safetensors",
466
+ "model.visual.model.embeddings.patch_embedding.weight": "model-00003-of-00003.safetensors",
467
+ "model.visual.model.embeddings.position_embedding.weight": "model-00003-of-00003.safetensors",
468
+ "model.visual.model.encoder.layers.0.layer_norm1.bias": "model-00003-of-00003.safetensors",
469
+ "model.visual.model.encoder.layers.0.layer_norm1.weight": "model-00003-of-00003.safetensors",
470
+ "model.visual.model.encoder.layers.0.layer_norm2.bias": "model-00003-of-00003.safetensors",
471
+ "model.visual.model.encoder.layers.0.layer_norm2.weight": "model-00003-of-00003.safetensors",
472
+ "model.visual.model.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00003.safetensors",
473
+ "model.visual.model.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00003.safetensors",
474
+ "model.visual.model.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00003.safetensors",
475
+ "model.visual.model.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00003.safetensors",
476
+ "model.visual.model.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
477
+ "model.visual.model.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
478
+ "model.visual.model.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00003.safetensors",
479
+ "model.visual.model.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00003.safetensors",
480
+ "model.visual.model.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
481
+ "model.visual.model.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
482
+ "model.visual.model.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
483
+ "model.visual.model.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
484
+ "model.visual.model.encoder.layers.1.layer_norm1.bias": "model-00003-of-00003.safetensors",
485
+ "model.visual.model.encoder.layers.1.layer_norm1.weight": "model-00003-of-00003.safetensors",
486
+ "model.visual.model.encoder.layers.1.layer_norm2.bias": "model-00003-of-00003.safetensors",
487
+ "model.visual.model.encoder.layers.1.layer_norm2.weight": "model-00003-of-00003.safetensors",
488
+ "model.visual.model.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00003.safetensors",
489
+ "model.visual.model.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00003.safetensors",
490
+ "model.visual.model.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00003.safetensors",
491
+ "model.visual.model.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00003.safetensors",
492
+ "model.visual.model.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
493
+ "model.visual.model.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
494
+ "model.visual.model.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00003.safetensors",
495
+ "model.visual.model.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00003.safetensors",
496
+ "model.visual.model.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
497
+ "model.visual.model.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
498
+ "model.visual.model.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
499
+ "model.visual.model.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
500
+ "model.visual.model.encoder.layers.10.layer_norm1.bias": "model-00003-of-00003.safetensors",
501
+ "model.visual.model.encoder.layers.10.layer_norm1.weight": "model-00003-of-00003.safetensors",
502
+ "model.visual.model.encoder.layers.10.layer_norm2.bias": "model-00003-of-00003.safetensors",
503
+ "model.visual.model.encoder.layers.10.layer_norm2.weight": "model-00003-of-00003.safetensors",
504
+ "model.visual.model.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00003.safetensors",
505
+ "model.visual.model.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00003.safetensors",
506
+ "model.visual.model.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00003.safetensors",
507
+ "model.visual.model.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00003.safetensors",
508
+ "model.visual.model.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
509
+ "model.visual.model.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
510
+ "model.visual.model.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00003.safetensors",
511
+ "model.visual.model.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00003.safetensors",
512
+ "model.visual.model.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
513
+ "model.visual.model.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
514
+ "model.visual.model.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
515
+ "model.visual.model.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
516
+ "model.visual.model.encoder.layers.11.layer_norm1.bias": "model-00003-of-00003.safetensors",
517
+ "model.visual.model.encoder.layers.11.layer_norm1.weight": "model-00003-of-00003.safetensors",
518
+ "model.visual.model.encoder.layers.11.layer_norm2.bias": "model-00003-of-00003.safetensors",
519
+ "model.visual.model.encoder.layers.11.layer_norm2.weight": "model-00003-of-00003.safetensors",
520
+ "model.visual.model.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00003.safetensors",
521
+ "model.visual.model.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00003.safetensors",
522
+ "model.visual.model.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00003.safetensors",
523
+ "model.visual.model.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00003.safetensors",
524
+ "model.visual.model.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
525
+ "model.visual.model.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
526
+ "model.visual.model.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00003.safetensors",
527
+ "model.visual.model.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00003.safetensors",
528
+ "model.visual.model.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
529
+ "model.visual.model.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
530
+ "model.visual.model.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
531
+ "model.visual.model.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
532
+ "model.visual.model.encoder.layers.12.layer_norm1.bias": "model-00003-of-00003.safetensors",
533
+ "model.visual.model.encoder.layers.12.layer_norm1.weight": "model-00003-of-00003.safetensors",
534
+ "model.visual.model.encoder.layers.12.layer_norm2.bias": "model-00003-of-00003.safetensors",
535
+ "model.visual.model.encoder.layers.12.layer_norm2.weight": "model-00003-of-00003.safetensors",
536
+ "model.visual.model.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00003.safetensors",
537
+ "model.visual.model.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00003.safetensors",
538
+ "model.visual.model.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00003.safetensors",
539
+ "model.visual.model.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00003.safetensors",
540
+ "model.visual.model.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
541
+ "model.visual.model.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
542
+ "model.visual.model.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00003.safetensors",
543
+ "model.visual.model.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00003.safetensors",
544
+ "model.visual.model.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
545
+ "model.visual.model.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
546
+ "model.visual.model.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
547
+ "model.visual.model.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
548
+ "model.visual.model.encoder.layers.13.layer_norm1.bias": "model-00003-of-00003.safetensors",
549
+ "model.visual.model.encoder.layers.13.layer_norm1.weight": "model-00003-of-00003.safetensors",
550
+ "model.visual.model.encoder.layers.13.layer_norm2.bias": "model-00003-of-00003.safetensors",
551
+ "model.visual.model.encoder.layers.13.layer_norm2.weight": "model-00003-of-00003.safetensors",
552
+ "model.visual.model.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00003.safetensors",
553
+ "model.visual.model.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00003.safetensors",
554
+ "model.visual.model.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00003.safetensors",
555
+ "model.visual.model.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00003.safetensors",
556
+ "model.visual.model.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
557
+ "model.visual.model.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
558
+ "model.visual.model.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00003.safetensors",
559
+ "model.visual.model.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00003.safetensors",
560
+ "model.visual.model.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
561
+ "model.visual.model.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
562
+ "model.visual.model.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
563
+ "model.visual.model.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
564
+ "model.visual.model.encoder.layers.14.layer_norm1.bias": "model-00003-of-00003.safetensors",
565
+ "model.visual.model.encoder.layers.14.layer_norm1.weight": "model-00003-of-00003.safetensors",
566
+ "model.visual.model.encoder.layers.14.layer_norm2.bias": "model-00003-of-00003.safetensors",
567
+ "model.visual.model.encoder.layers.14.layer_norm2.weight": "model-00003-of-00003.safetensors",
568
+ "model.visual.model.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00003.safetensors",
569
+ "model.visual.model.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00003.safetensors",
570
+ "model.visual.model.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00003.safetensors",
571
+ "model.visual.model.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00003.safetensors",
572
+ "model.visual.model.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
573
+ "model.visual.model.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
574
+ "model.visual.model.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00003.safetensors",
575
+ "model.visual.model.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00003.safetensors",
576
+ "model.visual.model.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
577
+ "model.visual.model.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
578
+ "model.visual.model.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
579
+ "model.visual.model.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
580
+ "model.visual.model.encoder.layers.15.layer_norm1.bias": "model-00003-of-00003.safetensors",
581
+ "model.visual.model.encoder.layers.15.layer_norm1.weight": "model-00003-of-00003.safetensors",
582
+ "model.visual.model.encoder.layers.15.layer_norm2.bias": "model-00003-of-00003.safetensors",
583
+ "model.visual.model.encoder.layers.15.layer_norm2.weight": "model-00003-of-00003.safetensors",
584
+ "model.visual.model.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00003.safetensors",
585
+ "model.visual.model.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00003.safetensors",
586
+ "model.visual.model.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00003.safetensors",
587
+ "model.visual.model.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00003.safetensors",
588
+ "model.visual.model.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
589
+ "model.visual.model.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
590
+ "model.visual.model.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00003.safetensors",
591
+ "model.visual.model.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00003.safetensors",
592
+ "model.visual.model.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
593
+ "model.visual.model.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
594
+ "model.visual.model.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
595
+ "model.visual.model.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
596
+ "model.visual.model.encoder.layers.16.layer_norm1.bias": "model-00003-of-00003.safetensors",
597
+ "model.visual.model.encoder.layers.16.layer_norm1.weight": "model-00003-of-00003.safetensors",
598
+ "model.visual.model.encoder.layers.16.layer_norm2.bias": "model-00003-of-00003.safetensors",
599
+ "model.visual.model.encoder.layers.16.layer_norm2.weight": "model-00003-of-00003.safetensors",
600
+ "model.visual.model.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00003.safetensors",
601
+ "model.visual.model.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00003.safetensors",
602
+ "model.visual.model.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00003.safetensors",
603
+ "model.visual.model.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00003.safetensors",
604
+ "model.visual.model.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
605
+ "model.visual.model.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
606
+ "model.visual.model.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00003.safetensors",
607
+ "model.visual.model.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00003.safetensors",
608
+ "model.visual.model.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
609
+ "model.visual.model.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
610
+ "model.visual.model.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
611
+ "model.visual.model.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
612
+ "model.visual.model.encoder.layers.17.layer_norm1.bias": "model-00003-of-00003.safetensors",
613
+ "model.visual.model.encoder.layers.17.layer_norm1.weight": "model-00003-of-00003.safetensors",
614
+ "model.visual.model.encoder.layers.17.layer_norm2.bias": "model-00003-of-00003.safetensors",
615
+ "model.visual.model.encoder.layers.17.layer_norm2.weight": "model-00003-of-00003.safetensors",
616
+ "model.visual.model.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00003.safetensors",
617
+ "model.visual.model.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00003.safetensors",
618
+ "model.visual.model.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00003.safetensors",
619
+ "model.visual.model.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00003.safetensors",
620
+ "model.visual.model.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
621
+ "model.visual.model.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
622
+ "model.visual.model.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00003.safetensors",
623
+ "model.visual.model.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00003.safetensors",
624
+ "model.visual.model.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
625
+ "model.visual.model.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
626
+ "model.visual.model.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
627
+ "model.visual.model.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
628
+ "model.visual.model.encoder.layers.18.layer_norm1.bias": "model-00003-of-00003.safetensors",
629
+ "model.visual.model.encoder.layers.18.layer_norm1.weight": "model-00003-of-00003.safetensors",
630
+ "model.visual.model.encoder.layers.18.layer_norm2.bias": "model-00003-of-00003.safetensors",
631
+ "model.visual.model.encoder.layers.18.layer_norm2.weight": "model-00003-of-00003.safetensors",
632
+ "model.visual.model.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00003.safetensors",
633
+ "model.visual.model.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00003.safetensors",
634
+ "model.visual.model.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00003.safetensors",
635
+ "model.visual.model.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00003.safetensors",
636
+ "model.visual.model.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
637
+ "model.visual.model.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
638
+ "model.visual.model.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00003.safetensors",
639
+ "model.visual.model.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00003.safetensors",
640
+ "model.visual.model.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
641
+ "model.visual.model.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
642
+ "model.visual.model.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
643
+ "model.visual.model.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
644
+ "model.visual.model.encoder.layers.19.layer_norm1.bias": "model-00003-of-00003.safetensors",
645
+ "model.visual.model.encoder.layers.19.layer_norm1.weight": "model-00003-of-00003.safetensors",
646
+ "model.visual.model.encoder.layers.19.layer_norm2.bias": "model-00003-of-00003.safetensors",
647
+ "model.visual.model.encoder.layers.19.layer_norm2.weight": "model-00003-of-00003.safetensors",
648
+ "model.visual.model.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00003.safetensors",
649
+ "model.visual.model.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00003.safetensors",
650
+ "model.visual.model.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00003.safetensors",
651
+ "model.visual.model.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00003.safetensors",
652
+ "model.visual.model.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
653
+ "model.visual.model.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
654
+ "model.visual.model.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00003.safetensors",
655
+ "model.visual.model.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00003.safetensors",
656
+ "model.visual.model.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
657
+ "model.visual.model.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
658
+ "model.visual.model.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
659
+ "model.visual.model.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
660
+ "model.visual.model.encoder.layers.2.layer_norm1.bias": "model-00003-of-00003.safetensors",
661
+ "model.visual.model.encoder.layers.2.layer_norm1.weight": "model-00003-of-00003.safetensors",
662
+ "model.visual.model.encoder.layers.2.layer_norm2.bias": "model-00003-of-00003.safetensors",
663
+ "model.visual.model.encoder.layers.2.layer_norm2.weight": "model-00003-of-00003.safetensors",
664
+ "model.visual.model.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00003.safetensors",
665
+ "model.visual.model.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00003.safetensors",
666
+ "model.visual.model.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00003.safetensors",
667
+ "model.visual.model.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00003.safetensors",
668
+ "model.visual.model.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
669
+ "model.visual.model.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
670
+ "model.visual.model.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00003.safetensors",
671
+ "model.visual.model.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00003.safetensors",
672
+ "model.visual.model.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
673
+ "model.visual.model.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
674
+ "model.visual.model.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
675
+ "model.visual.model.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
676
+ "model.visual.model.encoder.layers.20.layer_norm1.bias": "model-00003-of-00003.safetensors",
677
+ "model.visual.model.encoder.layers.20.layer_norm1.weight": "model-00003-of-00003.safetensors",
678
+ "model.visual.model.encoder.layers.20.layer_norm2.bias": "model-00003-of-00003.safetensors",
679
+ "model.visual.model.encoder.layers.20.layer_norm2.weight": "model-00003-of-00003.safetensors",
680
+ "model.visual.model.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00003.safetensors",
681
+ "model.visual.model.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00003.safetensors",
682
+ "model.visual.model.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00003.safetensors",
683
+ "model.visual.model.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00003.safetensors",
684
+ "model.visual.model.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
685
+ "model.visual.model.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
686
+ "model.visual.model.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00003.safetensors",
687
+ "model.visual.model.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00003.safetensors",
688
+ "model.visual.model.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
689
+ "model.visual.model.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
690
+ "model.visual.model.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
691
+ "model.visual.model.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
692
+ "model.visual.model.encoder.layers.21.layer_norm1.bias": "model-00003-of-00003.safetensors",
693
+ "model.visual.model.encoder.layers.21.layer_norm1.weight": "model-00003-of-00003.safetensors",
694
+ "model.visual.model.encoder.layers.21.layer_norm2.bias": "model-00003-of-00003.safetensors",
695
+ "model.visual.model.encoder.layers.21.layer_norm2.weight": "model-00003-of-00003.safetensors",
696
+ "model.visual.model.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00003.safetensors",
697
+ "model.visual.model.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00003.safetensors",
698
+ "model.visual.model.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00003.safetensors",
699
+ "model.visual.model.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00003.safetensors",
700
+ "model.visual.model.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
701
+ "model.visual.model.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
702
+ "model.visual.model.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00003.safetensors",
703
+ "model.visual.model.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00003.safetensors",
704
+ "model.visual.model.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
705
+ "model.visual.model.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
706
+ "model.visual.model.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
707
+ "model.visual.model.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
708
+ "model.visual.model.encoder.layers.22.layer_norm1.bias": "model-00003-of-00003.safetensors",
709
+ "model.visual.model.encoder.layers.22.layer_norm1.weight": "model-00003-of-00003.safetensors",
710
+ "model.visual.model.encoder.layers.22.layer_norm2.bias": "model-00003-of-00003.safetensors",
711
+ "model.visual.model.encoder.layers.22.layer_norm2.weight": "model-00003-of-00003.safetensors",
712
+ "model.visual.model.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00003.safetensors",
713
+ "model.visual.model.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00003.safetensors",
714
+ "model.visual.model.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00003.safetensors",
715
+ "model.visual.model.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00003.safetensors",
716
+ "model.visual.model.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
717
+ "model.visual.model.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
718
+ "model.visual.model.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00003.safetensors",
719
+ "model.visual.model.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00003.safetensors",
720
+ "model.visual.model.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
721
+ "model.visual.model.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
722
+ "model.visual.model.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
723
+ "model.visual.model.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
724
+ "model.visual.model.encoder.layers.23.layer_norm1.bias": "model-00003-of-00003.safetensors",
725
+ "model.visual.model.encoder.layers.23.layer_norm1.weight": "model-00003-of-00003.safetensors",
726
+ "model.visual.model.encoder.layers.23.layer_norm2.bias": "model-00003-of-00003.safetensors",
727
+ "model.visual.model.encoder.layers.23.layer_norm2.weight": "model-00003-of-00003.safetensors",
728
+ "model.visual.model.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00003.safetensors",
729
+ "model.visual.model.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00003.safetensors",
730
+ "model.visual.model.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00003.safetensors",
731
+ "model.visual.model.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00003.safetensors",
732
+ "model.visual.model.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
733
+ "model.visual.model.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
734
+ "model.visual.model.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00003.safetensors",
735
+ "model.visual.model.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00003.safetensors",
736
+ "model.visual.model.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
737
+ "model.visual.model.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
738
+ "model.visual.model.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
739
+ "model.visual.model.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
740
+ "model.visual.model.encoder.layers.3.layer_norm1.bias": "model-00003-of-00003.safetensors",
741
+ "model.visual.model.encoder.layers.3.layer_norm1.weight": "model-00003-of-00003.safetensors",
742
+ "model.visual.model.encoder.layers.3.layer_norm2.bias": "model-00003-of-00003.safetensors",
743
+ "model.visual.model.encoder.layers.3.layer_norm2.weight": "model-00003-of-00003.safetensors",
744
+ "model.visual.model.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00003.safetensors",
745
+ "model.visual.model.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00003.safetensors",
746
+ "model.visual.model.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00003.safetensors",
747
+ "model.visual.model.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00003.safetensors",
748
+ "model.visual.model.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
749
+ "model.visual.model.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
750
+ "model.visual.model.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00003.safetensors",
751
+ "model.visual.model.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00003.safetensors",
752
+ "model.visual.model.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
753
+ "model.visual.model.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
754
+ "model.visual.model.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
755
+ "model.visual.model.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
756
+ "model.visual.model.encoder.layers.4.layer_norm1.bias": "model-00003-of-00003.safetensors",
757
+ "model.visual.model.encoder.layers.4.layer_norm1.weight": "model-00003-of-00003.safetensors",
758
+ "model.visual.model.encoder.layers.4.layer_norm2.bias": "model-00003-of-00003.safetensors",
759
+ "model.visual.model.encoder.layers.4.layer_norm2.weight": "model-00003-of-00003.safetensors",
760
+ "model.visual.model.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00003.safetensors",
761
+ "model.visual.model.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00003.safetensors",
762
+ "model.visual.model.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00003.safetensors",
763
+ "model.visual.model.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00003.safetensors",
764
+ "model.visual.model.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
765
+ "model.visual.model.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
766
+ "model.visual.model.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00003.safetensors",
767
+ "model.visual.model.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00003.safetensors",
768
+ "model.visual.model.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
769
+ "model.visual.model.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
770
+ "model.visual.model.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
771
+ "model.visual.model.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
772
+ "model.visual.model.encoder.layers.5.layer_norm1.bias": "model-00003-of-00003.safetensors",
773
+ "model.visual.model.encoder.layers.5.layer_norm1.weight": "model-00003-of-00003.safetensors",
774
+ "model.visual.model.encoder.layers.5.layer_norm2.bias": "model-00003-of-00003.safetensors",
775
+ "model.visual.model.encoder.layers.5.layer_norm2.weight": "model-00003-of-00003.safetensors",
776
+ "model.visual.model.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00003.safetensors",
777
+ "model.visual.model.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00003.safetensors",
778
+ "model.visual.model.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00003.safetensors",
779
+ "model.visual.model.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00003.safetensors",
780
+ "model.visual.model.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
781
+ "model.visual.model.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
782
+ "model.visual.model.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00003.safetensors",
783
+ "model.visual.model.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00003.safetensors",
784
+ "model.visual.model.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
785
+ "model.visual.model.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
786
+ "model.visual.model.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
787
+ "model.visual.model.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
788
+ "model.visual.model.encoder.layers.6.layer_norm1.bias": "model-00003-of-00003.safetensors",
789
+ "model.visual.model.encoder.layers.6.layer_norm1.weight": "model-00003-of-00003.safetensors",
790
+ "model.visual.model.encoder.layers.6.layer_norm2.bias": "model-00003-of-00003.safetensors",
791
+ "model.visual.model.encoder.layers.6.layer_norm2.weight": "model-00003-of-00003.safetensors",
792
+ "model.visual.model.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00003.safetensors",
793
+ "model.visual.model.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00003.safetensors",
794
+ "model.visual.model.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00003.safetensors",
795
+ "model.visual.model.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00003.safetensors",
796
+ "model.visual.model.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
797
+ "model.visual.model.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
798
+ "model.visual.model.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00003.safetensors",
799
+ "model.visual.model.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00003.safetensors",
800
+ "model.visual.model.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
801
+ "model.visual.model.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
802
+ "model.visual.model.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
803
+ "model.visual.model.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
804
+ "model.visual.model.encoder.layers.7.layer_norm1.bias": "model-00003-of-00003.safetensors",
805
+ "model.visual.model.encoder.layers.7.layer_norm1.weight": "model-00003-of-00003.safetensors",
806
+ "model.visual.model.encoder.layers.7.layer_norm2.bias": "model-00003-of-00003.safetensors",
807
+ "model.visual.model.encoder.layers.7.layer_norm2.weight": "model-00003-of-00003.safetensors",
808
+ "model.visual.model.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00003.safetensors",
809
+ "model.visual.model.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00003.safetensors",
810
+ "model.visual.model.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00003.safetensors",
811
+ "model.visual.model.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00003.safetensors",
812
+ "model.visual.model.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
813
+ "model.visual.model.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
814
+ "model.visual.model.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00003.safetensors",
815
+ "model.visual.model.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00003.safetensors",
816
+ "model.visual.model.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
817
+ "model.visual.model.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
818
+ "model.visual.model.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
819
+ "model.visual.model.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
820
+ "model.visual.model.encoder.layers.8.layer_norm1.bias": "model-00003-of-00003.safetensors",
821
+ "model.visual.model.encoder.layers.8.layer_norm1.weight": "model-00003-of-00003.safetensors",
822
+ "model.visual.model.encoder.layers.8.layer_norm2.bias": "model-00003-of-00003.safetensors",
823
+ "model.visual.model.encoder.layers.8.layer_norm2.weight": "model-00003-of-00003.safetensors",
824
+ "model.visual.model.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00003.safetensors",
825
+ "model.visual.model.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00003.safetensors",
826
+ "model.visual.model.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00003.safetensors",
827
+ "model.visual.model.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00003.safetensors",
828
+ "model.visual.model.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
829
+ "model.visual.model.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
830
+ "model.visual.model.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00003.safetensors",
831
+ "model.visual.model.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00003.safetensors",
832
+ "model.visual.model.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
833
+ "model.visual.model.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
834
+ "model.visual.model.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
835
+ "model.visual.model.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
836
+ "model.visual.model.encoder.layers.9.layer_norm1.bias": "model-00003-of-00003.safetensors",
837
+ "model.visual.model.encoder.layers.9.layer_norm1.weight": "model-00003-of-00003.safetensors",
838
+ "model.visual.model.encoder.layers.9.layer_norm2.bias": "model-00003-of-00003.safetensors",
839
+ "model.visual.model.encoder.layers.9.layer_norm2.weight": "model-00003-of-00003.safetensors",
840
+ "model.visual.model.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00003.safetensors",
841
+ "model.visual.model.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00003.safetensors",
842
+ "model.visual.model.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00003.safetensors",
843
+ "model.visual.model.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00003.safetensors",
844
+ "model.visual.model.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00003.safetensors",
845
+ "model.visual.model.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00003.safetensors",
846
+ "model.visual.model.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00003.safetensors",
847
+ "model.visual.model.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00003.safetensors",
848
+ "model.visual.model.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00003.safetensors",
849
+ "model.visual.model.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00003.safetensors",
850
+ "model.visual.model.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00003.safetensors",
851
+ "model.visual.model.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00003.safetensors",
852
+ "model.visual.model.head.attention.in_proj_bias": "model-00003-of-00003.safetensors",
853
+ "model.visual.model.head.attention.in_proj_weight": "model-00003-of-00003.safetensors",
854
+ "model.visual.model.head.attention.out_proj.bias": "model-00003-of-00003.safetensors",
855
+ "model.visual.model.head.attention.out_proj.weight": "model-00003-of-00003.safetensors",
856
+ "model.visual.model.head.layernorm.bias": "model-00003-of-00003.safetensors",
857
+ "model.visual.model.head.layernorm.weight": "model-00003-of-00003.safetensors",
858
+ "model.visual.model.head.mlp.fc1.bias": "model-00003-of-00003.safetensors",
859
+ "model.visual.model.head.mlp.fc1.weight": "model-00003-of-00003.safetensors",
860
+ "model.visual.model.head.mlp.fc2.bias": "model-00003-of-00003.safetensors",
861
+ "model.visual.model.head.mlp.fc2.weight": "model-00003-of-00003.safetensors",
862
+ "model.visual.model.head.probe": "model-00003-of-00003.safetensors",
863
+ "model.visual.model.post_layernorm.bias": "model-00003-of-00003.safetensors",
864
+ "model.visual.model.post_layernorm.weight": "model-00003-of-00003.safetensors",
865
+ "model.visual.pos_embed": "model-00003-of-00003.safetensors",
866
+ "model.visual.proj": "model-00003-of-00003.safetensors"
867
+ }
868
+ }
modeling_chexagent.py ADDED
@@ -0,0 +1,1141 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 Microsoft and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """ PyTorch Phi model."""
17
+
18
+ import math
19
+ from typing import List, Optional, Tuple, Union
20
+
21
+ import torch
22
+ import torch.nn.functional as F
23
+ import torch.utils.checkpoint
24
+ from torch import nn
25
+ from torch.nn import CrossEntropyLoss
26
+ import transformers
27
+ from transformers.activations import ACT2FN
28
+ from transformers.cache_utils import Cache, DynamicCache
29
+ from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
30
+ from transformers.modeling_outputs import (
31
+ BaseModelOutputWithPast,
32
+ CausalLMOutputWithPast,
33
+ )
34
+ from transformers.modeling_utils import PreTrainedModel
35
+ from transformers.utils import (
36
+ add_start_docstrings,
37
+ add_start_docstrings_to_model_forward,
38
+ is_flash_attn_greater_or_equal_2_10,
39
+ logging,
40
+ )
41
+
42
+ from .configuration_chexagent import CheXagentConfig
43
+ from .modeling_visual import CLIPModel
44
+ from .tokenization_chexagent import CheXagentTokenizer
45
+
46
+ try:
47
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
48
+ from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
49
+ except ImportError:
50
+ pass
51
+
52
+ assert transformers.__version__ == "4.40.0", "Please install a specific HF transformers version: pip install transformers==4.40.0"
53
+
54
+ logger = logging.get_logger(__name__)
55
+
56
+ _CHECKPOINT_FOR_DOC = "microsoft/phi-2"
57
+ _CONFIG_FOR_DOC = "CheXagentConfig"
58
+
59
+ PHI_PRETRAINED_MODEL_ARCHIVE_LIST = [
60
+ "microsoft/phi-2",
61
+ # See all Phi models at https://huggingface.co/models?filter=phi
62
+ ]
63
+
64
+
65
+ # Copied from transformers.models.llama.modeling_llama._get_unpad_data
66
+ def _get_unpad_data(attention_mask):
67
+ seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
68
+ indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
69
+ max_seqlen_in_batch = seqlens_in_batch.max().item()
70
+ cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
71
+ return (
72
+ indices,
73
+ cu_seqlens,
74
+ max_seqlen_in_batch,
75
+ )
76
+
77
+
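A quick illustration (an editor's sketch, not part of the uploaded file) of what `_get_unpad_data` returns for a right-padded batch; it assumes only `torch` and the function defined above:

```python
import torch

# Two sequences of true lengths 3 and 2, right-padded to length 4.
attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])

indices, cu_seqlens, max_seqlen = _get_unpad_data(attention_mask)
print(indices)     # tensor([0, 1, 2, 4, 5]) -- flattened positions of real tokens
print(cu_seqlens)  # tensor([0, 3, 5], dtype=torch.int32) -- cumulative lengths for flash-attn varlen
print(max_seqlen)  # 3
```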
78
+ # Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Phi
79
+ class PhiRotaryEmbedding(nn.Module):
80
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
81
+ super().__init__()
82
+
83
+ self.dim = dim
84
+ self.max_position_embeddings = max_position_embeddings
85
+ self.base = base
86
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
87
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
88
+
89
+ # Build here to make `torch.jit.trace` work.
90
+ self._set_cos_sin_cache(
91
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
92
+ )
93
+
94
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
95
+ self.max_seq_len_cached = seq_len
96
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
97
+
98
+ freqs = torch.outer(t, self.inv_freq)
99
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
100
+ emb = torch.cat((freqs, freqs), dim=-1)
101
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
102
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
103
+
104
+ def forward(self, x, seq_len=None):
105
+ # x: [bs, num_attention_heads, seq_len, head_size]
106
+ if seq_len > self.max_seq_len_cached:
107
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
108
+
109
+ return (
110
+ self.cos_cached[:seq_len].to(dtype=x.dtype),
111
+ self.sin_cached[:seq_len].to(dtype=x.dtype),
112
+ )
113
+
114
+
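A minimal sketch of how the rotary module above is queried (editor's illustration; it assumes only `torch` and the `PhiRotaryEmbedding` class defined above). The module caches cos/sin tables up to `max_position_embeddings` and returns them truncated to the requested sequence length:

```python
import torch

rope = PhiRotaryEmbedding(dim=32, max_position_embeddings=2048)
x = torch.randn(1, 8, 16, 32)   # [batch, heads, seq_len, head_dim]
cos, sin = rope(x, seq_len=16)
print(cos.shape, sin.shape)     # torch.Size([16, 32]) torch.Size([16, 32])
```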
115
+ # Copied from transformers.models.llama.modeling_llama.LlamaLinearScalingRotaryEmbedding with Llama->Phi
116
+ class PhiLinearScalingRotaryEmbedding(PhiRotaryEmbedding):
117
+ """PhiRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
118
+
119
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
120
+ self.scaling_factor = scaling_factor
121
+ super().__init__(dim, max_position_embeddings, base, device)
122
+
123
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
124
+ self.max_seq_len_cached = seq_len
125
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
126
+ t = t / self.scaling_factor
127
+
128
+ freqs = torch.outer(t, self.inv_freq)
129
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
130
+ emb = torch.cat((freqs, freqs), dim=-1)
131
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
132
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
133
+
134
+
135
+ # Copied from transformers.models.llama.modeling_llama.LlamaDynamicNTKScalingRotaryEmbedding with Llama->Phi
136
+ class PhiDynamicNTKScalingRotaryEmbedding(PhiRotaryEmbedding):
137
+ """PhiRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
138
+
139
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
140
+ self.scaling_factor = scaling_factor
141
+ super().__init__(dim, max_position_embeddings, base, device)
142
+
143
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
144
+ self.max_seq_len_cached = seq_len
145
+
146
+ if seq_len > self.max_position_embeddings:
147
+ base = self.base * (
148
+ (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
149
+ ) ** (self.dim / (self.dim - 2))
150
+ inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
151
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
152
+
153
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
154
+
155
+ freqs = torch.outer(t, self.inv_freq)
156
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
157
+ emb = torch.cat((freqs, freqs), dim=-1)
158
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
159
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
160
+
161
+
162
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
163
+ def rotate_half(x):
164
+ """Rotates half the hidden dims of the input."""
165
+ x1 = x[..., : x.shape[-1] // 2]
166
+ x2 = x[..., x.shape[-1] // 2:]
167
+ return torch.cat((-x2, x1), dim=-1)
168
+
169
+
170
+ # Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
171
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
172
+ """Applies Rotary Position Embedding to the query and key tensors.
173
+ Args:
174
+ q (`torch.Tensor`): The query tensor.
175
+ k (`torch.Tensor`): The key tensor.
176
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
177
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
178
+ position_ids (`torch.Tensor`):
179
+ The position indices of the tokens corresponding to the query and key tensors. For example, this can be
180
+ used to pass offset position ids when working with a KV-cache.
181
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
182
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
183
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
184
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
185
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
186
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
187
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
188
+ Returns:
189
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
190
+ """
191
+ cos = cos[position_ids].unsqueeze(unsqueeze_dim)
192
+ sin = sin[position_ids].unsqueeze(unsqueeze_dim)
193
+ q_embed = (q * cos) + (rotate_half(q) * sin)
194
+ k_embed = (k * cos) + (rotate_half(k) * sin)
195
+ return q_embed, k_embed
196
+
197
+
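As an end-to-end sketch of the partial-rotary application used later in `PhiAttention.forward` (editor's illustration; the shapes are hypothetical and it assumes the `PhiRotaryEmbedding` and `apply_rotary_pos_emb` definitions above): only the first `rotary_dim` channels of each head are rotated, and the remainder pass through unchanged.

```python
import torch

bsz, heads, seq_len, head_dim = 2, 4, 10, 32
q = torch.randn(bsz, heads, seq_len, head_dim)
k = torch.randn(bsz, heads, seq_len, head_dim)
position_ids = torch.arange(seq_len).unsqueeze(0).expand(bsz, -1)

rotary_dim = head_dim // 2              # e.g. partial_rotary_factor = 0.5
rope = PhiRotaryEmbedding(rotary_dim)
cos, sin = rope(q, seq_len=seq_len)

q_rot, k_rot = apply_rotary_pos_emb(q[..., :rotary_dim], k[..., :rotary_dim], cos, sin, position_ids)
q = torch.cat((q_rot, q[..., rotary_dim:]), dim=-1)   # shape unchanged: [2, 4, 10, 32]
k = torch.cat((k_rot, k[..., rotary_dim:]), dim=-1)
```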
198
+ # Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Phi
199
+ class PhiMLP(nn.Module):
200
+ def __init__(self, config):
201
+ super().__init__()
202
+ self.config = config
203
+ self.activation_fn = ACT2FN[config.hidden_act]
204
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
205
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
206
+
207
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
208
+ hidden_states = self.fc1(hidden_states)
209
+ hidden_states = self.activation_fn(hidden_states)
210
+ hidden_states = self.fc2(hidden_states)
211
+ return hidden_states
212
+
213
+
214
+ # Copied from transformers.models.llama.modeling_llama.repeat_kv with llama->phi
215
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
216
+ """
217
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
218
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
219
+ """
220
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
221
+ if n_rep == 1:
222
+ return hidden_states
223
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
224
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
225
+
226
+
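A quick check of the equivalence stated in the `repeat_kv` docstring (editor's sketch, assuming only `torch` and the function above):

```python
import torch

batch, kv_heads, seq_len, head_dim = 2, 2, 5, 8
n_rep = 4                                  # num_attention_heads // num_key_value_heads
kv = torch.randn(batch, kv_heads, seq_len, head_dim)

expanded = repeat_kv(kv, n_rep)            # -> [2, 8, 5, 8]
reference = torch.repeat_interleave(kv, repeats=n_rep, dim=1)
assert torch.equal(expanded, reference)
```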
227
+ class PhiAttention(nn.Module):
228
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
229
+
230
+ def __init__(self, config: CheXagentConfig, layer_idx: Optional[int] = None):
231
+ super().__init__()
232
+ self.config = config
233
+ self.layer_idx = layer_idx
234
+ if layer_idx is None:
235
+ logger.warning_once(
236
+ f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will lead "
237
+ "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
238
+ "when creating this class."
239
+ )
240
+
241
+ self.attention_dropout = config.attention_dropout
242
+ self.hidden_size = config.hidden_size
243
+ self.num_heads = config.num_attention_heads
244
+ self.head_dim = self.hidden_size // self.num_heads
245
+ self.num_key_value_heads = config.num_key_value_heads
246
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
247
+ self.max_position_embeddings = config.max_position_embeddings
248
+ self.rope_theta = config.rope_theta
249
+ self.partial_rotary_factor = config.partial_rotary_factor
250
+ self.is_causal = True
251
+
252
+ if (self.head_dim * self.num_heads) != self.hidden_size:
253
+ raise ValueError(
254
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
255
+ f" and `num_heads`: {self.num_heads})."
256
+ )
257
+
258
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=True)
259
+ self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
260
+ self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
261
+ self.dense = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=True)
262
+
263
+ self.qk_layernorm = config.qk_layernorm
264
+ if self.qk_layernorm:
265
+ self.q_layernorm = nn.LayerNorm(
266
+ config.hidden_size // self.num_heads, eps=config.layer_norm_eps, elementwise_affine=True
267
+ )
268
+ self.k_layernorm = nn.LayerNorm(
269
+ config.hidden_size // self.num_heads, eps=config.layer_norm_eps, elementwise_affine=True
270
+ )
271
+
272
+ self._init_rope()
273
+
274
+ def _init_rope(self):
275
+ if self.config.rope_scaling is None:
276
+ self.rotary_emb = PhiRotaryEmbedding(
277
+ int(self.partial_rotary_factor * self.head_dim),
278
+ max_position_embeddings=self.max_position_embeddings,
279
+ base=self.rope_theta,
280
+ )
281
+ else:
282
+ scaling_type = self.config.rope_scaling["type"]
283
+ scaling_factor = self.config.rope_scaling["factor"]
284
+ if scaling_type == "linear":
285
+ self.rotary_emb = PhiLinearScalingRotaryEmbedding(
286
+ int(self.partial_rotary_factor * self.head_dim),
287
+ max_position_embeddings=self.max_position_embeddings,
288
+ scaling_factor=scaling_factor,
289
+ base=self.rope_theta,
290
+ )
291
+ elif scaling_type == "dynamic":
292
+ self.rotary_emb = PhiDynamicNTKScalingRotaryEmbedding(
293
+ int(self.partial_rotary_factor * self.head_dim),
294
+ max_position_embeddings=self.max_position_embeddings,
295
+ scaling_factor=scaling_factor,
296
+ base=self.rope_theta,
297
+ )
298
+ else:
299
+ raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
300
+
301
+ # Phi-2 has an attention overflow issue (with FP16) and requires autocast to be disabled
302
+ @torch.autocast("cpu", enabled=False)
303
+ @torch.autocast("cuda", enabled=False)
304
+ def forward(
305
+ self,
306
+ hidden_states: torch.Tensor,
307
+ attention_mask: Optional[torch.Tensor] = None,
308
+ position_ids: Optional[torch.LongTensor] = None,
309
+ past_key_value: Optional[Cache] = None,
310
+ output_attentions: bool = False,
311
+ use_cache: bool = False,
312
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
313
+ bsz, q_len, _ = hidden_states.size()
314
+
315
+ query_states = self.q_proj(hidden_states)
316
+ key_states = self.k_proj(hidden_states)
317
+ value_states = self.v_proj(hidden_states)
318
+
319
+ if self.qk_layernorm:
320
+ query_states = self.q_layernorm(query_states)
321
+ key_states = self.k_layernorm(key_states)
322
+
323
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
324
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
325
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
326
+
327
+ kv_seq_len = key_states.shape[-2]
328
+ if past_key_value is not None:
329
+ if self.layer_idx is None:
330
+ raise ValueError(
331
+ f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
332
+ "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
333
+ "with a layer index."
334
+ )
335
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
336
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
337
+
338
+ # Partial rotary embedding
339
+ query_rot, query_pass = (
340
+ query_states[..., : self.rotary_emb.dim],
341
+ query_states[..., self.rotary_emb.dim:],
342
+ )
343
+ key_rot, key_pass = (
344
+ key_states[..., : self.rotary_emb.dim],
345
+ key_states[..., self.rotary_emb.dim:],
346
+ )
347
+ # [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor]
348
+ query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids)
349
+
350
+ # [batch_size, seq_length, num_heads, head_dim]
351
+ query_states = torch.cat((query_rot, query_pass), dim=-1)
352
+ key_states = torch.cat((key_rot, key_pass), dim=-1)
353
+
354
+ if past_key_value is not None:
355
+ cache_kwargs = {"sin": sin, "cos": cos, "partial_rotation_size": self.rotary_emb.dim}
356
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
357
+
358
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
359
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
360
+
361
+ # Queries and keys upcast to fp32 is required by Phi-2 to avoid overflow
362
+ attn_weights = torch.matmul(
363
+ query_states.to(torch.float32), key_states.to(torch.float32).transpose(2, 3)
364
+ ) / math.sqrt(self.head_dim)
365
+
366
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
367
+ raise ValueError(
368
+ f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
369
+ f" {attn_weights.size()}"
370
+ )
371
+
372
+ if attention_mask is not None:
373
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
374
+ raise ValueError(
375
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
376
+ )
377
+ attn_weights = attn_weights + attention_mask
378
+
379
+ # upcast attention to fp32
380
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(value_states.dtype)
381
+ attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
382
+
383
+ attn_output = torch.matmul(attn_weights, value_states)
384
+
385
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
386
+ raise ValueError(
387
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
388
+ f" {attn_output.size()}"
389
+ )
390
+
391
+ attn_output = attn_output.transpose(1, 2).contiguous()
392
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
393
+
394
+ attn_output = self.dense(attn_output)
395
+
396
+ if not output_attentions:
397
+ attn_weights = None
398
+
399
+ return attn_output, attn_weights, past_key_value
400
+
401
+
402
+ class PhiFlashAttention2(PhiAttention):
403
+ """
404
+ Phi flash attention module. This module inherits from `PhiAttention`, as the weights of the module stay
405
+ untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
406
+ flash attention and deal with padding tokens in case the input contains any of them.
407
+ """
408
+
409
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
410
+ def __init__(self, *args, **kwargs):
411
+ super().__init__(*args, **kwargs)
412
+
413
+ # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
414
+ # flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
415
+ # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
416
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
417
+
418
+ def forward(
419
+ self,
420
+ hidden_states: torch.Tensor,
421
+ attention_mask: Optional[torch.LongTensor] = None,
422
+ position_ids: Optional[torch.LongTensor] = None,
423
+ past_key_value: Optional[Cache] = None,
424
+ output_attentions: bool = False,
425
+ use_cache: bool = False,
426
+ **kwargs,
427
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
428
+ # PhiFlashAttention2 attention does not support output_attentions
429
+
430
+ output_attentions = False
431
+
432
+ bsz, q_len, _ = hidden_states.size()
433
+
434
+ query_states = self.q_proj(hidden_states)
435
+ key_states = self.k_proj(hidden_states)
436
+ value_states = self.v_proj(hidden_states)
437
+
438
+ if self.qk_layernorm:
439
+ query_states = self.q_layernorm(query_states)
440
+ key_states = self.k_layernorm(key_states)
441
+
442
+ # Flash attention requires the input to have the shape
443
+ # batch_size x seq_length x num_heads x head_dim
444
+ # therefore we just need to keep the original shape
445
+ query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
446
+ key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
447
+ value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
448
+
449
+ kv_seq_len = key_states.shape[-2]
450
+ if past_key_value is not None:
451
+ kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
452
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
453
+
454
+ # Partial rotary embedding
455
+ query_rot, query_pass = (
456
+ query_states[..., : self.rotary_emb.dim],
457
+ query_states[..., self.rotary_emb.dim:],
458
+ )
459
+ key_rot, key_pass = (
460
+ key_states[..., : self.rotary_emb.dim],
461
+ key_states[..., self.rotary_emb.dim:],
462
+ )
463
+ # [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor]
464
+ query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids)
465
+
466
+ # [batch_size, seq_length, num_heads, head_dim]
467
+ query_states = torch.cat((query_rot, query_pass), dim=-1)
468
+ key_states = torch.cat((key_rot, key_pass), dim=-1)
469
+
470
+ if past_key_value is not None:
471
+ cache_kwargs = {"sin": sin, "cos": cos, "partial_rotation_size": self.rotary_emb.dim}
472
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
473
+
474
+ # TODO: These transposes are quite inefficient, but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
475
+ # to be able to avoid many of these transpose/reshape/view.
476
+ query_states = query_states.transpose(1, 2)
477
+ key_states = key_states.transpose(1, 2)
478
+ value_states = value_states.transpose(1, 2)
479
+
480
+ attn_dropout = self.attention_dropout if self.training else 0.0
481
+
482
+ # In PEFT, usually we cast the layer norms in float32 for training stability reasons
483
+ # therefore the input hidden states gets silently casted in float32. Hence, we need
484
+ # cast them back in the correct dtype just to be sure everything works as expected.
485
+ # This might slowdown training & inference so it is recommended to not cast the LayerNorms
486
+ # in fp32.
487
+
488
+ if query_states.dtype == torch.float32:
489
+ if torch.is_autocast_enabled():
490
+ target_dtype = torch.get_autocast_gpu_dtype()
491
+ # Handle the case where the model is quantized
492
+ elif hasattr(self.config, "_pre_quantization_dtype"):
493
+ target_dtype = self.config._pre_quantization_dtype
494
+ else:
495
+ target_dtype = self.q_proj.weight.dtype
496
+
497
+ logger.warning_once(
498
+ f"The input hidden states seems to be silently casted in float32, this might be related to"
499
+ f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
500
+ f" {target_dtype}."
501
+ )
502
+
503
+ query_states = query_states.to(target_dtype)
504
+ key_states = key_states.to(target_dtype)
505
+ value_states = value_states.to(target_dtype)
506
+
507
+ attn_output = self._flash_attention_forward(
508
+ query_states, key_states, value_states, attention_mask, q_len, dropout=attn_dropout, softmax_scale=None
509
+ )
510
+
511
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
512
+ attn_output = self.dense(attn_output)
513
+
514
+ if not output_attentions:
515
+ attn_weights = None
516
+
517
+ return attn_output, attn_weights, past_key_value
518
+
519
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward
520
+ def _flash_attention_forward(
521
+ self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
522
+ ):
523
+ """
524
+ Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
525
+ first unpad the input, then compute the attention scores, and finally pad the attention output back.
526
+ Args:
527
+ query_states (`torch.Tensor`):
528
+ Input query states to be passed to Flash Attention API
529
+ key_states (`torch.Tensor`):
530
+ Input key states to be passed to Flash Attention API
531
+ value_states (`torch.Tensor`):
532
+ Input value states to be passed to Flash Attention API
533
+ attention_mask (`torch.Tensor`):
534
+ The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
535
+ position of padding tokens and 1 for the position of non-padding tokens.
536
+ dropout (`float`, *optional*):
537
+ Attention dropout
538
+ softmax_scale (`float`, *optional*):
539
+ The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
540
+ """
541
+ if not self._flash_attn_uses_top_left_mask:
542
+ causal = self.is_causal
543
+ else:
544
+ # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
545
+ causal = self.is_causal and query_length != 1
546
+
547
+ # Contains at least one padding token in the sequence
548
+ if attention_mask is not None:
549
+ batch_size = query_states.shape[0]
550
+ query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
551
+ query_states, key_states, value_states, attention_mask, query_length
552
+ )
553
+
554
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
555
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
556
+
557
+ attn_output_unpad = flash_attn_varlen_func(
558
+ query_states,
559
+ key_states,
560
+ value_states,
561
+ cu_seqlens_q=cu_seqlens_q,
562
+ cu_seqlens_k=cu_seqlens_k,
563
+ max_seqlen_q=max_seqlen_in_batch_q,
564
+ max_seqlen_k=max_seqlen_in_batch_k,
565
+ dropout_p=dropout,
566
+ softmax_scale=softmax_scale,
567
+ causal=causal,
568
+ )
569
+
570
+ attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
571
+ else:
572
+ attn_output = flash_attn_func(
573
+ query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
574
+ )
575
+
576
+ return attn_output
577
+
578
+ # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input
579
+ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
580
+ indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
581
+ batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
582
+
583
+ key_layer = index_first_axis(
584
+ key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
585
+ )
586
+ value_layer = index_first_axis(
587
+ value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
588
+ )
589
+ if query_length == kv_seq_len:
590
+ query_layer = index_first_axis(
591
+ query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
592
+ )
593
+ cu_seqlens_q = cu_seqlens_k
594
+ max_seqlen_in_batch_q = max_seqlen_in_batch_k
595
+ indices_q = indices_k
596
+ elif query_length == 1:
597
+ max_seqlen_in_batch_q = 1
598
+ cu_seqlens_q = torch.arange(
599
+ batch_size + 1, dtype=torch.int32, device=query_layer.device
600
+ ) # There is a memcpy here, that is very bad.
601
+ indices_q = cu_seqlens_q[:-1]
602
+ query_layer = query_layer.squeeze(1)
603
+ else:
604
+ # The -q_len: slice assumes left padding.
605
+ attention_mask = attention_mask[:, -query_length:]
606
+ query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
607
+
608
+ return (
609
+ query_layer,
610
+ key_layer,
611
+ value_layer,
612
+ indices_q,
613
+ (cu_seqlens_q, cu_seqlens_k),
614
+ (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
615
+ )
616
+
617
+
618
+ PHI_ATTENTION_CLASSES = {
619
+ "eager": PhiAttention,
620
+ "flash_attention_2": PhiFlashAttention2,
621
+ }
622
+
623
+
624
+ class PhiDecoderLayer(nn.Module):
625
+ def __init__(self, config: CheXagentConfig, layer_idx: int):
626
+ super().__init__()
627
+ self.self_attn = PHI_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
628
+ self.mlp = PhiMLP(config)
629
+ self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
630
+ self.resid_dropout = nn.Dropout(config.resid_pdrop)
631
+
632
+ def forward(
633
+ self,
634
+ hidden_states: torch.Tensor,
635
+ attention_mask: Optional[torch.Tensor] = None,
636
+ position_ids: Optional[torch.LongTensor] = None,
637
+ output_attentions: Optional[bool] = False,
638
+ use_cache: Optional[bool] = False,
639
+ past_key_value: Optional[Tuple[torch.Tensor]] = None,
640
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
641
+ """
642
+ Args:
643
+ hidden_states (`torch.FloatTensor`):
644
+ input to the layer of shape `(batch, seq_len, embed_dim)`
645
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
646
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
647
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
648
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range
649
+ `[0, config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
650
+ output_attentions (`bool`, *optional*):
651
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
652
+ returned tensors for more detail.
653
+ use_cache (`bool`, *optional*):
654
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
655
+ (see `past_key_values`).
656
+ past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
657
+ """
658
+
659
+ residual = hidden_states
660
+
661
+ hidden_states = self.input_layernorm(hidden_states)
662
+
663
+ # Self Attention
664
+ attn_outputs, self_attn_weights, present_key_value = self.self_attn(
665
+ hidden_states=hidden_states,
666
+ attention_mask=attention_mask,
667
+ position_ids=position_ids,
668
+ past_key_value=past_key_value,
669
+ output_attentions=output_attentions,
670
+ use_cache=use_cache,
671
+ )
672
+ attn_outputs = self.resid_dropout(attn_outputs)
673
+
674
+ feed_forward_hidden_states = self.resid_dropout(self.mlp(hidden_states))
675
+ hidden_states = attn_outputs + feed_forward_hidden_states + residual
676
+ outputs = (hidden_states,)
677
+
678
+ if output_attentions:
679
+ outputs += (self_attn_weights,)
680
+
681
+ if use_cache:
682
+ outputs += (present_key_value,)
683
+
684
+ return outputs
685
+
686
+
687
+ PHI_START_DOCSTRING = r"""
688
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
689
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
690
+ etc.)
691
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
692
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
693
+ and behavior.
694
+ Parameters:
695
+ config ([`CheXagentConfig`]):
696
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
697
+ load the weights associated with the model, only the configuration. Check out the
698
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
699
+ """
700
+
701
+
702
+ @add_start_docstrings(
703
+ "The bare Phi Model outputting raw hidden-states without any specific head on top.",
704
+ PHI_START_DOCSTRING,
705
+ )
706
+ class PhiPreTrainedModel(PreTrainedModel):
707
+ config_class = CheXagentConfig
708
+ base_model_prefix = "model"
709
+ supports_gradient_checkpointing = True
710
+ _no_split_modules = ["PhiDecoderLayer"]
711
+ _skip_keys_device_placement = "past_key_values"
712
+ _supports_flash_attn_2 = True
713
+ _supports_cache_class = True
714
+
715
+ def _init_weights(self, module):
716
+ std = self.config.initializer_range
717
+ if isinstance(module, nn.Linear):
718
+ module.weight.data.normal_(mean=0.0, std=std)
719
+ if module.bias is not None:
720
+ module.bias.data.zero_()
721
+ elif isinstance(module, nn.Embedding):
722
+ module.weight.data.normal_(mean=0.0, std=std)
723
+ if module.padding_idx is not None:
724
+ module.weight.data[module.padding_idx].zero_()
725
+
726
+
727
+ PHI_INPUTS_DOCSTRING = r"""
728
+ Args:
729
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
730
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
731
+ it.
732
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
733
+ [`PreTrainedTokenizer.__call__`] for details.
734
+ [What are input IDs?](../glossary#input-ids)
735
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
736
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
737
+ - 1 for tokens that are **not masked**,
738
+ - 0 for tokens that are **masked**.
739
+ [What are attention masks?](../glossary#attention-mask)
740
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
741
+ [`PreTrainedTokenizer.__call__`] for details.
742
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
743
+ `past_key_values`).
744
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
745
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
746
+ information on the default strategy.
747
+ - 1 indicates the head is **not masked**,
748
+ - 0 indicates the head is **masked**.
749
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
750
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
751
+ config.n_positions - 1]`.
752
+ [What are position IDs?](../glossary#position-ids)
753
+ past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
754
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
755
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
756
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
757
+ Two formats are allowed:
758
+ - a [`~cache_utils.Cache`] instance;
759
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
760
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
761
+ cache format.
762
+ The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
763
+ legacy cache format will be returned.
764
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
765
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
766
+ of shape `(batch_size, sequence_length)`.
767
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
768
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
769
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
770
+ model's internal embedding lookup matrix.
771
+ use_cache (`bool`, *optional*):
772
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
773
+ `past_key_values`).
774
+ output_attentions (`bool`, *optional*):
775
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
776
+ tensors for more detail.
777
+ output_hidden_states (`bool`, *optional*):
778
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
779
+ more detail.
780
+ return_dict (`bool`, *optional*):
781
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
782
+ """
783
+
784
+
785
+ class CheXagentModel(PhiPreTrainedModel):
786
+ """
787
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`PhiDecoderLayer`]
788
+ Args:
789
+ config: CheXagentConfig
790
+ """
791
+
792
+ def __init__(self, config: CheXagentConfig):
793
+ super().__init__(config)
794
+ self.padding_idx = config.pad_token_id
795
+ self.vocab_size = config.vocab_size
796
+
797
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
798
+ self.embed_dropout = nn.Dropout(config.embd_pdrop)
799
+ self.layers = nn.ModuleList(
800
+ [PhiDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
801
+ )
802
+ self.final_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
803
+ self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
804
+
805
+ self.gradient_checkpointing = False
806
+
807
+ # Initialize weights and apply final processing
808
+ self.post_init()
809
+
810
+ # IMAGE
811
+ self.tokenizer = CheXagentTokenizer.from_pretrained(config.name_or_path)
812
+ # self.visual = VisionEnsembleModel(**config.visual)
813
+ self.visual = CLIPModel(**config.visual)
814
+ # self.visual = DINOv2Model(**config.visual)
815
+
816
+ def get_input_embeddings(self):
817
+ return self.embed_tokens
818
+
819
+ def set_input_embeddings(self, value):
820
+ self.embed_tokens = value
821
+
822
+ @add_start_docstrings_to_model_forward(PHI_INPUTS_DOCSTRING)
823
+ def forward(
824
+ self,
825
+ input_ids: torch.LongTensor = None,
826
+ attention_mask: Optional[torch.Tensor] = None,
827
+ position_ids: Optional[torch.LongTensor] = None,
828
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
829
+ inputs_embeds: Optional[torch.FloatTensor] = None,
830
+ use_cache: Optional[bool] = None,
831
+ output_attentions: Optional[bool] = None,
832
+ output_hidden_states: Optional[bool] = None,
833
+ return_dict: Optional[bool] = None,
834
+ ) -> Union[Tuple, BaseModelOutputWithPast]:
835
+ # IMAGE: encode images
836
+ if past_key_values is None and torch.any(input_ids == self.tokenizer.img_start_id):
837
+ bos_pos = torch.where(input_ids == self.tokenizer.img_start_id)
838
+ eos_pos = torch.where(input_ids == self.tokenizer.img_end_id)
839
+ assert (bos_pos[0] == eos_pos[0]).all()
840
+ img_pos = torch.stack((bos_pos[0], bos_pos[1], eos_pos[1]), dim=1)
841
+ image_paths = []
842
+ for i, a, b in img_pos:
843
+ image = input_ids[i][a + 1: b - 1].tolist()
844
+ image = image[: image.index(self.tokenizer.img_pad_id)]
845
+ image_paths.append(self.tokenizer.decode(image))
846
+ images = self.visual.encode(image_paths, training=self.training)
847
+ assert images.shape[0] == len(images)
848
+ fake_images = None
849
+ elif self.training:
850
+ fake_images = torch.zeros(1, 3, 512, 512).to(
851
+ dtype=next(self.visual.parameters()).dtype, device=next(self.visual.parameters()).device)
852
+ images = self.visual(fake_images)
853
+ else:
854
+ fake_images = None
855
+ images = None
856
+
857
+ # set constants
858
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
859
+ output_hidden_states = (
860
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
861
+ )
862
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
863
+
864
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
865
+
866
+ # IMAGE: retrieve input_ids and inputs_embeds
867
+ if input_ids is not None and inputs_embeds is not None:
868
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
869
+ elif input_ids is not None:
870
+ input_shape = input_ids.size()
871
+ batch_size, seq_length = input_ids.shape[:2]
872
+ elif inputs_embeds is not None:
873
+ input_shape = inputs_embeds.size()[:-1]
874
+ batch_size, seq_length = inputs_embeds.shape[:2]
875
+ else:
876
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
877
+
878
+ past_key_values_length = 0
879
+
880
+ if self.gradient_checkpointing and self.training:
881
+ if use_cache:
882
+ logger.warning_once(
883
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
884
+ )
885
+ use_cache = False
886
+
887
+ if use_cache:
888
+ use_legacy_cache = not isinstance(past_key_values, Cache)
889
+ if use_legacy_cache:
890
+ past_key_values = DynamicCache.from_legacy_cache(past_key_values)
891
+ past_key_values_length = past_key_values.get_usable_length(seq_length)
892
+
893
+ if position_ids is None:
894
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
895
+ position_ids = torch.arange(
896
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
897
+ )
898
+ position_ids = position_ids.unsqueeze(0)
899
+
900
+ if inputs_embeds is None:
901
+ inputs_embeds = self.embed_tokens(input_ids)
902
+
903
+ inputs_embeds = self.embed_dropout(inputs_embeds)
904
+
905
+ # Attention mask.
906
+ if self._use_flash_attention_2:
907
+ # 2d mask is passed through the layers
908
+ attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
909
+ else:
910
+ # 4d mask is passed through the layers
911
+ attention_mask = _prepare_4d_causal_attention_mask(
912
+ attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
913
+ )
914
+
915
+ # IMAGE: embed positions
916
+ hidden_states = inputs_embeds.clone()
917
+ if fake_images is not None:
918
+ hidden_states = hidden_states + images.mean() * 0
919
+ elif images is not None:
920
+ for idx, (i, a, b) in enumerate(img_pos):
921
+ hidden_states[i][a + 1: b] = images[idx]
922
+ output_shape = input_shape + (hidden_states.size(-1),)
923
+
924
+ # decoder layers
925
+ all_hidden_states = () if output_hidden_states else None
926
+ all_self_attns = () if output_attentions else None
927
+ next_decoder_cache = None
928
+
929
+ for decoder_layer in self.layers:
930
+ if output_hidden_states:
931
+ all_hidden_states += (hidden_states,)
932
+
933
+ if self.gradient_checkpointing and self.training:
934
+ layer_outputs = self._gradient_checkpointing_func(
935
+ decoder_layer.__call__,
936
+ hidden_states,
937
+ attention_mask,
938
+ position_ids,
939
+ past_key_values,
940
+ output_attentions,
941
+ )
942
+ else:
943
+ layer_outputs = decoder_layer(
944
+ hidden_states,
945
+ attention_mask=attention_mask,
946
+ position_ids=position_ids,
947
+ past_key_value=past_key_values,
948
+ output_attentions=output_attentions,
949
+ use_cache=use_cache,
950
+ )
951
+
952
+ hidden_states = layer_outputs[0]
953
+
954
+ if use_cache:
955
+ next_decoder_cache = layer_outputs[2 if output_attentions else 1]
956
+
957
+ if output_attentions:
958
+ all_self_attns += (layer_outputs[1],)
959
+
960
+ hidden_states = self.final_layernorm(hidden_states)
961
+
962
+ # add hidden states from the last decoder layer
963
+ if output_hidden_states:
964
+ all_hidden_states += (hidden_states,)
965
+
966
+ next_cache = None
967
+ if use_cache:
968
+ next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
969
+ if not return_dict:
970
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
971
+ return BaseModelOutputWithPast(
972
+ last_hidden_state=hidden_states,
973
+ past_key_values=next_cache,
974
+ hidden_states=all_hidden_states,
975
+ attentions=all_self_attns,
976
+ )
977
+
978
+
979
+ class CheXagentForCausalLM(PhiPreTrainedModel):
980
+ _tied_weights_keys = ["lm_head.weight"]
981
+
982
+ def __init__(self, config):
983
+ super().__init__(config)
984
+ self.model = CheXagentModel(config)
985
+ self.vocab_size = config.vocab_size
986
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=True)
987
+
988
+ # Initialize weights and apply final processing
989
+ self.post_init()
990
+
991
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_input_embeddings
992
+ def get_input_embeddings(self):
993
+ return self.model.embed_tokens
994
+
995
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_input_embeddings
996
+ def set_input_embeddings(self, value):
997
+ self.model.embed_tokens = value
998
+
999
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_output_embeddings
1000
+ def get_output_embeddings(self):
1001
+ return self.lm_head
1002
+
1003
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_output_embeddings
1004
+ def set_output_embeddings(self, new_embeddings):
1005
+ self.lm_head = new_embeddings
1006
+
1007
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_decoder
1008
+ def set_decoder(self, decoder):
1009
+ self.model = decoder
1010
+
1011
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_decoder
1012
+ def get_decoder(self):
1013
+ return self.model
1014
+
1015
+ def forward(
1016
+ self,
1017
+ input_ids: torch.LongTensor = None,
1018
+ attention_mask: Optional[torch.Tensor] = None,
1019
+ position_ids: Optional[torch.LongTensor] = None,
1020
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
1021
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1022
+ labels: Optional[torch.LongTensor] = None,
1023
+ use_cache: Optional[bool] = None,
1024
+ output_attentions: Optional[bool] = None,
1025
+ output_hidden_states: Optional[bool] = None,
1026
+ return_dict: Optional[bool] = None,
1027
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
1028
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1029
+ output_hidden_states = (
1030
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1031
+ )
1032
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1033
+
1034
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1035
+ outputs = self.model(
1036
+ input_ids=input_ids,
1037
+ attention_mask=attention_mask,
1038
+ position_ids=position_ids,
1039
+ past_key_values=past_key_values,
1040
+ inputs_embeds=inputs_embeds,
1041
+ use_cache=use_cache,
1042
+ output_attentions=output_attentions,
1043
+ output_hidden_states=output_hidden_states,
1044
+ return_dict=return_dict,
1045
+ )
1046
+
1047
+ hidden_states = outputs[0]
1048
+ logits = self.lm_head(hidden_states)
1049
+ logits = logits.float()
1050
+
1051
+ loss = None
1052
+ if labels is not None:
1053
+ # Shift so that tokens < n predict n
1054
+ shift_logits = logits[..., :-1, :].contiguous()
1055
+ shift_labels = labels[..., 1:].contiguous()
1056
+ # Flatten the tokens
1057
+ loss_fct = CrossEntropyLoss()
1058
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
1059
+ shift_labels = shift_labels.view(-1)
1060
+ # Enable model parallelism
1061
+ shift_labels = shift_labels.to(shift_logits.device)
1062
+ loss = loss_fct(shift_logits, shift_labels)
1063
+
1064
+ if not return_dict:
1065
+ output = (logits,) + outputs[1:]
1066
+ return (loss,) + output if loss is not None else output
1067
+
1068
+ return CausalLMOutputWithPast(
1069
+ loss=loss,
1070
+ logits=logits,
1071
+ past_key_values=outputs.past_key_values,
1072
+ hidden_states=outputs.hidden_states,
1073
+ attentions=outputs.attentions,
1074
+ )
1075
+
1076
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.prepare_inputs_for_generation
1077
+ def prepare_inputs_for_generation(
1078
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
1079
+ ):
1080
+ if past_key_values is not None:
1081
+ if isinstance(past_key_values, Cache):
1082
+ cache_length = past_key_values.get_seq_length()
1083
+ past_length = past_key_values.seen_tokens
1084
+ max_cache_length = past_key_values.get_max_length()
1085
+ else:
1086
+ cache_length = past_length = past_key_values[0][0].shape[2]
1087
+ max_cache_length = None
1088
+
1089
+ # Keep only the unprocessed tokens:
1090
+ # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
1091
+ # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
1092
+ # input)
1093
+ if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
1094
+ input_ids = input_ids[:, -(attention_mask.shape[1] - past_length):]
1095
+ # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
1096
+ # input_ids based on the past_length.
1097
+ elif past_length < input_ids.shape[1]:
1098
+ input_ids = input_ids[:, past_length:]
1099
+ # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
1100
+
1101
+ # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
1102
+ if (
1103
+ max_cache_length is not None
1104
+ and attention_mask is not None
1105
+ and cache_length + input_ids.shape[1] > max_cache_length
1106
+ ):
1107
+ attention_mask = attention_mask[:, -max_cache_length:]
1108
+
1109
+ position_ids = kwargs.get("position_ids", None)
1110
+ if attention_mask is not None and position_ids is None:
1111
+ # create position_ids on the fly for batch generation
1112
+ position_ids = attention_mask.long().cumsum(-1) - 1
1113
+ position_ids.masked_fill_(attention_mask == 0, 1)
1114
+ if past_key_values:
1115
+ position_ids = position_ids[:, -input_ids.shape[1]:]
1116
+
1117
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1118
+ if inputs_embeds is not None and past_key_values is None:
1119
+ model_inputs = {"inputs_embeds": inputs_embeds}
1120
+ else:
1121
+ model_inputs = {"input_ids": input_ids}
1122
+
1123
+ model_inputs.update(
1124
+ {
1125
+ "position_ids": position_ids,
1126
+ "past_key_values": past_key_values,
1127
+ "use_cache": kwargs.get("use_cache"),
1128
+ "attention_mask": attention_mask,
1129
+ }
1130
+ )
1131
+ return model_inputs
1132
+
1133
+ @staticmethod
1134
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM._reorder_cache
1135
+ def _reorder_cache(past_key_values, beam_idx):
1136
+ reordered_past = ()
1137
+ for layer_past in past_key_values:
1138
+ reordered_past += (
1139
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
1140
+ )
1141
+ return reordered_past
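The image pathway above is easy to lose in the diff: `CheXagentTokenizer` writes the literal image path between `<|img|>` and `<|/img|>` tokens (padded with `<|imgpad|>`), and `CheXagentModel.forward` finds those spans, decodes the path, runs it through `self.visual.encode`, and overwrites the placeholder positions in `hidden_states` with the resulting visual embeddings. A minimal, self-contained sketch of the span-detection step is below; the token ids are placeholders chosen for illustration, not the real vocabulary ids.

```python
# Sketch of the <|img|> ... <|/img|> span detection used in CheXagentModel.forward.
# IMG_START / IMG_END / IMG_PAD are placeholder ids; the model reads the real ones
# from CheXagentTokenizer (img_start_id, img_end_id, img_pad_id).
import torch

IMG_START, IMG_END, IMG_PAD = 50296, 50297, 50298

# one sequence: [text, <|img|>, path-token, path-token, <|imgpad|>, <|imgpad|>, <|/img|>, text]
input_ids = torch.tensor([[11, IMG_START, 101, 102, IMG_PAD, IMG_PAD, IMG_END, 12]])

bos_pos = torch.where(input_ids == IMG_START)
eos_pos = torch.where(input_ids == IMG_END)
assert (bos_pos[0] == eos_pos[0]).all()              # start/end tags must pair up per sample
img_pos = torch.stack((bos_pos[0], bos_pos[1], eos_pos[1]), dim=1)

for i, a, b in img_pos:
    span = input_ids[i][a + 1: b - 1].tolist()       # tokens between the tags
    path_tokens = span[: span.index(IMG_PAD)]        # drop the <|imgpad|> filler
    print(int(i), path_tokens)                       # -> 0 [101, 102]; tokenizer.decode() yields the path
```

In the real forward pass the decoded paths go through `CLIPModel.encode`, and `hidden_states[i][a + 1: b]` is replaced by the returned image features, so the language model sees image embeddings exactly where the padded span sat in the prompt.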
modeling_visual.py ADDED
@@ -0,0 +1,248 @@
1
+ import io
2
+ import math
3
+ import re
4
+ from functools import partial
5
+ from typing import List
6
+
7
+ import albumentations as A
8
+ import cv2
9
+ import numpy as np
10
+ import pyarrow as pa
11
+ import requests
12
+ import torch
13
+ import transformers
14
+ from PIL import Image
15
+ from albumentations.pytorch import ToTensorV2
16
+ from einops import rearrange
17
+ from torch import nn
18
+ from torch.nn import functional as F
19
+ from torch.nn.init import trunc_normal_
20
+ from torchvision import transforms
21
+ from torchvision.transforms import InterpolationMode
22
+ from transformers import AutoModel, AutoProcessor
23
+ from transformers.activations import ACT2FN
24
+
25
+ assert transformers.__version__ == "4.40.0", "Please install a specific HF transformers version: pip install transformers==4.40.0"
26
+
27
+
28
+ def get_abs_pos(abs_pos, tgt_size):
29
+ # abs_pos: L, C
30
+ # tgt_size: M
31
+ # return: M, C
32
+ src_size = int(math.sqrt(abs_pos.size(0)))
33
+ tgt_size = int(math.sqrt(tgt_size))
34
+ dtype = abs_pos.dtype
35
+
36
+ if src_size != tgt_size:
37
+ return F.interpolate(
38
+ abs_pos.float().reshape(1, src_size, src_size, -1).permute(0, 3, 1, 2),
39
+ size=(tgt_size, tgt_size),
40
+ mode="bicubic",
41
+ align_corners=False,
42
+ ).permute(0, 2, 3, 1).flatten(0, 2).to(dtype=dtype)
43
+ else:
44
+ return abs_pos
45
+
46
+
47
+ def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False):
48
+ """
49
+ grid_size: int of the grid height and width
50
+ return:
51
+ pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token)
52
+ """
53
+ grid_h = np.arange(grid_size, dtype=np.float32)
54
+ grid_w = np.arange(grid_size, dtype=np.float32)
55
+ grid = np.meshgrid(grid_w, grid_h) # here w goes first
56
+ grid = np.stack(grid, axis=0)
57
+
58
+ grid = grid.reshape([2, 1, grid_size, grid_size])
59
+ pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
60
+ if cls_token:
61
+ pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
62
+ return pos_embed
63
+
64
+
65
+ def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
66
+ assert embed_dim % 2 == 0
67
+
68
+ # use half of dimensions to encode grid_h
69
+ emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
70
+ emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
71
+
72
+ emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
73
+ return emb
74
+
75
+
76
+ def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
77
+ """
78
+ embed_dim: output dimension for each position
79
+ pos: a list of positions to be encoded: size (M,)
80
+ out: (M, D)
81
+ """
82
+ assert embed_dim % 2 == 0
83
+ omega = np.arange(embed_dim // 2, dtype=np.float32)
84
+ omega /= embed_dim / 2.
85
+ omega = 1. / 10000 ** omega # (D/2,)
86
+
87
+ pos = pos.reshape(-1) # (M,)
88
+ out = np.einsum('m,d->md', pos, omega) # (M, D/2), outer product
89
+
90
+ emb_sin = np.sin(out) # (M, D/2)
91
+ emb_cos = np.cos(out) # (M, D/2)
92
+
93
+ emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
94
+ return emb
95
+
96
+
97
+ class Resampler(nn.Module):
98
+ """
99
+ A 2D perceiver-resampler network with a single cross-attention layer over
+ (grid_size**2) learnable queries and a fixed 2D sin-cos positional embedding.
101
+ Outputs:
102
+ A tensor with the shape of (grid_size**2, embed_dim)
103
+ """
104
+
105
+ def __init__(
106
+ self,
107
+ grid_size,
108
+ embed_dim,
109
+ num_heads,
110
+ kv_dim=None,
111
+ norm_layer=nn.LayerNorm
112
+ ):
113
+ super().__init__()
114
+ self.num_queries = grid_size ** 2
115
+ self.embed_dim = embed_dim
116
+ self.num_heads = num_heads
117
+
118
+ self.pos_embed = nn.Parameter(
119
+ torch.from_numpy(get_2d_sincos_pos_embed(embed_dim, grid_size)).float()
120
+ ).requires_grad_(False)
121
+
122
+ self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim))
123
+ # trunc_normal_(self.query, std=.02)
124
+
125
+ if kv_dim is not None and kv_dim != embed_dim:
126
+ self.kv_proj = nn.Linear(kv_dim, embed_dim, bias=False)
127
+ else:
128
+ self.kv_proj = nn.Identity()
129
+
130
+ self.attn = nn.MultiheadAttention(embed_dim, num_heads)
131
+ self.ln_q = norm_layer(embed_dim)
132
+ self.ln_kv = norm_layer(embed_dim)
133
+ # self.apply(self._init_weights)
134
+
135
+ def _init_weights(self, m):
136
+ if isinstance(m, nn.Linear):
137
+ trunc_normal_(m.weight, std=.02)
138
+ if isinstance(m, nn.Linear) and m.bias is not None:
139
+ nn.init.constant_(m.bias, 0)
140
+ elif isinstance(m, nn.LayerNorm):
141
+ nn.init.constant_(m.bias, 0)
142
+ nn.init.constant_(m.weight, 1.0)
143
+
144
+ def forward(self, x, attn_mask=None):
145
+
146
+ pos_embed = get_abs_pos(self.pos_embed, x.size(1))
147
+
148
+ x = self.kv_proj(x)
149
+ x = self.ln_kv(x).permute(1, 0, 2)
150
+
151
+ N = x.shape[1]
152
+ q = self.ln_q(self.query)
153
+ out = self.attn(
154
+ self._repeat(q, N) + self.pos_embed.unsqueeze(1),
155
+ x + pos_embed.unsqueeze(1),
156
+ x,
157
+ attn_mask=attn_mask
158
+ )[0]
159
+ return out.permute(1, 0, 2)
160
+
161
+ def _repeat(self, query, N: int):
162
+ return query.unsqueeze(1).repeat(1, N, 1)
163
+
164
+
165
+ class CLIPModel(nn.Module):
166
+
167
+ def __init__(
168
+ self,
169
+ image_size: int,
170
+ n_queries: int = 256,
171
+ output_dim: int = 512,
172
+ vision_model_name_or_path: str = "StanfordAIMI/XraySigLIP__vit-l-16-siglip-384__webli",
173
+ **kwargs
174
+ ):
175
+ super().__init__()
176
+ # load model and processor
177
+ self.model = AutoModel.from_pretrained(vision_model_name_or_path).vision_model
178
+ self.processor = AutoProcessor.from_pretrained(vision_model_name_or_path).image_processor
179
+
180
+ # set constants
181
+ self.image_height, self.image_width = self.image_size = (image_size, image_size)
182
+ width = self.model.config.hidden_size
183
+ patch_height, patch_width = self.model.embeddings.patch_embedding.kernel_size
184
+ self.grid_size = (self.image_height // patch_height, self.image_width // patch_width)
185
+ self.output_dim = output_dim
186
+
187
+ # Transforms
188
+ self.mean = self.processor.image_mean
189
+ self.std = self.processor.image_std
190
+
191
+ self.image_transform = transforms.Compose([
192
+ transforms.Resize(
193
+ (image_size, image_size),
194
+ interpolation=InterpolationMode.BICUBIC
195
+ ),
196
+ transforms.ToTensor(),
197
+ transforms.Normalize(mean=self.mean, std=self.std),
198
+ ])
199
+
200
+ # MLP
201
+ self.pos_embed = nn.Parameter(
202
+ torch.from_numpy(get_2d_sincos_pos_embed(width, self.grid_size[0])).float()
203
+ ).requires_grad_(False)
204
+ self.attn_pool = nn.Sequential(
205
+ nn.Linear(width, output_dim * 4, bias=True),
206
+ ACT2FN["gelu"],
207
+ nn.Linear(output_dim * 4, output_dim, bias=True)
208
+ )
209
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
210
+ self.ln_post = norm_layer(output_dim)
211
+ self.proj = nn.Parameter((output_dim ** -0.5) * torch.randn(output_dim, output_dim), requires_grad=True)
212
+
213
+ def forward_resampler(self, x):
214
+ pos_embed = get_abs_pos(self.pos_embed, x.size(1))
215
+ x = x + pos_embed.unsqueeze(0)
216
+ x = self.attn_pool(x)
217
+ x = self.ln_post(x)
218
+ x = x @ self.proj
219
+ return x
220
+
221
+ def forward(self, x: torch.Tensor):
222
+ # get feature
223
+ x = self.model(x, output_hidden_states=True).hidden_states[-1]
224
+
225
+ # resampler
226
+ x = self.forward_resampler(x)
227
+ return x
228
+
229
+ def load_image(self, image_path, training):
230
+ if image_path.startswith("http://") or image_path.startswith("https://"):
231
+ image = Image.open(requests.get(image_path, stream=True).raw)
232
+ else:
233
+ image = Image.open(image_path)
234
+
235
+ image = image.convert("RGB")
236
+
237
+ image_tensor = self.image_transform(image)
238
+ return image_tensor
239
+
240
+ def encode(self, image_paths: List[str], training):
241
+ images = []
242
+ for image_path in image_paths:
243
+ image = self.load_image(image_path, training)
244
+ images.append(image)
245
+ images = torch.stack(images, dim=0)
246
+ images = images.to(dtype=next(self.parameters()).dtype, device=next(self.parameters()).device)
247
+ outputs = self.forward(images)
248
+ return outputs
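The only non-obvious math in `modeling_visual.py` is the fixed 2D sin-cos positional embedding that `CLIPModel.forward_resampler` adds before pooling. A quick shape check is sketched below; it assumes the repository files are importable from the working directory (an assumption for illustration, not part of the upload).

```python
# Shape sanity check for get_2d_sincos_pos_embed from modeling_visual.py (illustration only).
import numpy as np
from modeling_visual import get_2d_sincos_pos_embed  # assumes a local clone on the Python path

embed_dim, grid_size = 1024, 24   # e.g. a ViT hidden size of 1024 and a 384px image with 16px patches
pos_embed = get_2d_sincos_pos_embed(embed_dim, grid_size)
assert pos_embed.shape == (grid_size * grid_size, embed_dim)      # one row per patch position

with_cls = get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=True)
assert with_cls.shape == (1 + grid_size * grid_size, embed_dim)   # extra leading row for a [CLS] slot
print(np.allclose(with_cls[0], 0.0))                              # True: the [CLS] row carries no positional signal
```

In `forward_resampler`, this table (resized by `get_abs_pos` when the patch grid changes) is added to the last hidden states of the SigLIP vision tower before the MLP pool and the final `x @ self.proj` projection.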
special_tokens_map.json ADDED
@@ -0,0 +1,39 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<|coord_9|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ }
10
+ ],
11
+ "bos_token": {
12
+ "content": "<|endoftext|>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "eos_token": {
19
+ "content": "<|endoftext|>",
20
+ "lstrip": false,
21
+ "normalized": false,
22
+ "rstrip": false,
23
+ "single_word": false
24
+ },
25
+ "pad_token": {
26
+ "content": "<|endoftext|>",
27
+ "lstrip": false,
28
+ "normalized": false,
29
+ "rstrip": false,
30
+ "single_word": false
31
+ },
32
+ "unk_token": {
33
+ "content": "<|endoftext|>",
34
+ "lstrip": false,
35
+ "normalized": false,
36
+ "rstrip": false,
37
+ "single_word": false
38
+ }
39
+ }
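A small check of how this map resolves once the custom tokenizer (next file) is loaded is sketched below. The `"./"` path is a placeholder for a local clone of this repository, and `trust_remote_code=True` is needed because `CheXagentTokenizer` lives in the repo rather than in `transformers`.

```python
# Sketch (illustration only): inspecting the special tokens declared above.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./", trust_remote_code=True)  # placeholder path to a local clone

# bos / eos / pad / unk all map to the single <|endoftext|> token
print(tokenizer.bos_token, tokenizer.eos_token, tokenizer.pad_token, tokenizer.unk_token)

# the image and coordinate tags registered in CheXagentTokenizer.__init__ are single special tokens,
# so they are never split by BPE and can be matched by id (see img_start_id / img_end_id / img_pad_id)
for tag in ["<|img|>", "<|/img|>", "<|imgpad|>", "<|coord_9|>"]:
    print(tag, tokenizer.convert_tokens_to_ids(tag))
```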
tokenization_chexagent.py ADDED
@@ -0,0 +1,675 @@
1
+ import json
2
+ from functools import lru_cache
3
+ from typing import TYPE_CHECKING
4
+
5
+ import regex as re
6
+ from transformers.tokenization_utils_base import TextInput
7
+ from transformers.utils import is_tf_available, is_torch_available, to_py_obj
8
+
9
+ if TYPE_CHECKING:
10
+ if is_torch_available():
11
+ import torch
12
+ if is_tf_available():
13
+ import tensorflow as tf
14
+
15
+ import os
16
+ import random
17
+ from typing import Dict, List, Tuple, Union, Any, Callable, Optional
18
+
19
+ import matplotlib as mpl
20
+ import matplotlib.colors as mcolors
21
+ import matplotlib.colors as mplc
22
+ import matplotlib.figure as mplfigure
23
+ import numpy as np
24
+ import requests
25
+ import torch
26
+ from PIL import Image
27
+ from matplotlib.backends.backend_agg import FigureCanvasAgg
28
+ from transformers import PreTrainedTokenizer, AddedToken
29
+ from transformers.utils import logging
30
+
31
+ logger = logging.get_logger(__name__)
32
+
33
+ VOCAB_FILES_NAMES = {
34
+ "vocab_file": "vocab.json",
35
+ "merges_file": "merges.txt",
36
+ }
37
+
38
+ PRETRAINED_VOCAB_FILES_MAP = {
39
+ "vocab_file": {
40
+ "Salesforce/codegen-350M-mono": "https://huggingface.co/Salesforce/codegen-350M-mono/resolve/main/vocab.json",
41
+ },
42
+ "merges_file": {
43
+ "Salesforce/codegen-350M-mono": "https://huggingface.co/Salesforce/codegen-350M-mono/resolve/main/merges.txt",
44
+ },
45
+ }
46
+
47
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
48
+ "Salesforce/codegen-350M-mono": 2048,
49
+ }
50
+
51
+ IMG_TOKEN_SPAN = 1024
52
+
53
+ DEFAULT_CHAT_TEMPLATE = "{% for message in messages %}\n{% if message['from'] == 'human' %}\n{{ '<|user|>\n' + message['value'] + eos_token }}\n{% elif message['from'] == 'system' %}\n{{ '<|system|>\n' + message['value'] + eos_token }}\n{% elif message['from'] == 'gpt' %}\n{{ '<|assistant|>\n' + message['value'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}"
54
+
55
+
56
+ @lru_cache()
57
+ def bytes_to_unicode():
58
+ """
59
+ Returns a mapping from utf-8 bytes to unicode strings. We specifically avoid mapping to whitespace/control
60
+ characters the bpe code barfs on.
61
+
62
+ The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
63
+ if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
64
+ decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
65
+ tables between utf-8 bytes and unicode strings.
66
+ """
67
+ bs = (
68
+ list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(
69
+ range(ord("®"), ord("ÿ") + 1))
70
+ )
71
+ cs = bs[:]
72
+ n = 0
73
+ for b in range(2 ** 8):
74
+ if b not in bs:
75
+ bs.append(b)
76
+ cs.append(2 ** 8 + n)
77
+ n += 1
78
+ cs = [chr(n) for n in cs]
79
+ return dict(zip(bs, cs))
80
+
81
+
82
+ def get_pairs(word):
83
+ """
84
+ Return set of symbol pairs in a word.
85
+
86
+ Word is represented as tuple of symbols (symbols being variable-length strings).
87
+ """
88
+ pairs = set()
89
+ prev_char = word[0]
90
+ for char in word[1:]:
91
+ pairs.add((prev_char, char))
92
+ prev_char = char
93
+ return pairs
94
+
95
+
96
+ def _list_find(
97
+ input_list: List[Any],
98
+ candidates: Tuple[Any],
99
+ start: int = 0,
100
+ ):
101
+ for i in range(start, len(input_list)):
102
+ if input_list[i] in candidates:
103
+ return i
104
+ return -1
105
+
106
+
107
+ def _replace_closed_tag(
108
+ input_tokens: List[Any],
109
+ start_tags: Union[Any, Tuple[Any]],
110
+ end_tags: Union[Any, Tuple[Any]],
111
+ inclusive_replace_func: Callable,
112
+ exclusive_replace_func: Callable = lambda x: x,
113
+ ):
114
+ if isinstance(start_tags, (str, int)):
115
+ start_tags = (start_tags,)
116
+ if isinstance(end_tags, (str, int)):
117
+ end_tags = (end_tags,)
118
+ assert len(start_tags) == len(end_tags)
119
+
120
+ output_tokens = []
121
+ end = 0
122
+ while True:
123
+ start = _list_find(input_tokens, start_tags, end)
124
+ if start == -1:
125
+ break
126
+ output_tokens.extend(exclusive_replace_func(input_tokens[end: start]))
127
+ tag_idx = start_tags.index(input_tokens[start])
128
+ end = _list_find(input_tokens, (end_tags[tag_idx],), start)
129
+ if end == -1:
130
+ raise ValueError("Unclosed image token")
131
+ output_tokens.extend(inclusive_replace_func(input_tokens[start: end + 1]))
132
+ end += 1
133
+ output_tokens.extend(exclusive_replace_func(input_tokens[end:]))
134
+ return output_tokens
135
+
136
+
137
+ class CheXagentTokenizer(PreTrainedTokenizer):
138
+ vocab_files_names = VOCAB_FILES_NAMES
139
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
140
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
141
+ model_input_names = ["input_ids", "attention_mask"]
142
+
143
+ def __init__(
144
+ self,
145
+ vocab_file,
146
+ merges_file,
147
+ errors="replace",
148
+ unk_token="<|endoftext|>",
149
+ bos_token="<|endoftext|>",
150
+ eos_token="<|endoftext|>",
151
+ pad_token=None,
152
+ add_prefix_space=False,
153
+ add_bos_token=False,
154
+ image_start_tag='<|img|>',
155
+ image_end_tag='<|/img|>',
156
+ image_pad_tag='<|imgpad|>',
157
+ ref_start_tag='<|ref|>',
158
+ ref_end_tag='<|/ref|>',
159
+ box_start_tag='<|box|>',
160
+ box_end_tag='<|/box|>',
161
+ quad_start_tag='<|quad|>',
162
+ quad_end_tag='<|/quad|>',
163
+ **kwargs,
164
+ ):
165
+ bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
166
+ eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
167
+ unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
168
+ pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
169
+ self.add_bos_token = add_bos_token
170
+
171
+ with open(vocab_file, encoding="utf-8") as vocab_handle:
172
+ self.encoder = json.load(vocab_handle)
173
+ self.decoder = {v: k for k, v in self.encoder.items()}
174
+ self.errors = errors # how to handle errors in decoding
175
+ self.byte_encoder = bytes_to_unicode()
176
+ self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
177
+ with open(merges_file, encoding="utf-8") as merges_handle:
178
+ bpe_merges = merges_handle.read().split("\n")[1:-1]
179
+ bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
180
+ self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
181
+ self.cache = {}
182
+ self.add_prefix_space = add_prefix_space
183
+
184
+ # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
185
+ self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
186
+ super().__init__(
187
+ errors=errors,
188
+ unk_token=unk_token,
189
+ bos_token=bos_token,
190
+ eos_token=eos_token,
191
+ pad_token=pad_token,
192
+ add_prefix_space=add_prefix_space,
193
+ add_bos_token=add_bos_token,
194
+ **kwargs,
195
+ )
196
+
197
+ self.image_start_tag = image_start_tag
198
+ self.image_end_tag = image_end_tag
199
+ self.image_pad_tag = image_pad_tag
200
+ self.ref_start_tag = ref_start_tag
201
+ self.ref_end_tag = ref_end_tag
202
+ self.box_start_tag = box_start_tag
203
+ self.box_end_tag = box_end_tag
204
+ self.quad_start_tag = quad_start_tag
205
+ self.quad_end_tag = quad_end_tag
206
+ self.IMAGE_ST = (
207
+ image_start_tag, image_end_tag, image_pad_tag,
208
+ ref_start_tag, ref_end_tag, box_start_tag, box_end_tag,
209
+ quad_start_tag, quad_end_tag,
210
+ )
211
+ for special_token in self.IMAGE_ST:
212
+ if special_token not in self.get_vocab():
213
+ self.add_special_tokens({"additional_special_tokens": [special_token]})
214
+ for coordinate in range(10):
215
+ if f"<{coordinate}>" not in self.get_vocab():
216
+ self.add_special_tokens({"additional_special_tokens": [f"<|coord_{coordinate}|>"]})
217
+ if len(self) % 64 != 0:
218
+ for extra in range(((len(self) // 64) + 1) * 64 - len(self)):
219
+ if f"<extra_{extra}>" not in self.get_vocab():
220
+ self.add_special_tokens({"additional_special_tokens": [f"<|extra_{extra}|>"]})
221
+ self.img_start_id = self.convert_tokens_to_ids(self.image_start_tag)
222
+ self.img_end_id = self.convert_tokens_to_ids(self.image_end_tag)
223
+ self.img_pad_id = self.convert_tokens_to_ids(self.image_pad_tag)
224
+ self.ref_start_id = self.convert_tokens_to_ids(self.ref_start_tag)
225
+ self.ref_end_id = self.convert_tokens_to_ids(self.ref_end_tag)
226
+ self.box_start_id = self.convert_tokens_to_ids(self.box_start_tag)
227
+ self.box_end_id = self.convert_tokens_to_ids(self.box_end_tag)
228
+ self.quad_start_id = self.convert_tokens_to_ids(self.quad_start_tag)
229
+ self.quad_end_id = self.convert_tokens_to_ids(self.quad_end_tag)
230
+ self.chat_template = DEFAULT_CHAT_TEMPLATE
231
+
232
+ @property
233
+ def vocab_size(self):
234
+ return len(self.encoder)
235
+
236
+ def get_vocab(self):
237
+ return dict(self.encoder, **self.added_tokens_encoder)
238
+
239
+ def bpe(self, token):
240
+ if token in self.cache:
241
+ return self.cache[token]
242
+ word = tuple(token)
243
+ pairs = get_pairs(word)
244
+
245
+ if not pairs:
246
+ return token
247
+
248
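+ # repeatedly merge the lowest-ranked adjacent pair until no mergeable pair remains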
+ while True:
249
+ bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
250
+ if bigram not in self.bpe_ranks:
251
+ break
252
+ first, second = bigram
253
+ new_word = []
254
+ i = 0
255
+ while i < len(word):
256
+ try:
257
+ j = word.index(first, i)
258
+ except ValueError:
259
+ new_word.extend(word[i:])
260
+ break
261
+ else:
262
+ new_word.extend(word[i:j])
263
+ i = j
264
+
265
+ if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
266
+ new_word.append(first + second)
267
+ i += 2
268
+ else:
269
+ new_word.append(word[i])
270
+ i += 1
271
+ new_word = tuple(new_word)
272
+ word = new_word
273
+ if len(word) == 1:
274
+ break
275
+ else:
276
+ pairs = get_pairs(word)
277
+ word = " ".join(word)
278
+ self.cache[token] = word
279
+ return word
280
+
281
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
282
+ if self.add_bos_token:
283
+ bos_token_ids = [self.bos_token_id]
284
+ else:
285
+ bos_token_ids = []
286
+
287
+ output = bos_token_ids + token_ids_0
288
+
289
+ if token_ids_1 is None:
290
+ return output
291
+
292
+ return output + bos_token_ids + token_ids_1
293
+
294
+ def tokenize(self, text: TextInput, **kwargs) -> List[str]:
295
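+ # pad the image path between the image tags with <|imgpad|> so every image span is exactly IMG_TOKEN_SPAN tokens long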
+ def _encode_imgurl(img_tokens):
296
+ assert img_tokens[0] == self.image_start_tag and img_tokens[-1] == self.image_end_tag
297
+ img_tokens = img_tokens[1:-1]
298
+ img_url = ''.join(img_tokens)
299
+ out_img_tokens = list(img_url)
300
+ if len(out_img_tokens) > IMG_TOKEN_SPAN:
301
+ raise ValueError("The content in {}..{} is too long".format(self.image_start_tag, self.image_end_tag))
302
+ out_img_tokens.extend([self.image_pad_tag] * (IMG_TOKEN_SPAN - len(out_img_tokens)))
303
+ out_img_tokens = [self.image_start_tag] + out_img_tokens + [self.image_end_tag]
304
+ return out_img_tokens
305
+
306
+ tokens = super().tokenize(text, **kwargs)
307
+ tokens = _replace_closed_tag(tokens, self.image_start_tag, self.image_end_tag, _encode_imgurl)
308
+ return tokens
309
+
310
+ def _tokenize(self, text):
311
+ """Tokenize a string."""
312
+
313
+ bpe_tokens = []
314
+ for token in re.findall(self.pat, text):
315
+ token = "".join(
316
+ self.byte_encoder[b] for b in token.encode("utf-8")
317
+ ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
318
+ bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
319
+ return bpe_tokens
320
+
321
+ def _convert_token_to_id(self, token):
322
+ """Converts a token (str) in an id using the vocab."""
323
+ return self.encoder.get(token, self.encoder.get(self.unk_token))
324
+
325
+ def _convert_id_to_token(self, index):
326
+ """Converts an index (integer) in a token (str) using the vocab."""
327
+ return self.decoder.get(index)
328
+
329
+ def convert_tokens_to_string(self, tokens):
330
+ """Converts a sequence of tokens (string) in a single string."""
331
+ text = "".join(tokens)
332
+ text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
333
+ return text
334
+
335
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
336
+ if not os.path.isdir(save_directory):
337
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
338
+ return
339
+ vocab_file = os.path.join(
340
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
341
+ )
342
+ merge_file = os.path.join(
343
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
344
+ )
345
+
346
+ with open(vocab_file, "w", encoding="utf-8") as f:
347
+ f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
348
+
349
+ index = 0
350
+ with open(merge_file, "w", encoding="utf-8") as writer:
351
+ writer.write("#version: 0.2\n")
352
+ for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
353
+ if index != token_index:
354
+ logger.warning(
355
+ f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
356
+ " Please check that the tokenizer is not corrupted!"
357
+ )
358
+ index = token_index
359
+ writer.write(" ".join(bpe_tokens) + "\n")
360
+ index += 1
361
+
362
+ return vocab_file, merge_file
363
+
364
+ def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
365
+ add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
366
+ if is_split_into_words or add_prefix_space:
367
+ text = " " + text
368
+ return (text, kwargs)
369
+
370
+ def decode(
371
+ self,
372
+ token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
373
+ skip_special_tokens: bool = False,
374
+ clean_up_tokenization_spaces: bool = None,
375
+ truncate_before_pattern: Optional[List[str]] = None,
376
+ **kwargs,
377
+ ) -> str:
378
+ """
379
+ Converts a sequence of ids into a string, using the tokenizer and vocabulary, with options to remove special
380
+ tokens and clean up tokenization spaces.
381
+
382
+ Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.
383
+
384
+ Args:
385
+ token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
386
+ List of tokenized input ids. Can be obtained using the `__call__` method.
387
+ skip_special_tokens (`bool`, *optional*, defaults to `False`):
388
+ Whether or not to remove special tokens in the decoding.
389
+ clean_up_tokenization_spaces (`bool`, *optional*):
390
+ Whether or not to clean up the tokenization spaces. If `None`, will default to
391
+ `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
392
+ truncate_before_pattern (`List[str]`, *optional*, defaults to `None`):
393
+ A list of regular expression strings that will be used to truncate the returned string. This can be
394
+ used to remove extra pieces of code (e.g. truncate if observing a comment symbol "#" at the beginning
395
+ of a new line). An example pattern could be `["^#", re.escape("<|endoftext|>"), "^'''", "\n\n\n"]`.
396
+ kwargs (additional keyword arguments, *optional*):
397
+ Will be passed to the underlying model specific decode method.
398
+
399
+ Returns:
400
+ `str`: The decoded sentence.
401
+ """
402
+
403
+ token_ids = to_py_obj(token_ids)
404
+
405
+ decoded_text = self._decode(
406
+ token_ids=token_ids,
407
+ skip_special_tokens=skip_special_tokens,
408
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
409
+ **kwargs,
410
+ )
411
+
412
+ if truncate_before_pattern is not None and len(truncate_before_pattern) > 0:
413
+ decoded_text = self.truncate(decoded_text, truncate_before_pattern)
414
+
415
+ return decoded_text
416
+
417
+ def _decode(
418
+ self,
419
+ token_ids: List[int],
420
+ skip_special_tokens: bool = False,
421
+ clean_up_tokenization_spaces: bool = None,
422
+ spaces_between_special_tokens: bool = True,
423
+ **kwargs,
424
+ ) -> str:
425
+
426
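+ # drop the <|imgpad|> padding inserted during tokenization so only the original image path is decoded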
+ def _decode_imgurl(img_token_ids):
427
+ assert img_token_ids[0] == self.img_start_id and img_token_ids[-1] == self.img_end_id
428
+ img_token_ids = img_token_ids[1:-1]
429
+ img_token_ids = img_token_ids[: img_token_ids.index(self.img_pad_id)]
430
+ return [self.img_start_id] + img_token_ids + [self.img_end_id]
431
+
432
+ token_ids = _replace_closed_tag(token_ids, self.img_start_id, self.img_end_id, _decode_imgurl)
433
+
434
+ return super()._decode(
435
+ token_ids, skip_special_tokens, clean_up_tokenization_spaces, spaces_between_special_tokens, **kwargs
436
+ )
437
+
438
+ def truncate(self, completion, truncate_before_pattern):
439
+ def find_re(string, pattern, start_pos):
440
+ m = pattern.search(string, start_pos)
441
+ return m.start() if m else -1
442
+
443
+ terminals = [re.compile(pattern, re.MULTILINE) for pattern in truncate_before_pattern]
444
+
445
+ prints = list(re.finditer("^print", completion, re.MULTILINE))
446
+
447
+ if len(prints) > 1:
448
+ completion = completion[: prints[1].start()]
449
+
450
+ defs = list(re.finditer("^def", completion, re.MULTILINE))
451
+
452
+ if len(defs) > 1:
453
+ completion = completion[: defs[1].start()]
454
+
455
+ start_pos = 0
456
+
457
+ terminals_pos = [
458
+ pos for pos in [find_re(completion, terminal, start_pos) for terminal in terminals] if pos != -1
459
+ ]
460
+
461
+ if len(terminals_pos) > 0:
462
+ return completion[: min(terminals_pos)]
463
+ else:
464
+ return completion
465
+
466
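+ # build a prompt string from a list of {'image': ...}, {'text': ...} and {'box': ...} elements, wrapping each in its tag tokens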
+ def from_list_format(self, list_format: List[Dict]):
467
+ text = ''
468
+ num_images = 0
469
+ for ele in list_format:
470
+ if 'image' in ele:
471
+ num_images += 1
472
+ text += f'Picture {num_images}:'
473
+ text += self.image_start_tag + ele['image'] + self.image_end_tag
474
+ text += '\n'
475
+ elif 'text' in ele:
476
+ text += ele['text']
477
+ elif 'box' in ele:
478
+ if 'ref' in ele:
479
+ text += self.ref_start_tag + ele['ref'] + self.ref_end_tag
480
+ for box in ele['box']:
481
+ text += self.box_start_tag + '(%d,%d),(%d,%d)' % (box[0], box[1], box[2], box[3]) + self.box_end_tag
482
+ else:
483
+ raise ValueError("Unsupport element: " + str(ele))
484
+ return text
485
+
486
+ def to_list_format(self, text: str):
487
+ token_ids = self.encode(text)
488
+
489
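+ # map each closed tag span back to its element type (image/ref/box/quad); everything else is returned as plain text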
+ def _encode_vl_info(tokens):
490
+ if len(tokens) == 0:
491
+ return []
492
+ if tokens[0] == self.img_start_id and tokens[-1] == self.img_end_id:
493
+ key = 'image'
494
+ elif tokens[0] == self.ref_start_id and tokens[-1] == self.ref_end_id:
495
+ key = 'ref'
496
+ elif tokens[0] == self.box_start_id and tokens[-1] == self.box_end_id:
497
+ key = 'box'
498
+ elif tokens[0] == self.quad_start_id and tokens[-1] == self.quad_end_id:
499
+ key = 'quad'
500
+ else:
501
+ val = self.decode(tokens)
502
+ return [{'text': val}]
503
+ tokens = [token for token in tokens[1:-1] if token != self.img_pad_id]
504
+ val = self.decode(tokens, skip_special_tokens=True)
505
+ return [{key: val}]
506
+
507
+ return _replace_closed_tag(
508
+ token_ids,
509
+ (self.img_start_id, self.ref_start_id, self.box_start_id, self.quad_start_id),
510
+ (self.img_end_id, self.ref_end_id, self.box_end_id, self.quad_end_id),
511
+ _encode_vl_info,
512
+ _encode_vl_info,
513
+ )
514
+
515
+ def _fetch_latest_picture(self, response, history):
516
+ if history is None:
517
+ history = []
518
+ _history = history + [(response, None)]
519
+ for q, r in _history[::-1]:
520
+ for ele in self.to_list_format(q)[::-1]:
521
+ if 'image' in ele:
522
+ return ele['image']
523
+ return None
524
+
525
+ def _fetch_all_box_with_ref(self, text):
526
+ list_format = self.to_list_format(text)
527
+ output = []
528
+ for i, ele in enumerate(list_format):
529
+ if 'box' in ele:
530
+ bbox = tuple(map(int, ele['box'].replace('(', '').replace(')', '').split(',')))
531
+ assert len(bbox) == 4
532
+ output.append({'box': bbox})
533
+ if i > 0 and 'ref' in list_format[i - 1]:
534
+ output[-1]['ref'] = list_format[i - 1]['ref'].strip()
535
+ return output
536
+
537
+ def draw_bbox_on_latest_picture(
538
+ self,
539
+ response,
540
+ history=None,
541
+ ) -> Optional[Image.Image]:
542
+ image = self._fetch_latest_picture(response, history)
543
+ if image is None:
544
+ return None
545
+ if image.startswith("http://") or image.startswith("https://"):
546
+ image = Image.open(requests.get(image, stream=True).raw).convert("RGB")
547
+ h, w = image.height, image.width
548
+ else:
549
+ image = np.asarray(Image.open(image).convert("RGB"))
550
+ h, w = image.shape[0], image.shape[1]
551
+ visualizer = Visualizer(image)
552
+
553
+ boxes = self._fetch_all_box_with_ref(response)
554
+ if not boxes:
555
+ return None
556
+ color = random.choice([_ for _ in mcolors.TABLEAU_COLORS.keys()]) # init color
557
+ for box in boxes:
558
+ if 'ref' in box: # random new color for new refexps
559
+ color = random.choice([_ for _ in mcolors.TABLEAU_COLORS.keys()])
560
+ x1, y1, x2, y2 = box['box']
561
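+ # box coordinates are expressed on a 0-1000 grid; rescale them to the actual image width and height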
+ x1, y1, x2, y2 = (int(x1 / 1000 * w), int(y1 / 1000 * h), int(x2 / 1000 * w), int(y2 / 1000 * h))
562
+ visualizer.draw_box((x1, y1, x2, y2), alpha=1, edge_color=color)
563
+ if 'ref' in box:
564
+ visualizer.draw_text(box['ref'], (x1, y1), color=color, horizontal_alignment="left")
565
+ return visualizer.output
566
+
567
+
568
+ class VisImage:
569
+ def __init__(self, img, scale=1.0):
570
+ self.img = img
571
+ self.scale = scale
572
+ self.width, self.height = img.shape[1], img.shape[0]
573
+ self._setup_figure(img)
574
+
575
+ def _setup_figure(self, img):
576
+ fig = mplfigure.Figure(frameon=False)
577
+ self.dpi = fig.get_dpi()
578
+ # add a small 1e-2 to avoid precision loss due to matplotlib's truncation
579
+ # (https://github.com/matplotlib/matplotlib/issues/15363)
580
+ fig.set_size_inches(
581
+ (self.width * self.scale + 1e-2) / self.dpi,
582
+ (self.height * self.scale + 1e-2) / self.dpi,
583
+ )
584
+ self.canvas = FigureCanvasAgg(fig)
585
+ # self.canvas = mpl.backends.backend_cairo.FigureCanvasCairo(fig)
586
+ ax = fig.add_axes([0.0, 0.0, 1.0, 1.0])
587
+ ax.axis("off")
588
+ self.fig = fig
589
+ self.ax = ax
590
+ self.reset_image(img)
591
+
592
+ def reset_image(self, img):
593
+ img = img.astype("uint8")
594
+ self.ax.imshow(img, extent=(0, self.width, self.height, 0), interpolation="nearest")
595
+
596
+ def save(self, filepath):
597
+ self.fig.savefig(filepath)
598
+
599
+ def get_image(self):
600
+ canvas = self.canvas
601
+ s, (width, height) = canvas.print_to_buffer()
602
+
603
+ buffer = np.frombuffer(s, dtype="uint8")
604
+
605
+ img_rgba = buffer.reshape(height, width, 4)
606
+ rgb, alpha = np.split(img_rgba, [3], axis=2)
607
+ return rgb.astype("uint8")
608
+
609
+
610
+ class Visualizer:
611
+ def __init__(self, img_rgb, metadata=None, scale=1.0):
612
+ self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
613
+ self.output = VisImage(self.img, scale=scale)
614
+ self.cpu_device = torch.device("cpu")
615
+
616
+ # too small texts are useless, therefore clamp the default font size to at least 15 // scale
617
+ self._default_font_size = max(
618
+ np.sqrt(self.output.height * self.output.width) // 30, 15 // scale
619
+ )
620
+
621
+ def draw_text(
622
+ self,
623
+ text,
624
+ position,
625
+ *,
626
+ font_size=None,
627
+ color="g",
628
+ horizontal_alignment="center",
629
+ rotation=0,
630
+ ):
631
+ if not font_size:
632
+ font_size = self._default_font_size
633
+
634
+ # since the text background is dark, we don't want the text to be dark
635
+ color = np.maximum(list(mplc.to_rgb(color)), 0.2)
636
+ color[np.argmax(color)] = max(0.8, np.max(color))
637
+
638
+ x, y = position
639
+ self.output.ax.text(
640
+ x,
641
+ y,
642
+ text,
643
+ size=font_size * self.output.scale,
644
+ bbox={"facecolor": "black", "alpha": 0.8, "pad": 0.7, "edgecolor": "none"},
645
+ verticalalignment="top",
646
+ horizontalalignment=horizontal_alignment,
647
+ color=color,
648
+ zorder=10,
649
+ rotation=rotation,
650
+ )
651
+ return self.output
652
+
653
+ def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"):
654
+ x0, y0, x1, y1 = box_coord
655
+ width = x1 - x0
656
+ height = y1 - y0
657
+
658
+ linewidth = max(self._default_font_size / 4, 1)
659
+
660
+ self.output.ax.add_patch(
661
+ mpl.patches.Rectangle(
662
+ (x0, y0),
663
+ width,
664
+ height,
665
+ fill=False,
666
+ edgecolor=edge_color,
667
+ linewidth=linewidth * self.output.scale,
668
+ alpha=alpha,
669
+ linestyle=line_style,
670
+ )
671
+ )
672
+ return self.output
673
+
674
+ def get_output(self):
675
+ return self.output
tokenizer_config.json ADDED
@@ -0,0 +1,922 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "50256": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "50257": {
14
+ "content": " ",
15
+ "lstrip": false,
16
+ "normalized": true,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": false
20
+ },
21
+ "50258": {
22
+ "content": " ",
23
+ "lstrip": false,
24
+ "normalized": true,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": false
28
+ },
29
+ "50259": {
30
+ "content": " ",
31
+ "lstrip": false,
32
+ "normalized": true,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": false
36
+ },
37
+ "50260": {
38
+ "content": " ",
39
+ "lstrip": false,
40
+ "normalized": true,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": false
44
+ },
45
+ "50261": {
46
+ "content": " ",
47
+ "lstrip": false,
48
+ "normalized": true,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": false
52
+ },
53
+ "50262": {
54
+ "content": " ",
55
+ "lstrip": false,
56
+ "normalized": true,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": false
60
+ },
61
+ "50263": {
62
+ "content": " ",
63
+ "lstrip": false,
64
+ "normalized": true,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": false
68
+ },
69
+ "50264": {
70
+ "content": " ",
71
+ "lstrip": false,
72
+ "normalized": true,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": false
76
+ },
77
+ "50265": {
78
+ "content": " ",
79
+ "lstrip": false,
80
+ "normalized": true,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": false
84
+ },
85
+ "50266": {
86
+ "content": " ",
87
+ "lstrip": false,
88
+ "normalized": true,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": false
92
+ },
93
+ "50267": {
94
+ "content": " ",
95
+ "lstrip": false,
96
+ "normalized": true,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": false
100
+ },
101
+ "50268": {
102
+ "content": " ",
103
+ "lstrip": false,
104
+ "normalized": true,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": false
108
+ },
109
+ "50269": {
110
+ "content": " ",
111
+ "lstrip": false,
112
+ "normalized": true,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": false
116
+ },
117
+ "50270": {
118
+ "content": " ",
119
+ "lstrip": false,
120
+ "normalized": true,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "50271": {
126
+ "content": " ",
127
+ "lstrip": false,
128
+ "normalized": true,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "50272": {
134
+ "content": " ",
135
+ "lstrip": false,
136
+ "normalized": true,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "50273": {
142
+ "content": " ",
143
+ "lstrip": false,
144
+ "normalized": true,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "50274": {
150
+ "content": " ",
151
+ "lstrip": false,
152
+ "normalized": true,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "50275": {
158
+ "content": " ",
159
+ "lstrip": false,
160
+ "normalized": true,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "50276": {
166
+ "content": " ",
167
+ "lstrip": false,
168
+ "normalized": true,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "50277": {
174
+ "content": " ",
175
+ "lstrip": false,
176
+ "normalized": true,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "50278": {
182
+ "content": " ",
183
+ "lstrip": false,
184
+ "normalized": true,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "50279": {
190
+ "content": " ",
191
+ "lstrip": false,
192
+ "normalized": true,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "50280": {
198
+ "content": " ",
199
+ "lstrip": false,
200
+ "normalized": true,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "50281": {
206
+ "content": " ",
207
+ "lstrip": false,
208
+ "normalized": true,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ },
213
+ "50282": {
214
+ "content": " ",
215
+ "lstrip": false,
216
+ "normalized": true,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": false
220
+ },
221
+ "50283": {
222
+ "content": " ",
223
+ "lstrip": false,
224
+ "normalized": true,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": false
228
+ },
229
+ "50284": {
230
+ "content": " ",
231
+ "lstrip": false,
232
+ "normalized": true,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": false
236
+ },
237
+ "50285": {
238
+ "content": " ",
239
+ "lstrip": false,
240
+ "normalized": true,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": false
244
+ },
245
+ "50286": {
246
+ "content": " ",
247
+ "lstrip": false,
248
+ "normalized": true,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": false
252
+ },
253
+ "50287": {
254
+ "content": "\t\t\t\t\t\t\t\t\t",
255
+ "lstrip": false,
256
+ "normalized": true,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": false
260
+ },
261
+ "50288": {
262
+ "content": "\t\t\t\t\t\t\t\t",
263
+ "lstrip": false,
264
+ "normalized": true,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": false
268
+ },
269
+ "50289": {
270
+ "content": "\t\t\t\t\t\t\t",
271
+ "lstrip": false,
272
+ "normalized": true,
273
+ "rstrip": false,
274
+ "single_word": false,
275
+ "special": false
276
+ },
277
+ "50290": {
278
+ "content": "\t\t\t\t\t\t",
279
+ "lstrip": false,
280
+ "normalized": true,
281
+ "rstrip": false,
282
+ "single_word": false,
283
+ "special": false
284
+ },
285
+ "50291": {
286
+ "content": "\t\t\t\t\t",
287
+ "lstrip": false,
288
+ "normalized": true,
289
+ "rstrip": false,
290
+ "single_word": false,
291
+ "special": false
292
+ },
293
+ "50292": {
294
+ "content": "\t\t\t\t",
295
+ "lstrip": false,
296
+ "normalized": true,
297
+ "rstrip": false,
298
+ "single_word": false,
299
+ "special": false
300
+ },
301
+ "50293": {
302
+ "content": "\t\t\t",
303
+ "lstrip": false,
304
+ "normalized": true,
305
+ "rstrip": false,
306
+ "single_word": false,
307
+ "special": false
308
+ },
309
+ "50294": {
310
+ "content": "\t\t",
311
+ "lstrip": false,
312
+ "normalized": true,
313
+ "rstrip": false,
314
+ "single_word": false,
315
+ "special": false
316
+ },
317
+ "50295": {
318
+ "content": "<|img|>",
319
+ "lstrip": false,
320
+ "normalized": false,
321
+ "rstrip": false,
322
+ "single_word": false,
323
+ "special": true
324
+ },
325
+ "50296": {
326
+ "content": "<|/img|>",
327
+ "lstrip": false,
328
+ "normalized": false,
329
+ "rstrip": false,
330
+ "single_word": false,
331
+ "special": true
332
+ },
333
+ "50297": {
334
+ "content": "<|imgpad|>",
335
+ "lstrip": false,
336
+ "normalized": false,
337
+ "rstrip": false,
338
+ "single_word": false,
339
+ "special": true
340
+ },
341
+ "50298": {
342
+ "content": "<|ref|>",
343
+ "lstrip": false,
344
+ "normalized": false,
345
+ "rstrip": false,
346
+ "single_word": false,
347
+ "special": true
348
+ },
349
+ "50299": {
350
+ "content": "<|/ref|>",
351
+ "lstrip": false,
352
+ "normalized": false,
353
+ "rstrip": false,
354
+ "single_word": false,
355
+ "special": true
356
+ },
357
+ "50300": {
358
+ "content": "<|box|>",
359
+ "lstrip": false,
360
+ "normalized": false,
361
+ "rstrip": false,
362
+ "single_word": false,
363
+ "special": true
364
+ },
365
+ "50301": {
366
+ "content": "<|/box|>",
367
+ "lstrip": false,
368
+ "normalized": false,
369
+ "rstrip": false,
370
+ "single_word": false,
371
+ "special": true
372
+ },
373
+ "50302": {
374
+ "content": "<|quad|>",
375
+ "lstrip": false,
376
+ "normalized": false,
377
+ "rstrip": false,
378
+ "single_word": false,
379
+ "special": true
380
+ },
381
+ "50303": {
382
+ "content": "<|/quad|>",
383
+ "lstrip": false,
384
+ "normalized": false,
385
+ "rstrip": false,
386
+ "single_word": false,
387
+ "special": true
388
+ },
389
+ "50304": {
390
+ "content": "<|coord_0|>",
391
+ "lstrip": false,
392
+ "normalized": false,
393
+ "rstrip": false,
394
+ "single_word": false,
395
+ "special": true
396
+ },
397
+ "50305": {
398
+ "content": "<|coord_1|>",
399
+ "lstrip": false,
400
+ "normalized": false,
401
+ "rstrip": false,
402
+ "single_word": false,
403
+ "special": true
404
+ },
405
+ "50306": {
406
+ "content": "<|coord_2|>",
407
+ "lstrip": false,
408
+ "normalized": false,
409
+ "rstrip": false,
410
+ "single_word": false,
411
+ "special": true
412
+ },
413
+ "50307": {
414
+ "content": "<|coord_3|>",
415
+ "lstrip": false,
416
+ "normalized": false,
417
+ "rstrip": false,
418
+ "single_word": false,
419
+ "special": true
420
+ },
421
+ "50308": {
422
+ "content": "<|coord_4|>",
423
+ "lstrip": false,
424
+ "normalized": false,
425
+ "rstrip": false,
426
+ "single_word": false,
427
+ "special": true
428
+ },
429
+ "50309": {
430
+ "content": "<|coord_5|>",
431
+ "lstrip": false,
432
+ "normalized": false,
433
+ "rstrip": false,
434
+ "single_word": false,
435
+ "special": true
436
+ },
437
+ "50310": {
438
+ "content": "<|coord_6|>",
439
+ "lstrip": false,
440
+ "normalized": false,
441
+ "rstrip": false,
442
+ "single_word": false,
443
+ "special": true
444
+ },
445
+ "50311": {
446
+ "content": "<|coord_7|>",
447
+ "lstrip": false,
448
+ "normalized": false,
449
+ "rstrip": false,
450
+ "single_word": false,
451
+ "special": true
452
+ },
453
+ "50312": {
454
+ "content": "<|coord_8|>",
455
+ "lstrip": false,
456
+ "normalized": false,
457
+ "rstrip": false,
458
+ "single_word": false,
459
+ "special": true
460
+ },
461
+ "50313": {
462
+ "content": "<|coord_9|>",
463
+ "lstrip": false,
464
+ "normalized": false,
465
+ "rstrip": false,
466
+ "single_word": false,
467
+ "special": true
468
+ },
469
+ "50314": {
470
+ "content": "<|extra_0|>",
471
+ "lstrip": false,
472
+ "normalized": false,
473
+ "rstrip": false,
474
+ "single_word": false,
475
+ "special": true
476
+ },
477
+ "50315": {
478
+ "content": "<|extra_1|>",
479
+ "lstrip": false,
480
+ "normalized": false,
481
+ "rstrip": false,
482
+ "single_word": false,
483
+ "special": true
484
+ },
485
+ "50316": {
486
+ "content": "<|extra_2|>",
487
+ "lstrip": false,
488
+ "normalized": false,
489
+ "rstrip": false,
490
+ "single_word": false,
491
+ "special": true
492
+ },
493
+ "50317": {
494
+ "content": "<|extra_3|>",
495
+ "lstrip": false,
496
+ "normalized": false,
497
+ "rstrip": false,
498
+ "single_word": false,
499
+ "special": true
500
+ },
501
+ "50318": {
502
+ "content": "<|extra_4|>",
503
+ "lstrip": false,
504
+ "normalized": false,
505
+ "rstrip": false,
506
+ "single_word": false,
507
+ "special": true
508
+ },
509
+ "50319": {
510
+ "content": "<|extra_5|>",
511
+ "lstrip": false,
512
+ "normalized": false,
513
+ "rstrip": false,
514
+ "single_word": false,
515
+ "special": true
516
+ },
517
+ "50320": {
518
+ "content": "<|extra_6|>",
519
+ "lstrip": false,
520
+ "normalized": false,
521
+ "rstrip": false,
522
+ "single_word": false,
523
+ "special": true
524
+ },
525
+ "50321": {
526
+ "content": "<|extra_7|>",
527
+ "lstrip": false,
528
+ "normalized": false,
529
+ "rstrip": false,
530
+ "single_word": false,
531
+ "special": true
532
+ },
533
+ "50322": {
534
+ "content": "<|extra_8|>",
535
+ "lstrip": false,
536
+ "normalized": false,
537
+ "rstrip": false,
538
+ "single_word": false,
539
+ "special": true
540
+ },
541
+ "50323": {
542
+ "content": "<|extra_9|>",
543
+ "lstrip": false,
544
+ "normalized": false,
545
+ "rstrip": false,
546
+ "single_word": false,
547
+ "special": true
548
+ },
549
+ "50324": {
550
+ "content": "<|extra_10|>",
551
+ "lstrip": false,
552
+ "normalized": false,
553
+ "rstrip": false,
554
+ "single_word": false,
555
+ "special": true
556
+ },
557
+ "50325": {
558
+ "content": "<|extra_11|>",
559
+ "lstrip": false,
560
+ "normalized": false,
561
+ "rstrip": false,
562
+ "single_word": false,
563
+ "special": true
564
+ },
565
+ "50326": {
566
+ "content": "<|extra_12|>",
567
+ "lstrip": false,
568
+ "normalized": false,
569
+ "rstrip": false,
570
+ "single_word": false,
571
+ "special": true
572
+ },
573
+ "50327": {
574
+ "content": "<|extra_13|>",
575
+ "lstrip": false,
576
+ "normalized": false,
577
+ "rstrip": false,
578
+ "single_word": false,
579
+ "special": true
580
+ },
581
+ "50328": {
582
+ "content": "<|extra_14|>",
583
+ "lstrip": false,
584
+ "normalized": false,
585
+ "rstrip": false,
586
+ "single_word": false,
587
+ "special": true
588
+ },
589
+ "50329": {
590
+ "content": "<|extra_15|>",
591
+ "lstrip": false,
592
+ "normalized": false,
593
+ "rstrip": false,
594
+ "single_word": false,
595
+ "special": true
596
+ },
597
+ "50330": {
598
+ "content": "<|extra_16|>",
599
+ "lstrip": false,
600
+ "normalized": false,
601
+ "rstrip": false,
602
+ "single_word": false,
603
+ "special": true
604
+ },
605
+ "50331": {
606
+ "content": "<|extra_17|>",
607
+ "lstrip": false,
608
+ "normalized": false,
609
+ "rstrip": false,
610
+ "single_word": false,
611
+ "special": true
612
+ },
613
+ "50332": {
614
+ "content": "<|extra_18|>",
615
+ "lstrip": false,
616
+ "normalized": false,
617
+ "rstrip": false,
618
+ "single_word": false,
619
+ "special": true
620
+ },
621
+ "50333": {
622
+ "content": "<|extra_19|>",
623
+ "lstrip": false,
624
+ "normalized": false,
625
+ "rstrip": false,
626
+ "single_word": false,
627
+ "special": true
628
+ },
629
+ "50334": {
630
+ "content": "<|extra_20|>",
631
+ "lstrip": false,
632
+ "normalized": false,
633
+ "rstrip": false,
634
+ "single_word": false,
635
+ "special": true
636
+ },
637
+ "50335": {
638
+ "content": "<|extra_21|>",
639
+ "lstrip": false,
640
+ "normalized": false,
641
+ "rstrip": false,
642
+ "single_word": false,
643
+ "special": true
644
+ },
645
+ "50336": {
646
+ "content": "<|extra_22|>",
647
+ "lstrip": false,
648
+ "normalized": false,
649
+ "rstrip": false,
650
+ "single_word": false,
651
+ "special": true
652
+ },
653
+ "50337": {
654
+ "content": "<|extra_23|>",
655
+ "lstrip": false,
656
+ "normalized": false,
657
+ "rstrip": false,
658
+ "single_word": false,
659
+ "special": true
660
+ },
661
+ "50338": {
662
+ "content": "<|extra_24|>",
663
+ "lstrip": false,
664
+ "normalized": false,
665
+ "rstrip": false,
666
+ "single_word": false,
667
+ "special": true
668
+ },
669
+ "50339": {
670
+ "content": "<|extra_25|>",
671
+ "lstrip": false,
672
+ "normalized": false,
673
+ "rstrip": false,
674
+ "single_word": false,
675
+ "special": true
676
+ },
677
+ "50340": {
678
+ "content": "<|extra_26|>",
679
+ "lstrip": false,
680
+ "normalized": false,
681
+ "rstrip": false,
682
+ "single_word": false,
683
+ "special": true
684
+ },
685
+ "50341": {
686
+ "content": "<|extra_27|>",
687
+ "lstrip": false,
688
+ "normalized": false,
689
+ "rstrip": false,
690
+ "single_word": false,
691
+ "special": true
692
+ },
693
+ "50342": {
694
+ "content": "<|extra_28|>",
695
+ "lstrip": false,
696
+ "normalized": false,
697
+ "rstrip": false,
698
+ "single_word": false,
699
+ "special": true
700
+ },
701
+ "50343": {
702
+ "content": "<|extra_29|>",
703
+ "lstrip": false,
704
+ "normalized": false,
705
+ "rstrip": false,
706
+ "single_word": false,
707
+ "special": true
708
+ },
709
+ "50344": {
710
+ "content": "<|extra_30|>",
711
+ "lstrip": false,
712
+ "normalized": false,
713
+ "rstrip": false,
714
+ "single_word": false,
715
+ "special": true
716
+ },
717
+ "50345": {
718
+ "content": "<|extra_31|>",
719
+ "lstrip": false,
720
+ "normalized": false,
721
+ "rstrip": false,
722
+ "single_word": false,
723
+ "special": true
724
+ },
725
+ "50346": {
726
+ "content": "<|extra_32|>",
727
+ "lstrip": false,
728
+ "normalized": false,
729
+ "rstrip": false,
730
+ "single_word": false,
731
+ "special": true
732
+ },
733
+ "50347": {
734
+ "content": "<|extra_33|>",
735
+ "lstrip": false,
736
+ "normalized": false,
737
+ "rstrip": false,
738
+ "single_word": false,
739
+ "special": true
740
+ },
741
+ "50348": {
742
+ "content": "<|extra_34|>",
743
+ "lstrip": false,
744
+ "normalized": false,
745
+ "rstrip": false,
746
+ "single_word": false,
747
+ "special": true
748
+ },
749
+ "50349": {
750
+ "content": "<|extra_35|>",
751
+ "lstrip": false,
752
+ "normalized": false,
753
+ "rstrip": false,
754
+ "single_word": false,
755
+ "special": true
756
+ },
757
+ "50350": {
758
+ "content": "<|extra_36|>",
759
+ "lstrip": false,
760
+ "normalized": false,
761
+ "rstrip": false,
762
+ "single_word": false,
763
+ "special": true
764
+ },
765
+ "50351": {
766
+ "content": "<|extra_37|>",
767
+ "lstrip": false,
768
+ "normalized": false,
769
+ "rstrip": false,
770
+ "single_word": false,
771
+ "special": true
772
+ },
773
+ "50352": {
774
+ "content": "<|extra_38|>",
775
+ "lstrip": false,
776
+ "normalized": false,
777
+ "rstrip": false,
778
+ "single_word": false,
779
+ "special": true
780
+ },
781
+ "50353": {
782
+ "content": "<|extra_39|>",
783
+ "lstrip": false,
784
+ "normalized": false,
785
+ "rstrip": false,
786
+ "single_word": false,
787
+ "special": true
788
+ },
789
+ "50354": {
790
+ "content": "<|extra_40|>",
791
+ "lstrip": false,
792
+ "normalized": false,
793
+ "rstrip": false,
794
+ "single_word": false,
795
+ "special": true
796
+ },
797
+ "50355": {
798
+ "content": "<|extra_41|>",
799
+ "lstrip": false,
800
+ "normalized": false,
801
+ "rstrip": false,
802
+ "single_word": false,
803
+ "special": true
804
+ },
805
+ "50356": {
806
+ "content": "<|extra_42|>",
807
+ "lstrip": false,
808
+ "normalized": false,
809
+ "rstrip": false,
810
+ "single_word": false,
811
+ "special": true
812
+ },
813
+ "50357": {
814
+ "content": "<|extra_43|>",
815
+ "lstrip": false,
816
+ "normalized": false,
817
+ "rstrip": false,
818
+ "single_word": false,
819
+ "special": true
820
+ },
821
+ "50358": {
822
+ "content": "<|extra_44|>",
823
+ "lstrip": false,
824
+ "normalized": false,
825
+ "rstrip": false,
826
+ "single_word": false,
827
+ "special": true
828
+ },
829
+ "50359": {
830
+ "content": "<|extra_45|>",
831
+ "lstrip": false,
832
+ "normalized": false,
833
+ "rstrip": false,
834
+ "single_word": false,
835
+ "special": true
836
+ },
837
+ "50360": {
838
+ "content": "<|extra_46|>",
839
+ "lstrip": false,
840
+ "normalized": false,
841
+ "rstrip": false,
842
+ "single_word": false,
843
+ "special": true
844
+ },
845
+ "50361": {
846
+ "content": "<|extra_47|>",
847
+ "lstrip": false,
848
+ "normalized": false,
849
+ "rstrip": false,
850
+ "single_word": false,
851
+ "special": true
852
+ },
853
+ "50362": {
854
+ "content": "<|extra_48|>",
855
+ "lstrip": false,
856
+ "normalized": false,
857
+ "rstrip": false,
858
+ "single_word": false,
859
+ "special": true
860
+ },
861
+ "50363": {
862
+ "content": "<|extra_49|>",
863
+ "lstrip": false,
864
+ "normalized": false,
865
+ "rstrip": false,
866
+ "single_word": false,
867
+ "special": true
868
+ },
869
+ "50364": {
870
+ "content": "<|extra_50|>",
871
+ "lstrip": false,
872
+ "normalized": false,
873
+ "rstrip": false,
874
+ "single_word": false,
875
+ "special": true
876
+ },
877
+ "50365": {
878
+ "content": "<|extra_51|>",
879
+ "lstrip": false,
880
+ "normalized": false,
881
+ "rstrip": false,
882
+ "single_word": false,
883
+ "special": true
884
+ },
885
+ "50366": {
886
+ "content": "<|extra_52|>",
887
+ "lstrip": false,
888
+ "normalized": false,
889
+ "rstrip": false,
890
+ "single_word": false,
891
+ "special": true
892
+ },
893
+ "50367": {
894
+ "content": "<|extra_53|>",
895
+ "lstrip": false,
896
+ "normalized": false,
897
+ "rstrip": false,
898
+ "single_word": false,
899
+ "special": true
900
+ }
901
+ },
902
+ "additional_special_tokens": [
903
+ "<|coord_9|>"
904
+ ],
905
+ "bos_token": "<|endoftext|>",
906
+ "chat_template": "{% for message in messages %}\n{% if message['from'] == 'human' %}\n{{ '<|user|>\n' + message['value'] + eos_token }}\n{% elif message['from'] == 'system' %}\n{{ '<|system|>\n' + message['value'] + eos_token }}\n{% elif message['from'] == 'gpt' %}\n{{ '<|assistant|>\n' + message['value'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
907
+ "clean_up_tokenization_spaces": true,
908
+ "eos_token": "<|endoftext|>",
909
+ "errors": "replace",
910
+ "model_max_length": 3072,
911
+ "pad_token": "<|endoftext|>",
912
+ "padding_side": "right",
913
+ "tokenizer_class": "CheXagentTokenizer",
914
+ "unk_token": "<|endoftext|>",
915
+ "use_fast": false,
916
+ "auto_map": {
917
+ "AutoTokenizer": [
918
+ "tokenization_chexagent.CheXagentTokenizer",
919
+ null
920
+ ]
921
+ }
922
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff