zohaib-irfan zhjohnchan commited on
Commit
f8b1ef7
·
0 Parent(s):

Duplicate from StanfordAIMI/CheXagent-8b

Browse files

Co-authored-by: Zhihong Chen <zhjohnchan@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- markdownlint-disable first-line-h1 -->
2
+ <!-- markdownlint-disable html -->
3
+
4
+ <div align="center">
5
+ <h1>
6
+ CheXagent
7
+ </h1>
8
+ </div>
9
+
10
+ <p align="center">
11
+ 📝 <a href="https://arxiv.org/abs/2401.12208" target="_blank">Paper</a> • 🤗 <a href="https://huggingface.co/StanfordAIMI/CheXagent-8b/" target="_blank">Hugging Face</a> • 🧩 <a href="https://github.com/Stanford-AIMI/CheXagent" target="_blank">Github</a> • 🪄 <a href="https://stanford-aimi.github.io/chexagent.html" target="_blank">Project</a>
12
+ </p>
13
+
14
+ <div align="center">
15
+ </div>
16
+
17
+ ## ✨ Latest News
18
+
19
+ - [12/15/2023]: Model released in [Hugging Face](https://huggingface.co/StanfordAIMI/CheXagent-8b/).
20
+
21
+ ## 🎬 Get Started
22
+
23
+ ```python
24
+ import io
25
+
26
+ import requests
27
+ import torch
28
+ from PIL import Image
29
+ from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
30
+
31
+ # step 1: Setup constant
32
+ device = "cuda"
33
+ dtype = torch.float16
34
+
35
+ # step 2: Load Processor and Model
36
+ processor = AutoProcessor.from_pretrained("StanfordAIMI/CheXagent-8b", trust_remote_code=True)
37
+ generation_config = GenerationConfig.from_pretrained("StanfordAIMI/CheXagent-8b")
38
+ model = AutoModelForCausalLM.from_pretrained("StanfordAIMI/CheXagent-8b", torch_dtype=dtype, trust_remote_code=True)
39
+
40
+ # step 3: Fetch the images
41
+ image_path = "https://upload.wikimedia.org/wikipedia/commons/3/3b/Pleural_effusion-Metastatic_breast_carcinoma_Case_166_%285477628658%29.jpg"
42
+ images = [Image.open(io.BytesIO(requests.get(image_path).content)).convert("RGB")]
43
+
44
+ # step 4: Generate the Findings section
45
+ prompt = f'Describe "Airway"'
46
+ inputs = processor(images=images, text=f" USER: <s>{prompt} ASSISTANT: <s>", return_tensors="pt").to(device=device, dtype=dtype)
47
+ output = model.generate(**inputs, generation_config=generation_config)[0]
48
+ response = processor.tokenizer.decode(output, skip_special_tokens=True)
49
+ ```
50
+
51
+ ## ✏️ Citation
52
+
53
+ ```
54
+ @article{chexagent-2024,
55
+ title={CheXagent: Towards a Foundation Model for Chest X-Ray Interpretation},
56
+ author={Chen, Zhihong and Varma, Maya and Delbrouck, Jean-Benoit and Paschali, Magdalini and Blankemeier, Louis and Veen, Dave Van and Valanarasu, Jeya Maria Jose and Youssef, Alaa and Cohen, Joseph Paul and Reis, Eduardo Pontes and Tsai, Emily B. and Johnston, Andrew and Olsen, Cameron and Abraham, Tanishq Mathew and Gatidis, Sergios and Chaudhari, Akshay S and Langlotz, Curtis},
57
+ journal={arXiv preprint arXiv:2401.12208},
58
+ url={https://arxiv.org/abs/2401.12208},
59
+ year={2024}
60
+ }
61
+ ```
config.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "StanfordAIMI/CheXagent-8b",
3
+ "architectures": [
4
+ "CheXagentForConditionalGeneration"
5
+ ],
6
+ "initializer_factor": 1.0,
7
+ "initializer_range": 0.02,
8
+ "model_type": "chexagent",
9
+ "num_max_images": 2,
10
+ "num_query_tokens": 128,
11
+ "qformer_config": {
12
+ "model_type": "chexagent_qformer",
13
+ "vocab_size": 30523
14
+ },
15
+ "text_config": {
16
+ "_name_or_path": "mistralai/Mistral-7B-v0.1",
17
+ "architectures": [
18
+ "MistralForCausalLM"
19
+ ],
20
+ "attention_bias": false,
21
+ "bos_token_id": 1,
22
+ "hidden_act": "silu",
23
+ "hidden_size": 4096,
24
+ "initializer_range": 0.02,
25
+ "intermediate_size": 14336,
26
+ "max_position_embeddings": 32768,
27
+ "model_type": "mistral",
28
+ "num_attention_heads": 32,
29
+ "num_hidden_layers": 32,
30
+ "num_key_value_heads": 8,
31
+ "pad_token_id": null,
32
+ "pretraining_tp": 1,
33
+ "rms_norm_eps": 1e-05,
34
+ "rope_scaling": null,
35
+ "rope_theta": 10000.0,
36
+ "sliding_window": 4096,
37
+ "tie_word_embeddings": false,
38
+ "torch_dtype": "bfloat16",
39
+ "vocab_size": 32000
40
+ },
41
+ "tie_word_embeddings": false,
42
+ "torch_dtype": "float32",
43
+ "transformers_version": "4.35.2",
44
+ "use_decoder_only_language_model": true,
45
+ "vision_config": {
46
+ "image_size": 448,
47
+ "model_type": "chexagent_vision_model",
48
+ "num_hidden_layers": 40
49
+ },
50
+ "auto_map": {
51
+ "AutoModelForCausalLM": "modeling_chexagent.CheXagentForConditionalGeneration",
52
+ "AutoConfig": "configuration_chexagent.CheXagentConfig"
53
+ }
54
+ }
configuration_chexagent.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023 The CheXagent Authors and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+ from typing import Union
18
+
19
+ from transformers.configuration_utils import PretrainedConfig
20
+ from transformers.models.auto import CONFIG_MAPPING
21
+ from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
22
+ from transformers.utils import logging
23
+
24
+ logger = logging.get_logger(__name__)
25
+
26
+
27
+ class CheXagentVisionConfig(PretrainedConfig):
28
+ model_type = "chexagent_vision_model"
29
+
30
+ def __init__(
31
+ self,
32
+ hidden_size=1408,
33
+ intermediate_size=6144,
34
+ num_hidden_layers=39,
35
+ num_attention_heads=16,
36
+ image_size=224,
37
+ patch_size=14,
38
+ hidden_act="gelu",
39
+ layer_norm_eps=1e-6,
40
+ attention_dropout=0.0,
41
+ initializer_range=1e-10,
42
+ qkv_bias=True,
43
+ **kwargs,
44
+ ):
45
+ super().__init__(**kwargs)
46
+
47
+ self.hidden_size = hidden_size
48
+ self.intermediate_size = intermediate_size
49
+ self.num_hidden_layers = num_hidden_layers
50
+ self.num_attention_heads = num_attention_heads
51
+ self.patch_size = patch_size
52
+ self.image_size = image_size
53
+ self.initializer_range = initializer_range
54
+ self.attention_dropout = attention_dropout
55
+ self.layer_norm_eps = layer_norm_eps
56
+ self.hidden_act = hidden_act
57
+ self.qkv_bias = qkv_bias
58
+
59
+ @classmethod
60
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
61
+ cls._set_token_in_kwargs(kwargs)
62
+
63
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
64
+
65
+ if config_dict.get("model_type") == "chexagent":
66
+ config_dict = config_dict["vision_config"]
67
+
68
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
69
+ logger.warning(
70
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
71
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
72
+ )
73
+
74
+ return cls.from_dict(config_dict, **kwargs)
75
+
76
+
77
+ class CheXagentQFormerConfig(PretrainedConfig):
78
+ model_type = "chexagent_qformer"
79
+
80
+ def __init__(
81
+ self,
82
+ vocab_size=30522,
83
+ hidden_size=768,
84
+ num_hidden_layers=12,
85
+ num_attention_heads=12,
86
+ intermediate_size=3072,
87
+ hidden_act="gelu",
88
+ hidden_dropout_prob=0.1,
89
+ attention_probs_dropout_prob=0.1,
90
+ max_position_embeddings=512,
91
+ initializer_range=0.02,
92
+ layer_norm_eps=1e-12,
93
+ pad_token_id=0,
94
+ position_embedding_type="absolute",
95
+ cross_attention_frequency=2,
96
+ encoder_hidden_size=1408,
97
+ **kwargs,
98
+ ):
99
+ super().__init__(pad_token_id=pad_token_id, **kwargs)
100
+
101
+ self.vocab_size = vocab_size
102
+ self.hidden_size = hidden_size
103
+ self.num_hidden_layers = num_hidden_layers
104
+ self.num_attention_heads = num_attention_heads
105
+ self.hidden_act = hidden_act
106
+ self.intermediate_size = intermediate_size
107
+ self.hidden_dropout_prob = hidden_dropout_prob
108
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
109
+ self.max_position_embeddings = max_position_embeddings
110
+ self.initializer_range = initializer_range
111
+ self.layer_norm_eps = layer_norm_eps
112
+ self.position_embedding_type = position_embedding_type
113
+ self.cross_attention_frequency = cross_attention_frequency
114
+ self.encoder_hidden_size = encoder_hidden_size
115
+
116
+ @classmethod
117
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
118
+ cls._set_token_in_kwargs(kwargs)
119
+
120
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
121
+
122
+ if config_dict.get("model_type") == "chexagent":
123
+ config_dict = config_dict["qformer_config"]
124
+
125
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
126
+ logger.warning(
127
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
128
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
129
+ )
130
+
131
+ return cls.from_dict(config_dict, **kwargs)
132
+
133
+
134
+ class CheXagentConfig(PretrainedConfig):
135
+ model_type = "chexagent"
136
+
137
+ def __init__(
138
+ self, vision_config=None, qformer_config=None, text_config=None, num_query_tokens=128,
139
+ num_max_images=2, **kwargs
140
+ ):
141
+ super().__init__(**kwargs)
142
+
143
+ if vision_config is None:
144
+ vision_config = {}
145
+
146
+ if qformer_config is None:
147
+ qformer_config = {}
148
+
149
+ if text_config is None:
150
+ text_config = {}
151
+
152
+ self.vision_config = CheXagentVisionConfig(**vision_config)
153
+ self.qformer_config = CheXagentQFormerConfig(**qformer_config)
154
+ text_model_type = text_config["model_type"] if "model_type" in text_config else "opt"
155
+ self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
156
+
157
+ self.tie_word_embeddings = self.text_config.tie_word_embeddings
158
+ self.is_encoder_decoder = self.text_config.is_encoder_decoder
159
+
160
+ self.num_query_tokens = num_query_tokens
161
+ self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size
162
+ self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
163
+ self.initializer_factor = 1.0
164
+ self.initializer_range = 0.02
165
+ self.num_max_images = num_max_images
166
+
167
+ @classmethod
168
+ def from_vision_qformer_text_configs(
169
+ cls,
170
+ vision_config: CheXagentVisionConfig,
171
+ qformer_config: CheXagentQFormerConfig,
172
+ text_config: PretrainedConfig,
173
+ **kwargs,
174
+ ):
175
+ return cls(
176
+ vision_config=vision_config.to_dict(),
177
+ qformer_config=qformer_config.to_dict(),
178
+ text_config=text_config.to_dict(),
179
+ **kwargs,
180
+ )
generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 1,
4
+ "eos_token_id": 2,
5
+ "do_sample": false,
6
+ "num_beams": 5,
7
+ "min_length": 0,
8
+ "max_length": 512,
9
+ "top_p": 1.0,
10
+ "repetition_penalty": 1.0,
11
+ "length_penalty": 1.0,
12
+ "temperature": 1,
13
+ "transformers_version": "4.35.2"
14
+ }
model-00001-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17cf1c475199690b5ac26372bc136c1a0f8667809172489f43620129ebefb092
3
+ size 4482770072
model-00002-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8bd4e517a2cc06e6a6b2cf2bac8461f53be2bc2af13828fb16bb3505ad2fe712
3
+ size 4987197672
model-00003-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e02a7202a70268a0c8dcda31300fecd4480c112d8d9a4520e8b4e35a8f122c57
3
+ size 4899117160
model-00004-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:23b6b8f22977ae559008d7d0b153d6646cf0badf0a2558444f246129a694eda7
3
+ size 4999813920
model-00005-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d874b7ae8898e6dc89a45e00b17eed2c05c1ca1945ea43a254408f38ca974a8
3
+ size 4999813920
model-00006-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8366a386401e78984d5c6fe49a0ca2551f02a16b892629a86fd9453a0e226aa5
3
+ size 4832008200
model-00007-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:975ad7dc9aca1e36ace96867534a2751ae35d45936ee2e8a27bef2a5211ec129
3
+ size 4249015512
model.safetensors.index.json ADDED
@@ -0,0 +1,1041 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 33449606656
4
+ },
5
+ "weight_map": {
6
+ "language_model.lm_head.weight": "model-00007-of-00007.safetensors",
7
+ "language_model.model.embed_tokens.weight": "model-00002-of-00007.safetensors",
8
+ "language_model.model.layers.0.input_layernorm.weight": "model-00002-of-00007.safetensors",
9
+ "language_model.model.layers.0.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
10
+ "language_model.model.layers.0.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
11
+ "language_model.model.layers.0.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
12
+ "language_model.model.layers.0.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
13
+ "language_model.model.layers.0.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
14
+ "language_model.model.layers.0.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
15
+ "language_model.model.layers.0.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
16
+ "language_model.model.layers.0.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
17
+ "language_model.model.layers.1.input_layernorm.weight": "model-00002-of-00007.safetensors",
18
+ "language_model.model.layers.1.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
19
+ "language_model.model.layers.1.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
20
+ "language_model.model.layers.1.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
21
+ "language_model.model.layers.1.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
22
+ "language_model.model.layers.1.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
23
+ "language_model.model.layers.1.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
24
+ "language_model.model.layers.1.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
25
+ "language_model.model.layers.1.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
26
+ "language_model.model.layers.10.input_layernorm.weight": "model-00004-of-00007.safetensors",
27
+ "language_model.model.layers.10.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
28
+ "language_model.model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
29
+ "language_model.model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
30
+ "language_model.model.layers.10.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
31
+ "language_model.model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
32
+ "language_model.model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
33
+ "language_model.model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
34
+ "language_model.model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
35
+ "language_model.model.layers.11.input_layernorm.weight": "model-00004-of-00007.safetensors",
36
+ "language_model.model.layers.11.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
37
+ "language_model.model.layers.11.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
38
+ "language_model.model.layers.11.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
39
+ "language_model.model.layers.11.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
40
+ "language_model.model.layers.11.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
41
+ "language_model.model.layers.11.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
42
+ "language_model.model.layers.11.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
43
+ "language_model.model.layers.11.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
44
+ "language_model.model.layers.12.input_layernorm.weight": "model-00004-of-00007.safetensors",
45
+ "language_model.model.layers.12.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
46
+ "language_model.model.layers.12.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
47
+ "language_model.model.layers.12.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
48
+ "language_model.model.layers.12.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
49
+ "language_model.model.layers.12.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
50
+ "language_model.model.layers.12.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
51
+ "language_model.model.layers.12.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
52
+ "language_model.model.layers.12.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
53
+ "language_model.model.layers.13.input_layernorm.weight": "model-00004-of-00007.safetensors",
54
+ "language_model.model.layers.13.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
55
+ "language_model.model.layers.13.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
56
+ "language_model.model.layers.13.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
57
+ "language_model.model.layers.13.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
58
+ "language_model.model.layers.13.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
59
+ "language_model.model.layers.13.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
60
+ "language_model.model.layers.13.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
61
+ "language_model.model.layers.13.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
62
+ "language_model.model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors",
63
+ "language_model.model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
64
+ "language_model.model.layers.14.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
65
+ "language_model.model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
66
+ "language_model.model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
67
+ "language_model.model.layers.14.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
68
+ "language_model.model.layers.14.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
69
+ "language_model.model.layers.14.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
70
+ "language_model.model.layers.14.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
71
+ "language_model.model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors",
72
+ "language_model.model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
73
+ "language_model.model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
74
+ "language_model.model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
75
+ "language_model.model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
76
+ "language_model.model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
77
+ "language_model.model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
78
+ "language_model.model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
79
+ "language_model.model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
80
+ "language_model.model.layers.16.input_layernorm.weight": "model-00005-of-00007.safetensors",
81
+ "language_model.model.layers.16.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
82
+ "language_model.model.layers.16.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
83
+ "language_model.model.layers.16.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
84
+ "language_model.model.layers.16.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
85
+ "language_model.model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
86
+ "language_model.model.layers.16.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
87
+ "language_model.model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
88
+ "language_model.model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
89
+ "language_model.model.layers.17.input_layernorm.weight": "model-00005-of-00007.safetensors",
90
+ "language_model.model.layers.17.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
91
+ "language_model.model.layers.17.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
92
+ "language_model.model.layers.17.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
93
+ "language_model.model.layers.17.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
94
+ "language_model.model.layers.17.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
95
+ "language_model.model.layers.17.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
96
+ "language_model.model.layers.17.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
97
+ "language_model.model.layers.17.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
98
+ "language_model.model.layers.18.input_layernorm.weight": "model-00005-of-00007.safetensors",
99
+ "language_model.model.layers.18.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
100
+ "language_model.model.layers.18.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
101
+ "language_model.model.layers.18.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
102
+ "language_model.model.layers.18.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
103
+ "language_model.model.layers.18.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
104
+ "language_model.model.layers.18.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
105
+ "language_model.model.layers.18.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
106
+ "language_model.model.layers.18.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
107
+ "language_model.model.layers.19.input_layernorm.weight": "model-00005-of-00007.safetensors",
108
+ "language_model.model.layers.19.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
109
+ "language_model.model.layers.19.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
110
+ "language_model.model.layers.19.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
111
+ "language_model.model.layers.19.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
112
+ "language_model.model.layers.19.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
113
+ "language_model.model.layers.19.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
114
+ "language_model.model.layers.19.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
115
+ "language_model.model.layers.19.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
116
+ "language_model.model.layers.2.input_layernorm.weight": "model-00002-of-00007.safetensors",
117
+ "language_model.model.layers.2.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
118
+ "language_model.model.layers.2.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
119
+ "language_model.model.layers.2.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
120
+ "language_model.model.layers.2.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
121
+ "language_model.model.layers.2.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
122
+ "language_model.model.layers.2.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
123
+ "language_model.model.layers.2.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
124
+ "language_model.model.layers.2.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
125
+ "language_model.model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors",
126
+ "language_model.model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
127
+ "language_model.model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
128
+ "language_model.model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
129
+ "language_model.model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
130
+ "language_model.model.layers.20.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
131
+ "language_model.model.layers.20.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
132
+ "language_model.model.layers.20.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
133
+ "language_model.model.layers.20.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
134
+ "language_model.model.layers.21.input_layernorm.weight": "model-00005-of-00007.safetensors",
135
+ "language_model.model.layers.21.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
136
+ "language_model.model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
137
+ "language_model.model.layers.21.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
138
+ "language_model.model.layers.21.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
139
+ "language_model.model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
140
+ "language_model.model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
141
+ "language_model.model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
142
+ "language_model.model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
143
+ "language_model.model.layers.22.input_layernorm.weight": "model-00006-of-00007.safetensors",
144
+ "language_model.model.layers.22.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
145
+ "language_model.model.layers.22.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
146
+ "language_model.model.layers.22.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
147
+ "language_model.model.layers.22.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
148
+ "language_model.model.layers.22.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
149
+ "language_model.model.layers.22.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
150
+ "language_model.model.layers.22.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
151
+ "language_model.model.layers.22.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
152
+ "language_model.model.layers.23.input_layernorm.weight": "model-00006-of-00007.safetensors",
153
+ "language_model.model.layers.23.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
154
+ "language_model.model.layers.23.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
155
+ "language_model.model.layers.23.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
156
+ "language_model.model.layers.23.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
157
+ "language_model.model.layers.23.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
158
+ "language_model.model.layers.23.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
159
+ "language_model.model.layers.23.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
160
+ "language_model.model.layers.23.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
161
+ "language_model.model.layers.24.input_layernorm.weight": "model-00006-of-00007.safetensors",
162
+ "language_model.model.layers.24.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
163
+ "language_model.model.layers.24.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
164
+ "language_model.model.layers.24.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
165
+ "language_model.model.layers.24.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
166
+ "language_model.model.layers.24.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
167
+ "language_model.model.layers.24.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
168
+ "language_model.model.layers.24.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
169
+ "language_model.model.layers.24.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
170
+ "language_model.model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors",
171
+ "language_model.model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
172
+ "language_model.model.layers.25.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
173
+ "language_model.model.layers.25.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
174
+ "language_model.model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
175
+ "language_model.model.layers.25.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
176
+ "language_model.model.layers.25.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
177
+ "language_model.model.layers.25.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
178
+ "language_model.model.layers.25.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
179
+ "language_model.model.layers.26.input_layernorm.weight": "model-00006-of-00007.safetensors",
180
+ "language_model.model.layers.26.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
181
+ "language_model.model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
182
+ "language_model.model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
183
+ "language_model.model.layers.26.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
184
+ "language_model.model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
185
+ "language_model.model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
186
+ "language_model.model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
187
+ "language_model.model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
188
+ "language_model.model.layers.27.input_layernorm.weight": "model-00007-of-00007.safetensors",
189
+ "language_model.model.layers.27.mlp.down_proj.weight": "model-00007-of-00007.safetensors",
190
+ "language_model.model.layers.27.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
191
+ "language_model.model.layers.27.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
192
+ "language_model.model.layers.27.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
193
+ "language_model.model.layers.27.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
194
+ "language_model.model.layers.27.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
195
+ "language_model.model.layers.27.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
196
+ "language_model.model.layers.27.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
197
+ "language_model.model.layers.28.input_layernorm.weight": "model-00007-of-00007.safetensors",
198
+ "language_model.model.layers.28.mlp.down_proj.weight": "model-00007-of-00007.safetensors",
199
+ "language_model.model.layers.28.mlp.gate_proj.weight": "model-00007-of-00007.safetensors",
200
+ "language_model.model.layers.28.mlp.up_proj.weight": "model-00007-of-00007.safetensors",
201
+ "language_model.model.layers.28.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
202
+ "language_model.model.layers.28.self_attn.k_proj.weight": "model-00007-of-00007.safetensors",
203
+ "language_model.model.layers.28.self_attn.o_proj.weight": "model-00007-of-00007.safetensors",
204
+ "language_model.model.layers.28.self_attn.q_proj.weight": "model-00007-of-00007.safetensors",
205
+ "language_model.model.layers.28.self_attn.v_proj.weight": "model-00007-of-00007.safetensors",
206
+ "language_model.model.layers.29.input_layernorm.weight": "model-00007-of-00007.safetensors",
207
+ "language_model.model.layers.29.mlp.down_proj.weight": "model-00007-of-00007.safetensors",
208
+ "language_model.model.layers.29.mlp.gate_proj.weight": "model-00007-of-00007.safetensors",
209
+ "language_model.model.layers.29.mlp.up_proj.weight": "model-00007-of-00007.safetensors",
210
+ "language_model.model.layers.29.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
211
+ "language_model.model.layers.29.self_attn.k_proj.weight": "model-00007-of-00007.safetensors",
212
+ "language_model.model.layers.29.self_attn.o_proj.weight": "model-00007-of-00007.safetensors",
213
+ "language_model.model.layers.29.self_attn.q_proj.weight": "model-00007-of-00007.safetensors",
214
+ "language_model.model.layers.29.self_attn.v_proj.weight": "model-00007-of-00007.safetensors",
215
+ "language_model.model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors",
216
+ "language_model.model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
217
+ "language_model.model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
218
+ "language_model.model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
219
+ "language_model.model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
220
+ "language_model.model.layers.3.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
221
+ "language_model.model.layers.3.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
222
+ "language_model.model.layers.3.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
223
+ "language_model.model.layers.3.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
224
+ "language_model.model.layers.30.input_layernorm.weight": "model-00007-of-00007.safetensors",
225
+ "language_model.model.layers.30.mlp.down_proj.weight": "model-00007-of-00007.safetensors",
226
+ "language_model.model.layers.30.mlp.gate_proj.weight": "model-00007-of-00007.safetensors",
227
+ "language_model.model.layers.30.mlp.up_proj.weight": "model-00007-of-00007.safetensors",
228
+ "language_model.model.layers.30.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
229
+ "language_model.model.layers.30.self_attn.k_proj.weight": "model-00007-of-00007.safetensors",
230
+ "language_model.model.layers.30.self_attn.o_proj.weight": "model-00007-of-00007.safetensors",
231
+ "language_model.model.layers.30.self_attn.q_proj.weight": "model-00007-of-00007.safetensors",
232
+ "language_model.model.layers.30.self_attn.v_proj.weight": "model-00007-of-00007.safetensors",
233
+ "language_model.model.layers.31.input_layernorm.weight": "model-00007-of-00007.safetensors",
234
+ "language_model.model.layers.31.mlp.down_proj.weight": "model-00007-of-00007.safetensors",
235
+ "language_model.model.layers.31.mlp.gate_proj.weight": "model-00007-of-00007.safetensors",
236
+ "language_model.model.layers.31.mlp.up_proj.weight": "model-00007-of-00007.safetensors",
237
+ "language_model.model.layers.31.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
238
+ "language_model.model.layers.31.self_attn.k_proj.weight": "model-00007-of-00007.safetensors",
239
+ "language_model.model.layers.31.self_attn.o_proj.weight": "model-00007-of-00007.safetensors",
240
+ "language_model.model.layers.31.self_attn.q_proj.weight": "model-00007-of-00007.safetensors",
241
+ "language_model.model.layers.31.self_attn.v_proj.weight": "model-00007-of-00007.safetensors",
242
+ "language_model.model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors",
243
+ "language_model.model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
244
+ "language_model.model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
245
+ "language_model.model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
246
+ "language_model.model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
247
+ "language_model.model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
248
+ "language_model.model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
249
+ "language_model.model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
250
+ "language_model.model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
251
+ "language_model.model.layers.5.input_layernorm.weight": "model-00003-of-00007.safetensors",
252
+ "language_model.model.layers.5.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
253
+ "language_model.model.layers.5.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
254
+ "language_model.model.layers.5.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
255
+ "language_model.model.layers.5.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
256
+ "language_model.model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
257
+ "language_model.model.layers.5.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
258
+ "language_model.model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
259
+ "language_model.model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
260
+ "language_model.model.layers.6.input_layernorm.weight": "model-00003-of-00007.safetensors",
261
+ "language_model.model.layers.6.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
262
+ "language_model.model.layers.6.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
263
+ "language_model.model.layers.6.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
264
+ "language_model.model.layers.6.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
265
+ "language_model.model.layers.6.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
266
+ "language_model.model.layers.6.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
267
+ "language_model.model.layers.6.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
268
+ "language_model.model.layers.6.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
269
+ "language_model.model.layers.7.input_layernorm.weight": "model-00003-of-00007.safetensors",
270
+ "language_model.model.layers.7.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
271
+ "language_model.model.layers.7.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
272
+ "language_model.model.layers.7.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
273
+ "language_model.model.layers.7.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
274
+ "language_model.model.layers.7.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
275
+ "language_model.model.layers.7.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
276
+ "language_model.model.layers.7.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
277
+ "language_model.model.layers.7.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
278
+ "language_model.model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors",
279
+ "language_model.model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
280
+ "language_model.model.layers.8.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
281
+ "language_model.model.layers.8.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
282
+ "language_model.model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
283
+ "language_model.model.layers.8.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
284
+ "language_model.model.layers.8.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
285
+ "language_model.model.layers.8.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
286
+ "language_model.model.layers.8.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
287
+ "language_model.model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors",
288
+ "language_model.model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
289
+ "language_model.model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
290
+ "language_model.model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
291
+ "language_model.model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
292
+ "language_model.model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
293
+ "language_model.model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
294
+ "language_model.model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
295
+ "language_model.model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
296
+ "language_model.model.norm.weight": "model-00007-of-00007.safetensors",
297
+ "language_projection.bias": "model-00001-of-00007.safetensors",
298
+ "language_projection.weight": "model-00001-of-00007.safetensors",
299
+ "qformer.encoder.layer.0.attention.attention.key.bias": "model-00001-of-00007.safetensors",
300
+ "qformer.encoder.layer.0.attention.attention.key.weight": "model-00001-of-00007.safetensors",
301
+ "qformer.encoder.layer.0.attention.attention.query.bias": "model-00001-of-00007.safetensors",
302
+ "qformer.encoder.layer.0.attention.attention.query.weight": "model-00001-of-00007.safetensors",
303
+ "qformer.encoder.layer.0.attention.attention.value.bias": "model-00001-of-00007.safetensors",
304
+ "qformer.encoder.layer.0.attention.attention.value.weight": "model-00001-of-00007.safetensors",
305
+ "qformer.encoder.layer.0.attention.output.LayerNorm.bias": "model-00001-of-00007.safetensors",
306
+ "qformer.encoder.layer.0.attention.output.LayerNorm.weight": "model-00001-of-00007.safetensors",
307
+ "qformer.encoder.layer.0.attention.output.dense.bias": "model-00001-of-00007.safetensors",
308
+ "qformer.encoder.layer.0.attention.output.dense.weight": "model-00001-of-00007.safetensors",
309
+ "qformer.encoder.layer.0.crossattention.attention.key.bias": "model-00001-of-00007.safetensors",
310
+ "qformer.encoder.layer.0.crossattention.attention.key.weight": "model-00001-of-00007.safetensors",
311
+ "qformer.encoder.layer.0.crossattention.attention.query.bias": "model-00001-of-00007.safetensors",
312
+ "qformer.encoder.layer.0.crossattention.attention.query.weight": "model-00001-of-00007.safetensors",
313
+ "qformer.encoder.layer.0.crossattention.attention.value.bias": "model-00001-of-00007.safetensors",
314
+ "qformer.encoder.layer.0.crossattention.attention.value.weight": "model-00001-of-00007.safetensors",
315
+ "qformer.encoder.layer.0.crossattention.output.LayerNorm.bias": "model-00001-of-00007.safetensors",
316
+ "qformer.encoder.layer.0.crossattention.output.LayerNorm.weight": "model-00001-of-00007.safetensors",
317
+ "qformer.encoder.layer.0.crossattention.output.dense.bias": "model-00001-of-00007.safetensors",
318
+ "qformer.encoder.layer.0.crossattention.output.dense.weight": "model-00001-of-00007.safetensors",
319
+ "qformer.encoder.layer.0.intermediate_query.dense.bias": "model-00001-of-00007.safetensors",
320
+ "qformer.encoder.layer.0.intermediate_query.dense.weight": "model-00001-of-00007.safetensors",
321
+ "qformer.encoder.layer.0.output_query.LayerNorm.bias": "model-00001-of-00007.safetensors",
322
+ "qformer.encoder.layer.0.output_query.LayerNorm.weight": "model-00001-of-00007.safetensors",
323
+ "qformer.encoder.layer.0.output_query.dense.bias": "model-00001-of-00007.safetensors",
324
+ "qformer.encoder.layer.0.output_query.dense.weight": "model-00001-of-00007.safetensors",
325
+ "qformer.encoder.layer.1.attention.attention.key.bias": "model-00001-of-00007.safetensors",
326
+ "qformer.encoder.layer.1.attention.attention.key.weight": "model-00001-of-00007.safetensors",
327
+ "qformer.encoder.layer.1.attention.attention.query.bias": "model-00001-of-00007.safetensors",
328
+ "qformer.encoder.layer.1.attention.attention.query.weight": "model-00001-of-00007.safetensors",
329
+ "qformer.encoder.layer.1.attention.attention.value.bias": "model-00001-of-00007.safetensors",
330
+ "qformer.encoder.layer.1.attention.attention.value.weight": "model-00001-of-00007.safetensors",
331
+ "qformer.encoder.layer.1.attention.output.LayerNorm.bias": "model-00001-of-00007.safetensors",
332
+ "qformer.encoder.layer.1.attention.output.LayerNorm.weight": "model-00001-of-00007.safetensors",
333
+ "qformer.encoder.layer.1.attention.output.dense.bias": "model-00001-of-00007.safetensors",
334
+ "qformer.encoder.layer.1.attention.output.dense.weight": "model-00001-of-00007.safetensors",
335
+ "qformer.encoder.layer.1.intermediate_query.dense.bias": "model-00001-of-00007.safetensors",
336
+ "qformer.encoder.layer.1.intermediate_query.dense.weight": "model-00001-of-00007.safetensors",
337
+ "qformer.encoder.layer.1.output_query.LayerNorm.bias": "model-00001-of-00007.safetensors",
338
+ "qformer.encoder.layer.1.output_query.LayerNorm.weight": "model-00001-of-00007.safetensors",
339
+ "qformer.encoder.layer.1.output_query.dense.bias": "model-00001-of-00007.safetensors",
340
+ "qformer.encoder.layer.1.output_query.dense.weight": "model-00001-of-00007.safetensors",
341
+ "qformer.encoder.layer.10.attention.attention.key.bias": "model-00001-of-00007.safetensors",
342
+ "qformer.encoder.layer.10.attention.attention.key.weight": "model-00001-of-00007.safetensors",
343
+ "qformer.encoder.layer.10.attention.attention.query.bias": "model-00001-of-00007.safetensors",
344
+ "qformer.encoder.layer.10.attention.attention.query.weight": "model-00001-of-00007.safetensors",
345
+ "qformer.encoder.layer.10.attention.attention.value.bias": "model-00001-of-00007.safetensors",
346
+ "qformer.encoder.layer.10.attention.attention.value.weight": "model-00001-of-00007.safetensors",
347
+ "qformer.encoder.layer.10.attention.output.LayerNorm.bias": "model-00001-of-00007.safetensors",
348
+ "qformer.encoder.layer.10.attention.output.LayerNorm.weight": "model-00001-of-00007.safetensors",
349
+ "qformer.encoder.layer.10.attention.output.dense.bias": "model-00001-of-00007.safetensors",
350
+ "qformer.encoder.layer.10.attention.output.dense.weight": "model-00001-of-00007.safetensors",
351
+ "qformer.encoder.layer.10.crossattention.attention.key.bias": "model-00001-of-00007.safetensors",
352
+ "qformer.encoder.layer.10.crossattention.attention.key.weight": "model-00001-of-00007.safetensors",
353
+ "qformer.encoder.layer.10.crossattention.attention.query.bias": "model-00001-of-00007.safetensors",
354
+ "qformer.encoder.layer.10.crossattention.attention.query.weight": "model-00001-of-00007.safetensors",
355
+ "qformer.encoder.layer.10.crossattention.attention.value.bias": "model-00001-of-00007.safetensors",
356
+ "qformer.encoder.layer.10.crossattention.attention.value.weight": "model-00001-of-00007.safetensors",
357
+ "qformer.encoder.layer.10.crossattention.output.LayerNorm.bias": "model-00001-of-00007.safetensors",
358
+ "qformer.encoder.layer.10.crossattention.output.LayerNorm.weight": "model-00001-of-00007.safetensors",
359
+ "qformer.encoder.layer.10.crossattention.output.dense.bias": "model-00001-of-00007.safetensors",
360
+ "qformer.encoder.layer.10.crossattention.output.dense.weight": "model-00001-of-00007.safetensors",
361
+ "qformer.encoder.layer.10.intermediate_query.dense.bias": "model-00001-of-00007.safetensors",
362
+ "qformer.encoder.layer.10.intermediate_query.dense.weight": "model-00001-of-00007.safetensors",
363
+ "qformer.encoder.layer.10.output_query.LayerNorm.bias": "model-00001-of-00007.safetensors",
364
+ "qformer.encoder.layer.10.output_query.LayerNorm.weight": "model-00001-of-00007.safetensors",
365
+ "qformer.encoder.layer.10.output_query.dense.bias": "model-00001-of-00007.safetensors",
366
+ "qformer.encoder.layer.10.output_query.dense.weight": "model-00001-of-00007.safetensors",
367
+ "qformer.encoder.layer.11.attention.attention.key.bias": "model-00001-of-00007.safetensors",
368
+ "qformer.encoder.layer.11.attention.attention.key.weight": "model-00001-of-00007.safetensors",
369
+ "qformer.encoder.layer.11.attention.attention.query.bias": "model-00001-of-00007.safetensors",
370
+ "qformer.encoder.layer.11.attention.attention.query.weight": "model-00001-of-00007.safetensors",
371
+ "qformer.encoder.layer.11.attention.attention.value.bias": "model-00001-of-00007.safetensors",
372
+ "qformer.encoder.layer.11.attention.attention.value.weight": "model-00001-of-00007.safetensors",
373
+ "qformer.encoder.layer.11.attention.output.LayerNorm.bias": "model-00001-of-00007.safetensors",
374
+ "qformer.encoder.layer.11.attention.output.LayerNorm.weight": "model-00001-of-00007.safetensors",
375
+ "qformer.encoder.layer.11.attention.output.dense.bias": "model-00001-of-00007.safetensors",
376
+ "qformer.encoder.layer.11.attention.output.dense.weight": "model-00001-of-00007.safetensors",
377
+ "qformer.encoder.layer.11.intermediate_query.dense.bias": "model-00001-of-00007.safetensors",
378
+ "qformer.encoder.layer.11.intermediate_query.dense.weight": "model-00001-of-00007.safetensors",
379
+ "qformer.encoder.layer.11.output_query.LayerNorm.bias": "model-00001-of-00007.safetensors",
380
+ "qformer.encoder.layer.11.output_query.LayerNorm.weight": "model-00001-of-00007.safetensors",
381
+ "qformer.encoder.layer.11.output_query.dense.bias": "model-00001-of-00007.safetensors",
382
+ "qformer.encoder.layer.11.output_query.dense.weight": "model-00001-of-00007.safetensors",
383
+ "qformer.encoder.layer.2.attention.attention.key.bias": "model-00001-of-00007.safetensors",
384
+ "qformer.encoder.layer.2.attention.attention.key.weight": "model-00001-of-00007.safetensors",
385
+ "qformer.encoder.layer.2.attention.attention.query.bias": "model-00001-of-00007.safetensors",
386
+ "qformer.encoder.layer.2.attention.attention.query.weight": "model-00001-of-00007.safetensors",
387
+ "qformer.encoder.layer.2.attention.attention.value.bias": "model-00001-of-00007.safetensors",
388
+ "qformer.encoder.layer.2.attention.attention.value.weight": "model-00001-of-00007.safetensors",
389
+ "qformer.encoder.layer.2.attention.output.LayerNorm.bias": "model-00001-of-00007.safetensors",
390
+ "qformer.encoder.layer.2.attention.output.LayerNorm.weight": "model-00001-of-00007.safetensors",
391
+ "qformer.encoder.layer.2.attention.output.dense.bias": "model-00001-of-00007.safetensors",
392
+ "qformer.encoder.layer.2.attention.output.dense.weight": "model-00001-of-00007.safetensors",
393
+ "qformer.encoder.layer.2.crossattention.attention.key.bias": "model-00001-of-00007.safetensors",
394
+ "qformer.encoder.layer.2.crossattention.attention.key.weight": "model-00001-of-00007.safetensors",
395
+ "qformer.encoder.layer.2.crossattention.attention.query.bias": "model-00001-of-00007.safetensors",
396
+ "qformer.encoder.layer.2.crossattention.attention.query.weight": "model-00001-of-00007.safetensors",
397
+ "qformer.encoder.layer.2.crossattention.attention.value.bias": "model-00001-of-00007.safetensors",
398
+ "qformer.encoder.layer.2.crossattention.attention.value.weight": "model-00001-of-00007.safetensors",
399
+ "qformer.encoder.layer.2.crossattention.output.LayerNorm.bias": "model-00001-of-00007.safetensors",
400
+ "qformer.encoder.layer.2.crossattention.output.LayerNorm.weight": "model-00001-of-00007.safetensors",
401
+ "qformer.encoder.layer.2.crossattention.output.dense.bias": "model-00001-of-00007.safetensors",
402
+ "qformer.encoder.layer.2.crossattention.output.dense.weight": "model-00001-of-00007.safetensors",
403
+ "qformer.encoder.layer.2.intermediate_query.dense.bias": "model-00001-of-00007.safetensors",
404
+ "qformer.encoder.layer.2.intermediate_query.dense.weight": "model-00001-of-00007.safetensors",
405
+ "qformer.encoder.layer.2.output_query.LayerNorm.bias": "model-00001-of-00007.safetensors",
406
+ "qformer.encoder.layer.2.output_query.LayerNorm.weight": "model-00001-of-00007.safetensors",
407
+ "qformer.encoder.layer.2.output_query.dense.bias": "model-00001-of-00007.safetensors",
408
+ "qformer.encoder.layer.2.output_query.dense.weight": "model-00001-of-00007.safetensors",
409
+ "qformer.encoder.layer.3.attention.attention.key.bias": "model-00001-of-00007.safetensors",
410
+ "qformer.encoder.layer.3.attention.attention.key.weight": "model-00001-of-00007.safetensors",
411
+ "qformer.encoder.layer.3.attention.attention.query.bias": "model-00001-of-00007.safetensors",
412
+ "qformer.encoder.layer.3.attention.attention.query.weight": "model-00001-of-00007.safetensors",
413
+ "qformer.encoder.layer.3.attention.attention.value.bias": "model-00001-of-00007.safetensors",
414
+ "qformer.encoder.layer.3.attention.attention.value.weight": "model-00001-of-00007.safetensors",
415
+ "qformer.encoder.layer.3.attention.output.LayerNorm.bias": "model-00001-of-00007.safetensors",
416
+ "qformer.encoder.layer.3.attention.output.LayerNorm.weight": "model-00001-of-00007.safetensors",
417
+ "qformer.encoder.layer.3.attention.output.dense.bias": "model-00001-of-00007.safetensors",
418
+ "qformer.encoder.layer.3.attention.output.dense.weight": "model-00001-of-00007.safetensors",
419
+ "qformer.encoder.layer.3.intermediate_query.dense.bias": "model-00001-of-00007.safetensors",
420
+ "qformer.encoder.layer.3.intermediate_query.dense.weight": "model-00001-of-00007.safetensors",
421
+ "qformer.encoder.layer.3.output_query.LayerNorm.bias": "model-00001-of-00007.safetensors",
422
+ "qformer.encoder.layer.3.output_query.LayerNorm.weight": "model-00001-of-00007.safetensors",
423
+ "qformer.encoder.layer.3.output_query.dense.bias": "model-00001-of-00007.safetensors",
424
+ "qformer.encoder.layer.3.output_query.dense.weight": "model-00001-of-00007.safetensors",
425
+ "qformer.encoder.layer.4.attention.attention.key.bias": "model-00001-of-00007.safetensors",
426
+ "qformer.encoder.layer.4.attention.attention.key.weight": "model-00001-of-00007.safetensors",
427
+ "qformer.encoder.layer.4.attention.attention.query.bias": "model-00001-of-00007.safetensors",
428
+ "qformer.encoder.layer.4.attention.attention.query.weight": "model-00001-of-00007.safetensors",
429
+ "qformer.encoder.layer.4.attention.attention.value.bias": "model-00001-of-00007.safetensors",
430
+ "qformer.encoder.layer.4.attention.attention.value.weight": "model-00001-of-00007.safetensors",
431
+ "qformer.encoder.layer.4.attention.output.LayerNorm.bias": "model-00001-of-00007.safetensors",
432
+ "qformer.encoder.layer.4.attention.output.LayerNorm.weight": "model-00001-of-00007.safetensors",
433
+ "qformer.encoder.layer.4.attention.output.dense.bias": "model-00001-of-00007.safetensors",
434
+ "qformer.encoder.layer.4.attention.output.dense.weight": "model-00001-of-00007.safetensors",
435
+ "qformer.encoder.layer.4.crossattention.attention.key.bias": "model-00001-of-00007.safetensors",
436
+ "qformer.encoder.layer.4.crossattention.attention.key.weight": "model-00001-of-00007.safetensors",
437
+ "qformer.encoder.layer.4.crossattention.attention.query.bias": "model-00001-of-00007.safetensors",
438
+ "qformer.encoder.layer.4.crossattention.attention.query.weight": "model-00001-of-00007.safetensors",
439
+ "qformer.encoder.layer.4.crossattention.attention.value.bias": "model-00001-of-00007.safetensors",
440
+ "qformer.encoder.layer.4.crossattention.attention.value.weight": "model-00001-of-00007.safetensors",
441
+ "qformer.encoder.layer.4.crossattention.output.LayerNorm.bias": "model-00001-of-00007.safetensors",
442
+ "qformer.encoder.layer.4.crossattention.output.LayerNorm.weight": "model-00001-of-00007.safetensors",
443
+ "qformer.encoder.layer.4.crossattention.output.dense.bias": "model-00001-of-00007.safetensors",
444
+ "qformer.encoder.layer.4.crossattention.output.dense.weight": "model-00001-of-00007.safetensors",
445
+ "qformer.encoder.layer.4.intermediate_query.dense.bias": "model-00001-of-00007.safetensors",
446
+ "qformer.encoder.layer.4.intermediate_query.dense.weight": "model-00001-of-00007.safetensors",
447
+ "qformer.encoder.layer.4.output_query.LayerNorm.bias": "model-00001-of-00007.safetensors",
448
+ "qformer.encoder.layer.4.output_query.LayerNorm.weight": "model-00001-of-00007.safetensors",
449
+ "qformer.encoder.layer.4.output_query.dense.bias": "model-00001-of-00007.safetensors",
450
+ "qformer.encoder.layer.4.output_query.dense.weight": "model-00001-of-00007.safetensors",
451
+ "qformer.encoder.layer.5.attention.attention.key.bias": "model-00001-of-00007.safetensors",
452
+ "qformer.encoder.layer.5.attention.attention.key.weight": "model-00001-of-00007.safetensors",
453
+ "qformer.encoder.layer.5.attention.attention.query.bias": "model-00001-of-00007.safetensors",
454
+ "qformer.encoder.layer.5.attention.attention.query.weight": "model-00001-of-00007.safetensors",
455
+ "qformer.encoder.layer.5.attention.attention.value.bias": "model-00001-of-00007.safetensors",
456
+ "qformer.encoder.layer.5.attention.attention.value.weight": "model-00001-of-00007.safetensors",
457
+ "qformer.encoder.layer.5.attention.output.LayerNorm.bias": "model-00001-of-00007.safetensors",
458
+ "qformer.encoder.layer.5.attention.output.LayerNorm.weight": "model-00001-of-00007.safetensors",
459
+ "qformer.encoder.layer.5.attention.output.dense.bias": "model-00001-of-00007.safetensors",
460
+ "qformer.encoder.layer.5.attention.output.dense.weight": "model-00001-of-00007.safetensors",
461
+ "qformer.encoder.layer.5.intermediate_query.dense.bias": "model-00001-of-00007.safetensors",
462
+ "qformer.encoder.layer.5.intermediate_query.dense.weight": "model-00001-of-00007.safetensors",
463
+ "qformer.encoder.layer.5.output_query.LayerNorm.bias": "model-00001-of-00007.safetensors",
464
+ "qformer.encoder.layer.5.output_query.LayerNorm.weight": "model-00001-of-00007.safetensors",
465
+ "qformer.encoder.layer.5.output_query.dense.bias": "model-00001-of-00007.safetensors",
466
+ "qformer.encoder.layer.5.output_query.dense.weight": "model-00001-of-00007.safetensors",
467
+ "qformer.encoder.layer.6.attention.attention.key.bias": "model-00001-of-00007.safetensors",
468
+ "qformer.encoder.layer.6.attention.attention.key.weight": "model-00001-of-00007.safetensors",
469
+ "qformer.encoder.layer.6.attention.attention.query.bias": "model-00001-of-00007.safetensors",
470
+ "qformer.encoder.layer.6.attention.attention.query.weight": "model-00001-of-00007.safetensors",
471
+ "qformer.encoder.layer.6.attention.attention.value.bias": "model-00001-of-00007.safetensors",
472
+ "qformer.encoder.layer.6.attention.attention.value.weight": "model-00001-of-00007.safetensors",
473
+ "qformer.encoder.layer.6.attention.output.LayerNorm.bias": "model-00001-of-00007.safetensors",
474
+ "qformer.encoder.layer.6.attention.output.LayerNorm.weight": "model-00001-of-00007.safetensors",
475
+ "qformer.encoder.layer.6.attention.output.dense.bias": "model-00001-of-00007.safetensors",
476
+ "qformer.encoder.layer.6.attention.output.dense.weight": "model-00001-of-00007.safetensors",
477
+ "qformer.encoder.layer.6.crossattention.attention.key.bias": "model-00001-of-00007.safetensors",
478
+ "qformer.encoder.layer.6.crossattention.attention.key.weight": "model-00001-of-00007.safetensors",
479
+ "qformer.encoder.layer.6.crossattention.attention.query.bias": "model-00001-of-00007.safetensors",
480
+ "qformer.encoder.layer.6.crossattention.attention.query.weight": "model-00001-of-00007.safetensors",
481
+ "qformer.encoder.layer.6.crossattention.attention.value.bias": "model-00001-of-00007.safetensors",
482
+ "qformer.encoder.layer.6.crossattention.attention.value.weight": "model-00001-of-00007.safetensors",
483
+ "qformer.encoder.layer.6.crossattention.output.LayerNorm.bias": "model-00001-of-00007.safetensors",
484
+ "qformer.encoder.layer.6.crossattention.output.LayerNorm.weight": "model-00001-of-00007.safetensors",
485
+ "qformer.encoder.layer.6.crossattention.output.dense.bias": "model-00001-of-00007.safetensors",
486
+ "qformer.encoder.layer.6.crossattention.output.dense.weight": "model-00001-of-00007.safetensors",
487
+ "qformer.encoder.layer.6.intermediate_query.dense.bias": "model-00001-of-00007.safetensors",
488
+ "qformer.encoder.layer.6.intermediate_query.dense.weight": "model-00001-of-00007.safetensors",
489
+ "qformer.encoder.layer.6.output_query.LayerNorm.bias": "model-00001-of-00007.safetensors",
490
+ "qformer.encoder.layer.6.output_query.LayerNorm.weight": "model-00001-of-00007.safetensors",
491
+ "qformer.encoder.layer.6.output_query.dense.bias": "model-00001-of-00007.safetensors",
492
+ "qformer.encoder.layer.6.output_query.dense.weight": "model-00001-of-00007.safetensors",
493
+ "qformer.encoder.layer.7.attention.attention.key.bias": "model-00001-of-00007.safetensors",
494
+ "qformer.encoder.layer.7.attention.attention.key.weight": "model-00001-of-00007.safetensors",
495
+ "qformer.encoder.layer.7.attention.attention.query.bias": "model-00001-of-00007.safetensors",
496
+ "qformer.encoder.layer.7.attention.attention.query.weight": "model-00001-of-00007.safetensors",
497
+ "qformer.encoder.layer.7.attention.attention.value.bias": "model-00001-of-00007.safetensors",
498
+ "qformer.encoder.layer.7.attention.attention.value.weight": "model-00001-of-00007.safetensors",
499
+ "qformer.encoder.layer.7.attention.output.LayerNorm.bias": "model-00001-of-00007.safetensors",
500
+ "qformer.encoder.layer.7.attention.output.LayerNorm.weight": "model-00001-of-00007.safetensors",
501
+ "qformer.encoder.layer.7.attention.output.dense.bias": "model-00001-of-00007.safetensors",
502
+ "qformer.encoder.layer.7.attention.output.dense.weight": "model-00001-of-00007.safetensors",
503
+ "qformer.encoder.layer.7.intermediate_query.dense.bias": "model-00001-of-00007.safetensors",
504
+ "qformer.encoder.layer.7.intermediate_query.dense.weight": "model-00001-of-00007.safetensors",
505
+ "qformer.encoder.layer.7.output_query.LayerNorm.bias": "model-00001-of-00007.safetensors",
506
+ "qformer.encoder.layer.7.output_query.LayerNorm.weight": "model-00001-of-00007.safetensors",
507
+ "qformer.encoder.layer.7.output_query.dense.bias": "model-00001-of-00007.safetensors",
508
+ "qformer.encoder.layer.7.output_query.dense.weight": "model-00001-of-00007.safetensors",
509
+ "qformer.encoder.layer.8.attention.attention.key.bias": "model-00001-of-00007.safetensors",
510
+ "qformer.encoder.layer.8.attention.attention.key.weight": "model-00001-of-00007.safetensors",
511
+ "qformer.encoder.layer.8.attention.attention.query.bias": "model-00001-of-00007.safetensors",
512
+ "qformer.encoder.layer.8.attention.attention.query.weight": "model-00001-of-00007.safetensors",
513
+ "qformer.encoder.layer.8.attention.attention.value.bias": "model-00001-of-00007.safetensors",
514
+ "qformer.encoder.layer.8.attention.attention.value.weight": "model-00001-of-00007.safetensors",
515
+ "qformer.encoder.layer.8.attention.output.LayerNorm.bias": "model-00001-of-00007.safetensors",
516
+ "qformer.encoder.layer.8.attention.output.LayerNorm.weight": "model-00001-of-00007.safetensors",
517
+ "qformer.encoder.layer.8.attention.output.dense.bias": "model-00001-of-00007.safetensors",
518
+ "qformer.encoder.layer.8.attention.output.dense.weight": "model-00001-of-00007.safetensors",
519
+ "qformer.encoder.layer.8.crossattention.attention.key.bias": "model-00001-of-00007.safetensors",
520
+ "qformer.encoder.layer.8.crossattention.attention.key.weight": "model-00001-of-00007.safetensors",
521
+ "qformer.encoder.layer.8.crossattention.attention.query.bias": "model-00001-of-00007.safetensors",
522
+ "qformer.encoder.layer.8.crossattention.attention.query.weight": "model-00001-of-00007.safetensors",
523
+ "qformer.encoder.layer.8.crossattention.attention.value.bias": "model-00001-of-00007.safetensors",
524
+ "qformer.encoder.layer.8.crossattention.attention.value.weight": "model-00001-of-00007.safetensors",
525
+ "qformer.encoder.layer.8.crossattention.output.LayerNorm.bias": "model-00001-of-00007.safetensors",
526
+ "qformer.encoder.layer.8.crossattention.output.LayerNorm.weight": "model-00001-of-00007.safetensors",
527
+ "qformer.encoder.layer.8.crossattention.output.dense.bias": "model-00001-of-00007.safetensors",
528
+ "qformer.encoder.layer.8.crossattention.output.dense.weight": "model-00001-of-00007.safetensors",
529
+ "qformer.encoder.layer.8.intermediate_query.dense.bias": "model-00001-of-00007.safetensors",
530
+ "qformer.encoder.layer.8.intermediate_query.dense.weight": "model-00001-of-00007.safetensors",
531
+ "qformer.encoder.layer.8.output_query.LayerNorm.bias": "model-00001-of-00007.safetensors",
532
+ "qformer.encoder.layer.8.output_query.LayerNorm.weight": "model-00001-of-00007.safetensors",
533
+ "qformer.encoder.layer.8.output_query.dense.bias": "model-00001-of-00007.safetensors",
534
+ "qformer.encoder.layer.8.output_query.dense.weight": "model-00001-of-00007.safetensors",
535
+ "qformer.encoder.layer.9.attention.attention.key.bias": "model-00001-of-00007.safetensors",
536
+ "qformer.encoder.layer.9.attention.attention.key.weight": "model-00001-of-00007.safetensors",
537
+ "qformer.encoder.layer.9.attention.attention.query.bias": "model-00001-of-00007.safetensors",
538
+ "qformer.encoder.layer.9.attention.attention.query.weight": "model-00001-of-00007.safetensors",
539
+ "qformer.encoder.layer.9.attention.attention.value.bias": "model-00001-of-00007.safetensors",
540
+ "qformer.encoder.layer.9.attention.attention.value.weight": "model-00001-of-00007.safetensors",
541
+ "qformer.encoder.layer.9.attention.output.LayerNorm.bias": "model-00001-of-00007.safetensors",
542
+ "qformer.encoder.layer.9.attention.output.LayerNorm.weight": "model-00001-of-00007.safetensors",
543
+ "qformer.encoder.layer.9.attention.output.dense.bias": "model-00001-of-00007.safetensors",
544
+ "qformer.encoder.layer.9.attention.output.dense.weight": "model-00001-of-00007.safetensors",
545
+ "qformer.encoder.layer.9.intermediate_query.dense.bias": "model-00001-of-00007.safetensors",
546
+ "qformer.encoder.layer.9.intermediate_query.dense.weight": "model-00001-of-00007.safetensors",
547
+ "qformer.encoder.layer.9.output_query.LayerNorm.bias": "model-00001-of-00007.safetensors",
548
+ "qformer.encoder.layer.9.output_query.LayerNorm.weight": "model-00001-of-00007.safetensors",
549
+ "qformer.encoder.layer.9.output_query.dense.bias": "model-00001-of-00007.safetensors",
550
+ "qformer.encoder.layer.9.output_query.dense.weight": "model-00001-of-00007.safetensors",
551
+ "qformer.layernorm.bias": "model-00001-of-00007.safetensors",
552
+ "qformer.layernorm.weight": "model-00001-of-00007.safetensors",
553
+ "query_tokens": "model-00001-of-00007.safetensors",
554
+ "vision_model.embeddings.class_embedding": "model-00001-of-00007.safetensors",
555
+ "vision_model.embeddings.patch_embedding.bias": "model-00001-of-00007.safetensors",
556
+ "vision_model.embeddings.patch_embedding.weight": "model-00001-of-00007.safetensors",
557
+ "vision_model.embeddings.position_embedding": "model-00001-of-00007.safetensors",
558
+ "vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00007.safetensors",
559
+ "vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00007.safetensors",
560
+ "vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00007.safetensors",
561
+ "vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00007.safetensors",
562
+ "vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00007.safetensors",
563
+ "vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00007.safetensors",
564
+ "vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00007.safetensors",
565
+ "vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00007.safetensors",
566
+ "vision_model.encoder.layers.0.self_attn.projection.bias": "model-00001-of-00007.safetensors",
567
+ "vision_model.encoder.layers.0.self_attn.projection.weight": "model-00001-of-00007.safetensors",
568
+ "vision_model.encoder.layers.0.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
569
+ "vision_model.encoder.layers.0.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
570
+ "vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00007.safetensors",
571
+ "vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00007.safetensors",
572
+ "vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00007.safetensors",
573
+ "vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00007.safetensors",
574
+ "vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00007.safetensors",
575
+ "vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00007.safetensors",
576
+ "vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00007.safetensors",
577
+ "vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00007.safetensors",
578
+ "vision_model.encoder.layers.1.self_attn.projection.bias": "model-00001-of-00007.safetensors",
579
+ "vision_model.encoder.layers.1.self_attn.projection.weight": "model-00001-of-00007.safetensors",
580
+ "vision_model.encoder.layers.1.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
581
+ "vision_model.encoder.layers.1.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
582
+ "vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00007.safetensors",
583
+ "vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00007.safetensors",
584
+ "vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00007.safetensors",
585
+ "vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00007.safetensors",
586
+ "vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00007.safetensors",
587
+ "vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00007.safetensors",
588
+ "vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00007.safetensors",
589
+ "vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00007.safetensors",
590
+ "vision_model.encoder.layers.10.self_attn.projection.bias": "model-00001-of-00007.safetensors",
591
+ "vision_model.encoder.layers.10.self_attn.projection.weight": "model-00001-of-00007.safetensors",
592
+ "vision_model.encoder.layers.10.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
593
+ "vision_model.encoder.layers.10.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
594
+ "vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00007.safetensors",
595
+ "vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00007.safetensors",
596
+ "vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00007.safetensors",
597
+ "vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00007.safetensors",
598
+ "vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00007.safetensors",
599
+ "vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00007.safetensors",
600
+ "vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00007.safetensors",
601
+ "vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00007.safetensors",
602
+ "vision_model.encoder.layers.11.self_attn.projection.bias": "model-00001-of-00007.safetensors",
603
+ "vision_model.encoder.layers.11.self_attn.projection.weight": "model-00001-of-00007.safetensors",
604
+ "vision_model.encoder.layers.11.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
605
+ "vision_model.encoder.layers.11.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
606
+ "vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00007.safetensors",
607
+ "vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00007.safetensors",
608
+ "vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00007.safetensors",
609
+ "vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00007.safetensors",
610
+ "vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00007.safetensors",
611
+ "vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00007.safetensors",
612
+ "vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00007.safetensors",
613
+ "vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00007.safetensors",
614
+ "vision_model.encoder.layers.12.self_attn.projection.bias": "model-00001-of-00007.safetensors",
615
+ "vision_model.encoder.layers.12.self_attn.projection.weight": "model-00001-of-00007.safetensors",
616
+ "vision_model.encoder.layers.12.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
617
+ "vision_model.encoder.layers.12.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
618
+ "vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00007.safetensors",
619
+ "vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00007.safetensors",
620
+ "vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00007.safetensors",
621
+ "vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00007.safetensors",
622
+ "vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00007.safetensors",
623
+ "vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00007.safetensors",
624
+ "vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00007.safetensors",
625
+ "vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00007.safetensors",
626
+ "vision_model.encoder.layers.13.self_attn.projection.bias": "model-00001-of-00007.safetensors",
627
+ "vision_model.encoder.layers.13.self_attn.projection.weight": "model-00001-of-00007.safetensors",
628
+ "vision_model.encoder.layers.13.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
629
+ "vision_model.encoder.layers.13.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
630
+ "vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00007.safetensors",
631
+ "vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00007.safetensors",
632
+ "vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00007.safetensors",
633
+ "vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00007.safetensors",
634
+ "vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00007.safetensors",
635
+ "vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00007.safetensors",
636
+ "vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00007.safetensors",
637
+ "vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00007.safetensors",
638
+ "vision_model.encoder.layers.14.self_attn.projection.bias": "model-00001-of-00007.safetensors",
639
+ "vision_model.encoder.layers.14.self_attn.projection.weight": "model-00001-of-00007.safetensors",
640
+ "vision_model.encoder.layers.14.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
641
+ "vision_model.encoder.layers.14.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
642
+ "vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00007.safetensors",
643
+ "vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00007.safetensors",
644
+ "vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00007.safetensors",
645
+ "vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00007.safetensors",
646
+ "vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00007.safetensors",
647
+ "vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00007.safetensors",
648
+ "vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00007.safetensors",
649
+ "vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00007.safetensors",
650
+ "vision_model.encoder.layers.15.self_attn.projection.bias": "model-00001-of-00007.safetensors",
651
+ "vision_model.encoder.layers.15.self_attn.projection.weight": "model-00001-of-00007.safetensors",
652
+ "vision_model.encoder.layers.15.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
653
+ "vision_model.encoder.layers.15.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
654
+ "vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00007.safetensors",
655
+ "vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00007.safetensors",
656
+ "vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00007.safetensors",
657
+ "vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00007.safetensors",
658
+ "vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00007.safetensors",
659
+ "vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00007.safetensors",
660
+ "vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00007.safetensors",
661
+ "vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00007.safetensors",
662
+ "vision_model.encoder.layers.16.self_attn.projection.bias": "model-00001-of-00007.safetensors",
663
+ "vision_model.encoder.layers.16.self_attn.projection.weight": "model-00001-of-00007.safetensors",
664
+ "vision_model.encoder.layers.16.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
665
+ "vision_model.encoder.layers.16.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
666
+ "vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00007.safetensors",
667
+ "vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00007.safetensors",
668
+ "vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00007.safetensors",
669
+ "vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00007.safetensors",
670
+ "vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00007.safetensors",
671
+ "vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00007.safetensors",
672
+ "vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00007.safetensors",
673
+ "vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00007.safetensors",
674
+ "vision_model.encoder.layers.17.self_attn.projection.bias": "model-00001-of-00007.safetensors",
675
+ "vision_model.encoder.layers.17.self_attn.projection.weight": "model-00001-of-00007.safetensors",
676
+ "vision_model.encoder.layers.17.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
677
+ "vision_model.encoder.layers.17.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
678
+ "vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00007.safetensors",
679
+ "vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00007.safetensors",
680
+ "vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00007.safetensors",
681
+ "vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00007.safetensors",
682
+ "vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00007.safetensors",
683
+ "vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00007.safetensors",
684
+ "vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00007.safetensors",
685
+ "vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00007.safetensors",
686
+ "vision_model.encoder.layers.18.self_attn.projection.bias": "model-00001-of-00007.safetensors",
687
+ "vision_model.encoder.layers.18.self_attn.projection.weight": "model-00001-of-00007.safetensors",
688
+ "vision_model.encoder.layers.18.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
689
+ "vision_model.encoder.layers.18.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
690
+ "vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00007.safetensors",
691
+ "vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00007.safetensors",
692
+ "vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00007.safetensors",
693
+ "vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00007.safetensors",
694
+ "vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00007.safetensors",
695
+ "vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00007.safetensors",
696
+ "vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00007.safetensors",
697
+ "vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00007.safetensors",
698
+ "vision_model.encoder.layers.19.self_attn.projection.bias": "model-00001-of-00007.safetensors",
699
+ "vision_model.encoder.layers.19.self_attn.projection.weight": "model-00001-of-00007.safetensors",
700
+ "vision_model.encoder.layers.19.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
701
+ "vision_model.encoder.layers.19.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
702
+ "vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00007.safetensors",
703
+ "vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00007.safetensors",
704
+ "vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00007.safetensors",
705
+ "vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00007.safetensors",
706
+ "vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00007.safetensors",
707
+ "vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00007.safetensors",
708
+ "vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00007.safetensors",
709
+ "vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00007.safetensors",
710
+ "vision_model.encoder.layers.2.self_attn.projection.bias": "model-00001-of-00007.safetensors",
711
+ "vision_model.encoder.layers.2.self_attn.projection.weight": "model-00001-of-00007.safetensors",
712
+ "vision_model.encoder.layers.2.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
713
+ "vision_model.encoder.layers.2.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
714
+ "vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00007.safetensors",
715
+ "vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00007.safetensors",
716
+ "vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00007.safetensors",
717
+ "vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00007.safetensors",
718
+ "vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00007.safetensors",
719
+ "vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00007.safetensors",
720
+ "vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00007.safetensors",
721
+ "vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00007.safetensors",
722
+ "vision_model.encoder.layers.20.self_attn.projection.bias": "model-00001-of-00007.safetensors",
723
+ "vision_model.encoder.layers.20.self_attn.projection.weight": "model-00001-of-00007.safetensors",
724
+ "vision_model.encoder.layers.20.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
725
+ "vision_model.encoder.layers.20.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
726
+ "vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00007.safetensors",
727
+ "vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00007.safetensors",
728
+ "vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00007.safetensors",
729
+ "vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00007.safetensors",
730
+ "vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00007.safetensors",
731
+ "vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00007.safetensors",
732
+ "vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00007.safetensors",
733
+ "vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00007.safetensors",
734
+ "vision_model.encoder.layers.21.self_attn.projection.bias": "model-00001-of-00007.safetensors",
735
+ "vision_model.encoder.layers.21.self_attn.projection.weight": "model-00001-of-00007.safetensors",
736
+ "vision_model.encoder.layers.21.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
737
+ "vision_model.encoder.layers.21.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
738
+ "vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00007.safetensors",
739
+ "vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00007.safetensors",
740
+ "vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00007.safetensors",
741
+ "vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00007.safetensors",
742
+ "vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00007.safetensors",
743
+ "vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00007.safetensors",
744
+ "vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00007.safetensors",
745
+ "vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00007.safetensors",
746
+ "vision_model.encoder.layers.22.self_attn.projection.bias": "model-00001-of-00007.safetensors",
747
+ "vision_model.encoder.layers.22.self_attn.projection.weight": "model-00001-of-00007.safetensors",
748
+ "vision_model.encoder.layers.22.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
749
+ "vision_model.encoder.layers.22.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
750
+ "vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00007.safetensors",
751
+ "vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00007.safetensors",
752
+ "vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00007.safetensors",
753
+ "vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00007.safetensors",
754
+ "vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00007.safetensors",
755
+ "vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00007.safetensors",
756
+ "vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00007.safetensors",
757
+ "vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00007.safetensors",
758
+ "vision_model.encoder.layers.23.self_attn.projection.bias": "model-00001-of-00007.safetensors",
759
+ "vision_model.encoder.layers.23.self_attn.projection.weight": "model-00001-of-00007.safetensors",
760
+ "vision_model.encoder.layers.23.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
761
+ "vision_model.encoder.layers.23.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
762
+ "vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00007.safetensors",
763
+ "vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00007.safetensors",
764
+ "vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00007.safetensors",
765
+ "vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00007.safetensors",
766
+ "vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00007.safetensors",
767
+ "vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00007.safetensors",
768
+ "vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00007.safetensors",
769
+ "vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00007.safetensors",
770
+ "vision_model.encoder.layers.24.self_attn.projection.bias": "model-00001-of-00007.safetensors",
771
+ "vision_model.encoder.layers.24.self_attn.projection.weight": "model-00001-of-00007.safetensors",
772
+ "vision_model.encoder.layers.24.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
773
+ "vision_model.encoder.layers.24.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
774
+ "vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00007.safetensors",
775
+ "vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00007.safetensors",
776
+ "vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00007.safetensors",
777
+ "vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00007.safetensors",
778
+ "vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00007.safetensors",
779
+ "vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00007.safetensors",
780
+ "vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00007.safetensors",
781
+ "vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00007.safetensors",
782
+ "vision_model.encoder.layers.25.self_attn.projection.bias": "model-00001-of-00007.safetensors",
783
+ "vision_model.encoder.layers.25.self_attn.projection.weight": "model-00001-of-00007.safetensors",
784
+ "vision_model.encoder.layers.25.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
785
+ "vision_model.encoder.layers.25.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
786
+ "vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00007.safetensors",
787
+ "vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00007.safetensors",
788
+ "vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00007.safetensors",
789
+ "vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00007.safetensors",
790
+ "vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00007.safetensors",
791
+ "vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00007.safetensors",
792
+ "vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00007.safetensors",
793
+ "vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00007.safetensors",
794
+ "vision_model.encoder.layers.26.self_attn.projection.bias": "model-00001-of-00007.safetensors",
795
+ "vision_model.encoder.layers.26.self_attn.projection.weight": "model-00001-of-00007.safetensors",
796
+ "vision_model.encoder.layers.26.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
797
+ "vision_model.encoder.layers.26.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
798
+ "vision_model.encoder.layers.27.layer_norm1.bias": "model-00001-of-00007.safetensors",
799
+ "vision_model.encoder.layers.27.layer_norm1.weight": "model-00001-of-00007.safetensors",
800
+ "vision_model.encoder.layers.27.layer_norm2.bias": "model-00001-of-00007.safetensors",
801
+ "vision_model.encoder.layers.27.layer_norm2.weight": "model-00001-of-00007.safetensors",
802
+ "vision_model.encoder.layers.27.mlp.fc1.bias": "model-00001-of-00007.safetensors",
803
+ "vision_model.encoder.layers.27.mlp.fc1.weight": "model-00001-of-00007.safetensors",
804
+ "vision_model.encoder.layers.27.mlp.fc2.bias": "model-00001-of-00007.safetensors",
805
+ "vision_model.encoder.layers.27.mlp.fc2.weight": "model-00001-of-00007.safetensors",
806
+ "vision_model.encoder.layers.27.self_attn.projection.bias": "model-00001-of-00007.safetensors",
807
+ "vision_model.encoder.layers.27.self_attn.projection.weight": "model-00001-of-00007.safetensors",
808
+ "vision_model.encoder.layers.27.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
809
+ "vision_model.encoder.layers.27.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
810
+ "vision_model.encoder.layers.28.layer_norm1.bias": "model-00001-of-00007.safetensors",
811
+ "vision_model.encoder.layers.28.layer_norm1.weight": "model-00001-of-00007.safetensors",
812
+ "vision_model.encoder.layers.28.layer_norm2.bias": "model-00001-of-00007.safetensors",
813
+ "vision_model.encoder.layers.28.layer_norm2.weight": "model-00001-of-00007.safetensors",
814
+ "vision_model.encoder.layers.28.mlp.fc1.bias": "model-00001-of-00007.safetensors",
815
+ "vision_model.encoder.layers.28.mlp.fc1.weight": "model-00001-of-00007.safetensors",
816
+ "vision_model.encoder.layers.28.mlp.fc2.bias": "model-00001-of-00007.safetensors",
817
+ "vision_model.encoder.layers.28.mlp.fc2.weight": "model-00001-of-00007.safetensors",
818
+ "vision_model.encoder.layers.28.self_attn.projection.bias": "model-00001-of-00007.safetensors",
819
+ "vision_model.encoder.layers.28.self_attn.projection.weight": "model-00001-of-00007.safetensors",
820
+ "vision_model.encoder.layers.28.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
821
+ "vision_model.encoder.layers.28.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
822
+ "vision_model.encoder.layers.29.layer_norm1.bias": "model-00001-of-00007.safetensors",
823
+ "vision_model.encoder.layers.29.layer_norm1.weight": "model-00001-of-00007.safetensors",
824
+ "vision_model.encoder.layers.29.layer_norm2.bias": "model-00001-of-00007.safetensors",
825
+ "vision_model.encoder.layers.29.layer_norm2.weight": "model-00001-of-00007.safetensors",
826
+ "vision_model.encoder.layers.29.mlp.fc1.bias": "model-00001-of-00007.safetensors",
827
+ "vision_model.encoder.layers.29.mlp.fc1.weight": "model-00001-of-00007.safetensors",
828
+ "vision_model.encoder.layers.29.mlp.fc2.bias": "model-00001-of-00007.safetensors",
829
+ "vision_model.encoder.layers.29.mlp.fc2.weight": "model-00001-of-00007.safetensors",
830
+ "vision_model.encoder.layers.29.self_attn.projection.bias": "model-00001-of-00007.safetensors",
831
+ "vision_model.encoder.layers.29.self_attn.projection.weight": "model-00001-of-00007.safetensors",
832
+ "vision_model.encoder.layers.29.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
833
+ "vision_model.encoder.layers.29.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
834
+ "vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00007.safetensors",
835
+ "vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00007.safetensors",
836
+ "vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00007.safetensors",
837
+ "vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00007.safetensors",
838
+ "vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00007.safetensors",
839
+ "vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00007.safetensors",
840
+ "vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00007.safetensors",
841
+ "vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00007.safetensors",
842
+ "vision_model.encoder.layers.3.self_attn.projection.bias": "model-00001-of-00007.safetensors",
843
+ "vision_model.encoder.layers.3.self_attn.projection.weight": "model-00001-of-00007.safetensors",
844
+ "vision_model.encoder.layers.3.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
845
+ "vision_model.encoder.layers.3.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
846
+ "vision_model.encoder.layers.30.layer_norm1.bias": "model-00001-of-00007.safetensors",
847
+ "vision_model.encoder.layers.30.layer_norm1.weight": "model-00001-of-00007.safetensors",
848
+ "vision_model.encoder.layers.30.layer_norm2.bias": "model-00001-of-00007.safetensors",
849
+ "vision_model.encoder.layers.30.layer_norm2.weight": "model-00001-of-00007.safetensors",
850
+ "vision_model.encoder.layers.30.mlp.fc1.bias": "model-00001-of-00007.safetensors",
851
+ "vision_model.encoder.layers.30.mlp.fc1.weight": "model-00001-of-00007.safetensors",
852
+ "vision_model.encoder.layers.30.mlp.fc2.bias": "model-00001-of-00007.safetensors",
853
+ "vision_model.encoder.layers.30.mlp.fc2.weight": "model-00001-of-00007.safetensors",
854
+ "vision_model.encoder.layers.30.self_attn.projection.bias": "model-00001-of-00007.safetensors",
855
+ "vision_model.encoder.layers.30.self_attn.projection.weight": "model-00001-of-00007.safetensors",
856
+ "vision_model.encoder.layers.30.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
857
+ "vision_model.encoder.layers.30.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
858
+ "vision_model.encoder.layers.31.layer_norm1.bias": "model-00001-of-00007.safetensors",
859
+ "vision_model.encoder.layers.31.layer_norm1.weight": "model-00001-of-00007.safetensors",
860
+ "vision_model.encoder.layers.31.layer_norm2.bias": "model-00001-of-00007.safetensors",
861
+ "vision_model.encoder.layers.31.layer_norm2.weight": "model-00001-of-00007.safetensors",
862
+ "vision_model.encoder.layers.31.mlp.fc1.bias": "model-00001-of-00007.safetensors",
863
+ "vision_model.encoder.layers.31.mlp.fc1.weight": "model-00001-of-00007.safetensors",
864
+ "vision_model.encoder.layers.31.mlp.fc2.bias": "model-00001-of-00007.safetensors",
865
+ "vision_model.encoder.layers.31.mlp.fc2.weight": "model-00001-of-00007.safetensors",
866
+ "vision_model.encoder.layers.31.self_attn.projection.bias": "model-00001-of-00007.safetensors",
867
+ "vision_model.encoder.layers.31.self_attn.projection.weight": "model-00001-of-00007.safetensors",
868
+ "vision_model.encoder.layers.31.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
869
+ "vision_model.encoder.layers.31.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
870
+ "vision_model.encoder.layers.32.layer_norm1.bias": "model-00001-of-00007.safetensors",
871
+ "vision_model.encoder.layers.32.layer_norm1.weight": "model-00001-of-00007.safetensors",
872
+ "vision_model.encoder.layers.32.layer_norm2.bias": "model-00001-of-00007.safetensors",
873
+ "vision_model.encoder.layers.32.layer_norm2.weight": "model-00001-of-00007.safetensors",
874
+ "vision_model.encoder.layers.32.mlp.fc1.bias": "model-00001-of-00007.safetensors",
875
+ "vision_model.encoder.layers.32.mlp.fc1.weight": "model-00001-of-00007.safetensors",
876
+ "vision_model.encoder.layers.32.mlp.fc2.bias": "model-00001-of-00007.safetensors",
877
+ "vision_model.encoder.layers.32.mlp.fc2.weight": "model-00001-of-00007.safetensors",
878
+ "vision_model.encoder.layers.32.self_attn.projection.bias": "model-00001-of-00007.safetensors",
879
+ "vision_model.encoder.layers.32.self_attn.projection.weight": "model-00001-of-00007.safetensors",
880
+ "vision_model.encoder.layers.32.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
881
+ "vision_model.encoder.layers.32.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
882
+ "vision_model.encoder.layers.33.layer_norm1.bias": "model-00001-of-00007.safetensors",
883
+ "vision_model.encoder.layers.33.layer_norm1.weight": "model-00001-of-00007.safetensors",
884
+ "vision_model.encoder.layers.33.layer_norm2.bias": "model-00001-of-00007.safetensors",
885
+ "vision_model.encoder.layers.33.layer_norm2.weight": "model-00001-of-00007.safetensors",
886
+ "vision_model.encoder.layers.33.mlp.fc1.bias": "model-00001-of-00007.safetensors",
887
+ "vision_model.encoder.layers.33.mlp.fc1.weight": "model-00001-of-00007.safetensors",
888
+ "vision_model.encoder.layers.33.mlp.fc2.bias": "model-00001-of-00007.safetensors",
889
+ "vision_model.encoder.layers.33.mlp.fc2.weight": "model-00001-of-00007.safetensors",
890
+ "vision_model.encoder.layers.33.self_attn.projection.bias": "model-00001-of-00007.safetensors",
891
+ "vision_model.encoder.layers.33.self_attn.projection.weight": "model-00001-of-00007.safetensors",
892
+ "vision_model.encoder.layers.33.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
893
+ "vision_model.encoder.layers.33.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
894
+ "vision_model.encoder.layers.34.layer_norm1.bias": "model-00001-of-00007.safetensors",
895
+ "vision_model.encoder.layers.34.layer_norm1.weight": "model-00001-of-00007.safetensors",
896
+ "vision_model.encoder.layers.34.layer_norm2.bias": "model-00001-of-00007.safetensors",
897
+ "vision_model.encoder.layers.34.layer_norm2.weight": "model-00001-of-00007.safetensors",
898
+ "vision_model.encoder.layers.34.mlp.fc1.bias": "model-00001-of-00007.safetensors",
899
+ "vision_model.encoder.layers.34.mlp.fc1.weight": "model-00001-of-00007.safetensors",
900
+ "vision_model.encoder.layers.34.mlp.fc2.bias": "model-00001-of-00007.safetensors",
901
+ "vision_model.encoder.layers.34.mlp.fc2.weight": "model-00001-of-00007.safetensors",
902
+ "vision_model.encoder.layers.34.self_attn.projection.bias": "model-00001-of-00007.safetensors",
903
+ "vision_model.encoder.layers.34.self_attn.projection.weight": "model-00001-of-00007.safetensors",
904
+ "vision_model.encoder.layers.34.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
905
+ "vision_model.encoder.layers.34.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
906
+ "vision_model.encoder.layers.35.layer_norm1.bias": "model-00001-of-00007.safetensors",
907
+ "vision_model.encoder.layers.35.layer_norm1.weight": "model-00001-of-00007.safetensors",
908
+ "vision_model.encoder.layers.35.layer_norm2.bias": "model-00001-of-00007.safetensors",
909
+ "vision_model.encoder.layers.35.layer_norm2.weight": "model-00001-of-00007.safetensors",
910
+ "vision_model.encoder.layers.35.mlp.fc1.bias": "model-00001-of-00007.safetensors",
911
+ "vision_model.encoder.layers.35.mlp.fc1.weight": "model-00001-of-00007.safetensors",
912
+ "vision_model.encoder.layers.35.mlp.fc2.bias": "model-00001-of-00007.safetensors",
913
+ "vision_model.encoder.layers.35.mlp.fc2.weight": "model-00001-of-00007.safetensors",
914
+ "vision_model.encoder.layers.35.self_attn.projection.bias": "model-00001-of-00007.safetensors",
915
+ "vision_model.encoder.layers.35.self_attn.projection.weight": "model-00001-of-00007.safetensors",
916
+ "vision_model.encoder.layers.35.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
917
+ "vision_model.encoder.layers.35.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
918
+ "vision_model.encoder.layers.36.layer_norm1.bias": "model-00001-of-00007.safetensors",
919
+ "vision_model.encoder.layers.36.layer_norm1.weight": "model-00001-of-00007.safetensors",
920
+ "vision_model.encoder.layers.36.layer_norm2.bias": "model-00001-of-00007.safetensors",
921
+ "vision_model.encoder.layers.36.layer_norm2.weight": "model-00001-of-00007.safetensors",
922
+ "vision_model.encoder.layers.36.mlp.fc1.bias": "model-00001-of-00007.safetensors",
923
+ "vision_model.encoder.layers.36.mlp.fc1.weight": "model-00001-of-00007.safetensors",
924
+ "vision_model.encoder.layers.36.mlp.fc2.bias": "model-00001-of-00007.safetensors",
925
+ "vision_model.encoder.layers.36.mlp.fc2.weight": "model-00001-of-00007.safetensors",
926
+ "vision_model.encoder.layers.36.self_attn.projection.bias": "model-00001-of-00007.safetensors",
927
+ "vision_model.encoder.layers.36.self_attn.projection.weight": "model-00001-of-00007.safetensors",
928
+ "vision_model.encoder.layers.36.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
929
+ "vision_model.encoder.layers.36.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
930
+ "vision_model.encoder.layers.37.layer_norm1.bias": "model-00001-of-00007.safetensors",
931
+ "vision_model.encoder.layers.37.layer_norm1.weight": "model-00001-of-00007.safetensors",
932
+ "vision_model.encoder.layers.37.layer_norm2.bias": "model-00001-of-00007.safetensors",
933
+ "vision_model.encoder.layers.37.layer_norm2.weight": "model-00001-of-00007.safetensors",
934
+ "vision_model.encoder.layers.37.mlp.fc1.bias": "model-00001-of-00007.safetensors",
935
+ "vision_model.encoder.layers.37.mlp.fc1.weight": "model-00001-of-00007.safetensors",
936
+ "vision_model.encoder.layers.37.mlp.fc2.bias": "model-00001-of-00007.safetensors",
937
+ "vision_model.encoder.layers.37.mlp.fc2.weight": "model-00001-of-00007.safetensors",
938
+ "vision_model.encoder.layers.37.self_attn.projection.bias": "model-00001-of-00007.safetensors",
939
+ "vision_model.encoder.layers.37.self_attn.projection.weight": "model-00001-of-00007.safetensors",
940
+ "vision_model.encoder.layers.37.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
941
+ "vision_model.encoder.layers.37.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
942
+ "vision_model.encoder.layers.38.layer_norm1.bias": "model-00001-of-00007.safetensors",
943
+ "vision_model.encoder.layers.38.layer_norm1.weight": "model-00001-of-00007.safetensors",
944
+ "vision_model.encoder.layers.38.layer_norm2.bias": "model-00001-of-00007.safetensors",
945
+ "vision_model.encoder.layers.38.layer_norm2.weight": "model-00001-of-00007.safetensors",
946
+ "vision_model.encoder.layers.38.mlp.fc1.bias": "model-00001-of-00007.safetensors",
947
+ "vision_model.encoder.layers.38.mlp.fc1.weight": "model-00001-of-00007.safetensors",
948
+ "vision_model.encoder.layers.38.mlp.fc2.bias": "model-00001-of-00007.safetensors",
949
+ "vision_model.encoder.layers.38.mlp.fc2.weight": "model-00001-of-00007.safetensors",
950
+ "vision_model.encoder.layers.38.self_attn.projection.bias": "model-00001-of-00007.safetensors",
951
+ "vision_model.encoder.layers.38.self_attn.projection.weight": "model-00001-of-00007.safetensors",
952
+ "vision_model.encoder.layers.38.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
953
+ "vision_model.encoder.layers.38.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
954
+ "vision_model.encoder.layers.39.layer_norm1.bias": "model-00001-of-00007.safetensors",
955
+ "vision_model.encoder.layers.39.layer_norm1.weight": "model-00001-of-00007.safetensors",
956
+ "vision_model.encoder.layers.39.layer_norm2.bias": "model-00001-of-00007.safetensors",
957
+ "vision_model.encoder.layers.39.layer_norm2.weight": "model-00001-of-00007.safetensors",
958
+ "vision_model.encoder.layers.39.mlp.fc1.bias": "model-00001-of-00007.safetensors",
959
+ "vision_model.encoder.layers.39.mlp.fc1.weight": "model-00001-of-00007.safetensors",
960
+ "vision_model.encoder.layers.39.mlp.fc2.bias": "model-00001-of-00007.safetensors",
961
+ "vision_model.encoder.layers.39.mlp.fc2.weight": "model-00001-of-00007.safetensors",
962
+ "vision_model.encoder.layers.39.self_attn.projection.bias": "model-00001-of-00007.safetensors",
963
+ "vision_model.encoder.layers.39.self_attn.projection.weight": "model-00001-of-00007.safetensors",
964
+ "vision_model.encoder.layers.39.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
965
+ "vision_model.encoder.layers.39.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
966
+ "vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00007.safetensors",
967
+ "vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00007.safetensors",
968
+ "vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00007.safetensors",
969
+ "vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00007.safetensors",
970
+ "vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00007.safetensors",
971
+ "vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00007.safetensors",
972
+ "vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00007.safetensors",
973
+ "vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00007.safetensors",
974
+ "vision_model.encoder.layers.4.self_attn.projection.bias": "model-00001-of-00007.safetensors",
975
+ "vision_model.encoder.layers.4.self_attn.projection.weight": "model-00001-of-00007.safetensors",
976
+ "vision_model.encoder.layers.4.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
977
+ "vision_model.encoder.layers.4.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
978
+ "vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00007.safetensors",
979
+ "vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00007.safetensors",
980
+ "vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00007.safetensors",
981
+ "vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00007.safetensors",
982
+ "vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00007.safetensors",
983
+ "vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00007.safetensors",
984
+ "vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00007.safetensors",
985
+ "vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00007.safetensors",
986
+ "vision_model.encoder.layers.5.self_attn.projection.bias": "model-00001-of-00007.safetensors",
987
+ "vision_model.encoder.layers.5.self_attn.projection.weight": "model-00001-of-00007.safetensors",
988
+ "vision_model.encoder.layers.5.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
989
+ "vision_model.encoder.layers.5.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
990
+ "vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00007.safetensors",
991
+ "vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00007.safetensors",
992
+ "vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00007.safetensors",
993
+ "vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00007.safetensors",
994
+ "vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00007.safetensors",
995
+ "vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00007.safetensors",
996
+ "vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00007.safetensors",
997
+ "vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00007.safetensors",
998
+ "vision_model.encoder.layers.6.self_attn.projection.bias": "model-00001-of-00007.safetensors",
999
+ "vision_model.encoder.layers.6.self_attn.projection.weight": "model-00001-of-00007.safetensors",
1000
+ "vision_model.encoder.layers.6.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
1001
+ "vision_model.encoder.layers.6.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
1002
+ "vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00007.safetensors",
1003
+ "vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00007.safetensors",
1004
+ "vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00007.safetensors",
1005
+ "vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00007.safetensors",
1006
+ "vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00007.safetensors",
1007
+ "vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00007.safetensors",
1008
+ "vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00007.safetensors",
1009
+ "vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00007.safetensors",
1010
+ "vision_model.encoder.layers.7.self_attn.projection.bias": "model-00001-of-00007.safetensors",
1011
+ "vision_model.encoder.layers.7.self_attn.projection.weight": "model-00001-of-00007.safetensors",
1012
+ "vision_model.encoder.layers.7.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
1013
+ "vision_model.encoder.layers.7.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
1014
+ "vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00007.safetensors",
1015
+ "vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00007.safetensors",
1016
+ "vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00007.safetensors",
1017
+ "vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00007.safetensors",
1018
+ "vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00007.safetensors",
1019
+ "vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00007.safetensors",
1020
+ "vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00007.safetensors",
1021
+ "vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00007.safetensors",
1022
+ "vision_model.encoder.layers.8.self_attn.projection.bias": "model-00001-of-00007.safetensors",
1023
+ "vision_model.encoder.layers.8.self_attn.projection.weight": "model-00001-of-00007.safetensors",
1024
+ "vision_model.encoder.layers.8.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
1025
+ "vision_model.encoder.layers.8.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
1026
+ "vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00007.safetensors",
1027
+ "vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00007.safetensors",
1028
+ "vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00007.safetensors",
1029
+ "vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00007.safetensors",
1030
+ "vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00007.safetensors",
1031
+ "vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00007.safetensors",
1032
+ "vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00007.safetensors",
1033
+ "vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00007.safetensors",
1034
+ "vision_model.encoder.layers.9.self_attn.projection.bias": "model-00001-of-00007.safetensors",
1035
+ "vision_model.encoder.layers.9.self_attn.projection.weight": "model-00001-of-00007.safetensors",
1036
+ "vision_model.encoder.layers.9.self_attn.qkv.bias": "model-00001-of-00007.safetensors",
1037
+ "vision_model.encoder.layers.9.self_attn.qkv.weight": "model-00001-of-00007.safetensors",
1038
+ "vision_model.post_layernorm.bias": "model-00001-of-00007.safetensors",
1039
+ "vision_model.post_layernorm.weight": "model-00001-of-00007.safetensors"
1040
+ }
1041
+ }
modeling_chexagent.py ADDED
@@ -0,0 +1,1300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023 The CheXagent Authors, The Salesforce Authors and The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import math
17
+ from dataclasses import dataclass
18
+ from typing import Any, Optional, Tuple, Union
19
+
20
+ import torch
21
+ import torch.utils.checkpoint
22
+ from einops import rearrange
23
+ from torch import nn
24
+ from torch.nn import CrossEntropyLoss
25
+ from transformers.activations import ACT2FN
26
+ from transformers.modeling_outputs import (
27
+ BaseModelOutput,
28
+ BaseModelOutputWithPastAndCrossAttentions,
29
+ BaseModelOutputWithPooling,
30
+ BaseModelOutputWithPoolingAndCrossAttentions,
31
+ )
32
+ from transformers.modeling_utils import PreTrainedModel
33
+ from transformers.models.auto import AutoModelForCausalLM, AutoModelForSeq2SeqLM
34
+ from transformers.pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
35
+ from transformers.utils import ModelOutput, logging
36
+
37
+ from .configuration_chexagent import CheXagentConfig, CheXagentQFormerConfig, CheXagentVisionConfig
38
+
39
+ logger = logging.get_logger(__name__)
40
+
41
+
42
+ @dataclass
43
+ class CheXagentForConditionalGenerationModelOutput(ModelOutput):
44
+ loss: Optional[Tuple[torch.FloatTensor]] = None
45
+ logits: Optional[Tuple[torch.FloatTensor]] = None
46
+ vision_outputs: Optional[torch.FloatTensor] = None
47
+ qformer_outputs: Optional[Tuple[torch.FloatTensor]] = None
48
+ language_model_outputs: Optional[Tuple[torch.FloatTensor]] = None
49
+
50
+ def to_tuple(self) -> Tuple[Any]:
51
+ return tuple(
52
+ self[k]
53
+ if k not in ["vision_outputs", "qformer_outputs", "language_model_outputs"]
54
+ else getattr(self, k).to_tuple()
55
+ for k in self.keys()
56
+ )
57
+
58
+
59
+ class CheXagentVisionEmbeddings(nn.Module):
60
+ def __init__(self, config: CheXagentVisionConfig):
61
+ super().__init__()
62
+ self.config = config
63
+ self.embed_dim = config.hidden_size
64
+ self.image_size = config.image_size
65
+ self.patch_size = config.patch_size
66
+
67
+ self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim))
68
+
69
+ self.patch_embedding = nn.Conv2d(
70
+ in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
71
+ )
72
+
73
+ self.num_patches = (self.image_size // self.patch_size) ** 2
74
+ self.num_positions = self.num_patches + 1
75
+
76
+ self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))
77
+
78
+ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
79
+ batch_size = pixel_values.shape[0]
80
+ target_dtype = self.patch_embedding.weight.dtype
81
+ patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid]
82
+ patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
83
+
84
+ class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
85
+ embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
86
+ embeddings = embeddings + self.position_embedding[:, : embeddings.size(1), :].to(target_dtype)
87
+ return embeddings
88
+
89
+
90
+ class CheXagentAttention(nn.Module):
91
+ def __init__(self, config):
92
+ super().__init__()
93
+ self.config = config
94
+ self.embed_dim = config.hidden_size
95
+ self.num_heads = config.num_attention_heads
96
+ self.head_dim = self.embed_dim // self.num_heads
97
+ if self.head_dim * self.num_heads != self.embed_dim:
98
+ raise ValueError(
99
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
100
+ f" {self.num_heads})."
101
+ )
102
+ self.scale = self.head_dim ** -0.5
103
+ self.dropout = nn.Dropout(config.attention_dropout)
104
+
105
+ # small tweak here compared to CLIP, no bias here
106
+ self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=False)
107
+
108
+ if config.qkv_bias:
109
+ q_bias = nn.Parameter(torch.zeros(self.embed_dim))
110
+ v_bias = nn.Parameter(torch.zeros(self.embed_dim))
111
+ else:
112
+ q_bias = None
113
+ v_bias = None
114
+
115
+ if q_bias is not None:
116
+ qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias))
117
+ self.qkv.bias = nn.Parameter(qkv_bias)
118
+
119
+ self.projection = nn.Linear(self.embed_dim, self.embed_dim)
120
+
121
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
122
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
123
+
124
+ def forward(
125
+ self,
126
+ hidden_states: torch.Tensor,
127
+ head_mask: Optional[torch.Tensor] = None,
128
+ output_attentions: Optional[bool] = False,
129
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
130
+ bsz, tgt_len, embed_dim = hidden_states.size()
131
+
132
+ mixed_qkv = self.qkv(hidden_states)
133
+
134
+ mixed_qkv = mixed_qkv.reshape(bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads).permute(
135
+ 2, 0, 3, 1, 4
136
+ )
137
+ query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2]
138
+
139
+ # Take the dot product between "query" and "key" to get the raw attention scores.
140
+ attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2))
141
+
142
+ attention_scores = attention_scores * self.scale
143
+
144
+ # Normalize the attention scores to probabilities.
145
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
146
+
147
+ # This is actually dropping out entire tokens to attend to, which might
148
+ # seem a bit unusual, but is taken from the original Transformer paper.
149
+ attention_probs = self.dropout(attention_probs)
150
+
151
+ # Mask heads if we want to
152
+ if head_mask is not None:
153
+ attention_probs = attention_probs * head_mask
154
+
155
+ context_layer = torch.matmul(attention_probs, value_states).permute(0, 2, 1, 3)
156
+
157
+ new_context_layer_shape = context_layer.size()[:-2] + (self.embed_dim,)
158
+ context_layer = context_layer.reshape(new_context_layer_shape)
159
+
160
+ output = self.projection(context_layer)
161
+
162
+ outputs = (output, attention_probs) if output_attentions else (output, None)
163
+
164
+ return outputs
165
+
166
+
167
+ class CheXagentMLP(nn.Module):
168
+ def __init__(self, config):
169
+ super().__init__()
170
+ self.config = config
171
+ self.activation_fn = ACT2FN[config.hidden_act]
172
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
173
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
174
+
175
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
176
+ hidden_states = self.fc1(hidden_states)
177
+ hidden_states = self.activation_fn(hidden_states)
178
+ hidden_states = self.fc2(hidden_states)
179
+ return hidden_states
180
+
181
+
182
+ class CheXagentEncoderLayer(nn.Module):
183
+ def __init__(self, config: CheXagentConfig):
184
+ super().__init__()
185
+ self.embed_dim = config.hidden_size
186
+ self.self_attn = CheXagentAttention(config)
187
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
188
+ self.mlp = CheXagentMLP(config)
189
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
190
+
191
+ def forward(
192
+ self,
193
+ hidden_states: torch.Tensor,
194
+ attention_mask: torch.Tensor,
195
+ output_attentions: Optional[bool] = False,
196
+ ) -> Tuple[torch.FloatTensor]:
197
+ residual = hidden_states
198
+ hidden_states = self.layer_norm1(hidden_states)
199
+ hidden_states, attn_weights = self.self_attn(
200
+ hidden_states=hidden_states,
201
+ head_mask=attention_mask,
202
+ output_attentions=output_attentions,
203
+ )
204
+ hidden_states = hidden_states + residual
205
+ residual = hidden_states
206
+ hidden_states = self.layer_norm2(hidden_states)
207
+ hidden_states = self.mlp(hidden_states)
208
+
209
+ hidden_states = hidden_states + residual
210
+
211
+ outputs = (hidden_states,)
212
+
213
+ if output_attentions:
214
+ outputs += (attn_weights,)
215
+
216
+ return outputs
217
+
218
+
219
+ class CheXagentPreTrainedModel(PreTrainedModel):
220
+ config_class = CheXagentConfig
221
+ base_model_prefix = "chexagent"
222
+ supports_gradient_checkpointing = True
223
+ _no_split_modules = [
224
+ "CheXagentQFormerEmbeddings",
225
+ "CheXagentAttention",
226
+ "CheXagentQFormerMultiHeadAttention",
227
+ "CheXagentQFormerSelfOutput",
228
+ ]
229
+ _keep_in_fp32_modules = []
230
+
231
+ def _init_weights(self, module):
232
+ """Initialize the weights"""
233
+ factor = self.config.initializer_range
234
+ if isinstance(module, nn.Conv2d) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear):
235
+ module.weight.data.normal_(mean=0.0, std=factor)
236
+ if hasattr(module, "bias") and module.bias is not None:
237
+ module.bias.data.zero_()
238
+
239
+ if isinstance(module, CheXagentVisionEmbeddings):
240
+ if hasattr(self.config, "vision_config"):
241
+ factor = self.config.vision_config.initializer_range
242
+ nn.init.trunc_normal_(module.position_embedding, mean=0.0, std=factor)
243
+ nn.init.trunc_normal_(module.class_embedding, mean=0.0, std=factor)
244
+
245
+ elif isinstance(module, nn.LayerNorm):
246
+ module.bias.data.zero_()
247
+ module.weight.data.fill_(1.0)
248
+ elif isinstance(module, nn.Linear) and module.bias is not None:
249
+ module.bias.data.zero_()
250
+
251
+
252
+ class CheXagentEncoder(nn.Module):
253
+ def __init__(self, config: CheXagentConfig):
254
+ super().__init__()
255
+ self.config = config
256
+ self.layers = nn.ModuleList([CheXagentEncoderLayer(config) for _ in range(config.num_hidden_layers)])
257
+ self.gradient_checkpointing = False
258
+
259
+ def forward(
260
+ self,
261
+ inputs_embeds,
262
+ attention_mask: Optional[torch.Tensor] = None,
263
+ output_attentions: Optional[bool] = None,
264
+ output_hidden_states: Optional[bool] = None,
265
+ return_dict: Optional[bool] = None,
266
+ ) -> Union[Tuple, BaseModelOutput]:
267
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
268
+ output_hidden_states = (
269
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
270
+ )
271
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
272
+
273
+ encoder_states = () if output_hidden_states else None
274
+ all_attentions = () if output_attentions else None
275
+ hidden_states = inputs_embeds
276
+ for idx, encoder_layer in enumerate(self.layers):
277
+ if output_hidden_states:
278
+ encoder_states = encoder_states + (hidden_states,)
279
+ if self.gradient_checkpointing and self.training:
280
+ layer_outputs = self._gradient_checkpointing_func(
281
+ encoder_layer.__call__,
282
+ hidden_states,
283
+ attention_mask,
284
+ output_attentions,
285
+ )
286
+ else:
287
+ layer_outputs = encoder_layer(hidden_states, attention_mask, output_attentions=output_attentions, )
288
+
289
+ hidden_states = layer_outputs[0]
290
+
291
+ if output_attentions:
292
+ all_attentions = all_attentions + (layer_outputs[1],)
293
+
294
+ if output_hidden_states:
295
+ encoder_states = encoder_states + (hidden_states,)
296
+
297
+ if not return_dict:
298
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
299
+ return BaseModelOutput(
300
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
301
+ )
302
+
303
+
304
+ class CheXagentVisionModel(CheXagentPreTrainedModel):
305
+ main_input_name = "pixel_values"
306
+ config_class = CheXagentVisionConfig
307
+
308
+ def __init__(self, config: CheXagentVisionConfig):
309
+ super().__init__(config)
310
+ self.config = config
311
+ embed_dim = config.hidden_size
312
+
313
+ self.embeddings = CheXagentVisionEmbeddings(config)
314
+ self.encoder = CheXagentEncoder(config)
315
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
316
+
317
+ self.post_init()
318
+
319
+ def forward(
320
+ self,
321
+ pixel_values: Optional[torch.FloatTensor] = None,
322
+ output_attentions: Optional[bool] = None,
323
+ output_hidden_states: Optional[bool] = None,
324
+ return_dict: Optional[bool] = None,
325
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
326
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
327
+ output_hidden_states = (
328
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
329
+ )
330
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
331
+
332
+ if pixel_values is None:
333
+ raise ValueError("You have to specify pixel_values")
334
+ hidden_states = self.embeddings(pixel_values)
335
+
336
+ encoder_outputs = self.encoder(
337
+ inputs_embeds=hidden_states,
338
+ output_attentions=output_attentions,
339
+ output_hidden_states=output_hidden_states,
340
+ return_dict=return_dict,
341
+ )
342
+
343
+ last_hidden_state = encoder_outputs[0]
344
+ last_hidden_state = self.post_layernorm(last_hidden_state)
345
+
346
+ pooled_output = last_hidden_state[:, 0, :]
347
+ pooled_output = self.post_layernorm(pooled_output)
348
+
349
+ if not return_dict:
350
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
351
+
352
+ return BaseModelOutputWithPooling(
353
+ last_hidden_state=last_hidden_state,
354
+ pooler_output=pooled_output,
355
+ hidden_states=encoder_outputs.hidden_states,
356
+ attentions=encoder_outputs.attentions,
357
+ )
358
+
359
+ def get_input_embeddings(self):
360
+ return self.embeddings
361
+
362
+
363
+ class CheXagentQFormerMultiHeadAttention(nn.Module):
364
+ def __init__(self, config, is_cross_attention=False):
365
+ super().__init__()
366
+ self.config = config
367
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
368
+ raise ValueError(
369
+ "The hidden size (%d) is not a multiple of the number of attention heads (%d)"
370
+ % (config.hidden_size, config.num_attention_heads)
371
+ )
372
+
373
+ self.num_attention_heads = config.num_attention_heads
374
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
375
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
376
+
377
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
378
+ if is_cross_attention:
379
+ self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size)
380
+ self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size)
381
+ else:
382
+ self.key = nn.Linear(config.hidden_size, self.all_head_size)
383
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
384
+
385
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
386
+ self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
387
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
388
+ self.max_position_embeddings = config.max_position_embeddings
389
+ self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
390
+ self.save_attention = False
391
+
392
+ def save_attn_gradients(self, attn_gradients):
393
+ self.attn_gradients = attn_gradients
394
+
395
+ def get_attn_gradients(self):
396
+ return self.attn_gradients
397
+
398
+ def save_attention_map(self, attention_map):
399
+ self.attention_map = attention_map
400
+
401
+ def get_attention_map(self):
402
+ return self.attention_map
403
+
404
+ def transpose_for_scores(self, x):
405
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
406
+ x = x.view(*new_x_shape)
407
+ return x.permute(0, 2, 1, 3)
408
+
409
+ def forward(
410
+ self,
411
+ hidden_states,
412
+ attention_mask=None,
413
+ head_mask=None,
414
+ encoder_hidden_states=None,
415
+ encoder_attention_mask=None,
416
+ past_key_value=None,
417
+ output_attentions=False,
418
+ ):
419
+ # If this is instantiated as a cross-attention module, the keys
420
+ # and values come from an encoder; the attention mask needs to be
421
+ # such that the encoder's padding tokens are not attended to.
422
+ is_cross_attention = encoder_hidden_states is not None
423
+
424
+ if is_cross_attention:
425
+ key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
426
+ value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
427
+ attention_mask = encoder_attention_mask
428
+ elif past_key_value is not None:
429
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
430
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
431
+ key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
432
+ value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
433
+ else:
434
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
435
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
436
+
437
+ mixed_query_layer = self.query(hidden_states)
438
+
439
+ query_layer = self.transpose_for_scores(mixed_query_layer)
440
+
441
+ past_key_value = (key_layer, value_layer)
442
+
443
+ # Take the dot product between "query" and "key" to get the raw attention scores.
444
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
445
+
446
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
447
+ seq_length = hidden_states.size()[1]
448
+ position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
449
+ position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
450
+ distance = position_ids_l - position_ids_r
451
+ positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
452
+ positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility
453
+
454
+ if self.position_embedding_type == "relative_key":
455
+ relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
456
+ attention_scores = attention_scores + relative_position_scores
457
+ elif self.position_embedding_type == "relative_key_query":
458
+ relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
459
+ relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
460
+ attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
461
+
462
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
463
+
464
+ if attention_mask is not None:
465
+ # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
466
+ attention_scores = attention_scores + attention_mask
467
+
468
+ # Normalize the attention scores to probabilities.
469
+ attention_probs = nn.Softmax(dim=-1)(attention_scores)
470
+
471
+ if is_cross_attention and self.save_attention:
472
+ self.save_attention_map(attention_probs)
473
+ attention_probs.register_hook(self.save_attn_gradients)
474
+
475
+ # This is actually dropping out entire tokens to attend to, which might
476
+ # seem a bit unusual, but is taken from the original Transformer paper.
477
+ attention_probs_dropped = self.dropout(attention_probs)
478
+
479
+ # Mask heads if we want to
480
+ if head_mask is not None:
481
+ attention_probs_dropped = attention_probs_dropped * head_mask
482
+
483
+ context_layer = torch.matmul(attention_probs_dropped, value_layer)
484
+
485
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
486
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
487
+ context_layer = context_layer.view(*new_context_layer_shape)
488
+
489
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
490
+
491
+ outputs = outputs + (past_key_value,)
492
+ return outputs
493
+
494
+
495
+ class CheXagentQFormerSelfOutput(nn.Module):
496
+ def __init__(self, config):
497
+ super().__init__()
498
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
499
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
500
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
501
+
502
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
503
+ hidden_states = self.dense(hidden_states)
504
+ hidden_states = self.dropout(hidden_states)
505
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
506
+ return hidden_states
507
+
508
+
509
+ class CheXagentQFormerAttention(nn.Module):
510
+ def __init__(self, config, is_cross_attention=False):
511
+ super().__init__()
512
+ self.attention = CheXagentQFormerMultiHeadAttention(config, is_cross_attention)
513
+ self.output = CheXagentQFormerSelfOutput(config)
514
+ self.pruned_heads = set()
515
+
516
+ def prune_heads(self, heads):
517
+ if len(heads) == 0:
518
+ return
519
+ heads, index = find_pruneable_heads_and_indices(
520
+ heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
521
+ )
522
+
523
+ # Prune linear layers
524
+ self.attention.query = prune_linear_layer(self.attention.query, index)
525
+ self.attention.key = prune_linear_layer(self.attention.key, index)
526
+ self.attention.value = prune_linear_layer(self.attention.value, index)
527
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
528
+
529
+ # Update hyper params and store pruned heads
530
+ self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
531
+ self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
532
+ self.pruned_heads = self.pruned_heads.union(heads)
533
+
534
+ def forward(
535
+ self,
536
+ hidden_states: torch.Tensor,
537
+ attention_mask: Optional[torch.FloatTensor] = None,
538
+ head_mask: Optional[torch.FloatTensor] = None,
539
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
540
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
541
+ past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
542
+ output_attentions: Optional[bool] = False,
543
+ ) -> Tuple[torch.Tensor]:
544
+ self_outputs = self.attention(
545
+ hidden_states,
546
+ attention_mask,
547
+ head_mask,
548
+ encoder_hidden_states,
549
+ encoder_attention_mask,
550
+ past_key_value,
551
+ output_attentions,
552
+ )
553
+ attention_output = self.output(self_outputs[0], hidden_states)
554
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
555
+ return outputs
556
+
557
+
558
+ class CheXagentQFormerIntermediate(nn.Module):
559
+ def __init__(self, config):
560
+ super().__init__()
561
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
562
+ if isinstance(config.hidden_act, str):
563
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
564
+ else:
565
+ self.intermediate_act_fn = config.hidden_act
566
+
567
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
568
+ hidden_states = self.dense(hidden_states)
569
+ hidden_states = self.intermediate_act_fn(hidden_states)
570
+ return hidden_states
571
+
572
+
573
+ class CheXagentQFormerOutput(nn.Module):
574
+ def __init__(self, config):
575
+ super().__init__()
576
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
577
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
578
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
579
+
580
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
581
+ hidden_states = self.dense(hidden_states)
582
+ hidden_states = self.dropout(hidden_states)
583
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
584
+ return hidden_states
585
+
586
+
587
+ class CheXagentQFormerLayer(nn.Module):
588
+ def __init__(self, config, layer_idx):
589
+ super().__init__()
590
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
591
+ self.seq_len_dim = 1
592
+ self.attention = CheXagentQFormerAttention(config)
593
+
594
+ self.layer_idx = layer_idx
595
+
596
+ if layer_idx % config.cross_attention_frequency == 0:
597
+ self.crossattention = CheXagentQFormerAttention(config, is_cross_attention=True)
598
+ self.has_cross_attention = True
599
+ else:
600
+ self.has_cross_attention = False
601
+
602
+ self.intermediate_query = CheXagentQFormerIntermediate(config)
603
+ self.output_query = CheXagentQFormerOutput(config)
604
+
605
+ def forward(
606
+ self,
607
+ hidden_states,
608
+ attention_mask=None,
609
+ head_mask=None,
610
+ encoder_hidden_states=None,
611
+ encoder_attention_mask=None,
612
+ past_key_value=None,
613
+ output_attentions=False,
614
+ query_length=0,
615
+ ):
616
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
617
+ self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
618
+ self_attention_outputs = self.attention(
619
+ hidden_states,
620
+ attention_mask,
621
+ head_mask,
622
+ output_attentions=output_attentions,
623
+ past_key_value=self_attn_past_key_value,
624
+ )
625
+ attention_output = self_attention_outputs[0]
626
+ outputs = self_attention_outputs[1:-1]
627
+
628
+ present_key_value = self_attention_outputs[-1]
629
+
630
+ if query_length > 0:
631
+ query_attention_output = attention_output[:, :query_length, :]
632
+
633
+ if self.has_cross_attention:
634
+ if encoder_hidden_states is None:
635
+ raise ValueError("encoder_hidden_states must be given for cross-attention layers")
636
+ cross_attention_outputs = self.crossattention(
637
+ query_attention_output,
638
+ attention_mask,
639
+ head_mask,
640
+ encoder_hidden_states,
641
+ encoder_attention_mask,
642
+ output_attentions=output_attentions,
643
+ )
644
+ query_attention_output = cross_attention_outputs[0]
645
+ # add cross attentions if we output attention weights
646
+ outputs = outputs + cross_attention_outputs[1:-1]
647
+
648
+ layer_output = apply_chunking_to_forward(
649
+ self.feed_forward_chunk_query,
650
+ self.chunk_size_feed_forward,
651
+ self.seq_len_dim,
652
+ query_attention_output,
653
+ )
654
+
655
+ if attention_output.shape[1] > query_length:
656
+ layer_output_text = apply_chunking_to_forward(
657
+ self.feed_forward_chunk,
658
+ self.chunk_size_feed_forward,
659
+ self.seq_len_dim,
660
+ attention_output[:, query_length:, :],
661
+ )
662
+ layer_output = torch.cat([layer_output, layer_output_text], dim=1)
663
+ else:
664
+ layer_output = apply_chunking_to_forward(
665
+ self.feed_forward_chunk,
666
+ self.chunk_size_feed_forward,
667
+ self.seq_len_dim,
668
+ attention_output,
669
+ )
670
+ outputs = (layer_output,) + outputs
671
+
672
+ outputs = outputs + (present_key_value,)
673
+
674
+ return outputs
675
+
676
+ def feed_forward_chunk(self, attention_output):
677
+ intermediate_output = self.intermediate(attention_output)
678
+ layer_output = self.output(intermediate_output, attention_output)
679
+ return layer_output
680
+
681
+ def feed_forward_chunk_query(self, attention_output):
682
+ intermediate_output = self.intermediate_query(attention_output)
683
+ layer_output = self.output_query(intermediate_output, attention_output)
684
+ return layer_output
685
+
686
+
687
+ class CheXagentQFormerEncoder(nn.Module):
688
+ def __init__(self, config):
689
+ super().__init__()
690
+ self.config = config
691
+ self.layer = nn.ModuleList(
692
+ [CheXagentQFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
693
+ )
694
+ self.gradient_checkpointing = False
695
+
696
+ def forward(
697
+ self,
698
+ hidden_states,
699
+ attention_mask=None,
700
+ head_mask=None,
701
+ encoder_hidden_states=None,
702
+ encoder_attention_mask=None,
703
+ past_key_values=None,
704
+ use_cache=None,
705
+ output_attentions=False,
706
+ output_hidden_states=False,
707
+ return_dict=True,
708
+ query_length=0,
709
+ ):
710
+ all_hidden_states = () if output_hidden_states else None
711
+ all_self_attentions = () if output_attentions else None
712
+ all_cross_attentions = () if output_attentions else None
713
+
714
+ next_decoder_cache = () if use_cache else None
715
+
716
+ for i in range(self.config.num_hidden_layers):
717
+ layer_module = self.layer[i]
718
+ if output_hidden_states:
719
+ all_hidden_states = all_hidden_states + (hidden_states,)
720
+
721
+ layer_head_mask = head_mask[i] if head_mask is not None else None
722
+ past_key_value = past_key_values[i] if past_key_values is not None else None
723
+
724
+ if getattr(self.config, "gradient_checkpointing", False) and self.training:
725
+ if use_cache:
726
+ logger.warning(
727
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
728
+ )
729
+ use_cache = False
730
+ layer_outputs = self._gradient_checkpointing_func(
731
+ layer_module.__call__,
732
+ hidden_states,
733
+ attention_mask,
734
+ layer_head_mask,
735
+ encoder_hidden_states,
736
+ encoder_attention_mask,
737
+ )
738
+ else:
739
+ layer_outputs = layer_module(
740
+ hidden_states,
741
+ attention_mask,
742
+ layer_head_mask,
743
+ encoder_hidden_states,
744
+ encoder_attention_mask,
745
+ past_key_value,
746
+ output_attentions,
747
+ query_length,
748
+ )
749
+
750
+ hidden_states = layer_outputs[0]
751
+ if use_cache:
752
+ next_decoder_cache += (layer_outputs[-1],)
753
+ if output_attentions:
754
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
755
+ if layer_module.has_cross_attention:
756
+ all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
757
+
758
+ if output_hidden_states:
759
+ all_hidden_states = all_hidden_states + (hidden_states,)
760
+
761
+ if not return_dict:
762
+ return tuple(
763
+ v
764
+ for v in [
765
+ hidden_states,
766
+ next_decoder_cache,
767
+ all_hidden_states,
768
+ all_self_attentions,
769
+ all_cross_attentions,
770
+ ]
771
+ if v is not None
772
+ )
773
+ return BaseModelOutputWithPastAndCrossAttentions(
774
+ last_hidden_state=hidden_states,
775
+ past_key_values=next_decoder_cache,
776
+ hidden_states=all_hidden_states,
777
+ attentions=all_self_attentions,
778
+ cross_attentions=all_cross_attentions,
779
+ )
780
+
781
+
782
+ class CheXagentQFormerEmbeddings(nn.Module):
783
+ def __init__(self, config):
784
+ super().__init__()
785
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
786
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
787
+
788
+ self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
789
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
790
+
791
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
792
+ self.register_buffer(
793
+ "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
794
+ )
795
+ self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
796
+
797
+ self.config = config
798
+
799
+ def forward(
800
+ self,
801
+ input_ids=None,
802
+ position_ids=None,
803
+ query_embeds=None,
804
+ past_key_values_length=0,
805
+ ):
806
+ if input_ids is not None:
807
+ seq_length = input_ids.size()[1]
808
+ else:
809
+ seq_length = 0
810
+
811
+ if position_ids is None:
812
+ position_ids = self.position_ids[:, past_key_values_length: seq_length + past_key_values_length].clone()
813
+
814
+ if input_ids is not None:
815
+ embeddings = self.word_embeddings(input_ids)
816
+ if self.position_embedding_type == "absolute":
817
+ position_embeddings = self.position_embeddings(position_ids.to(embeddings.device))
818
+ embeddings = embeddings + position_embeddings
819
+
820
+ if query_embeds is not None:
821
+ embeddings = torch.cat((query_embeds, embeddings), dim=1)
822
+ else:
823
+ embeddings = query_embeds
824
+
825
+ embeddings = embeddings.to(self.layernorm.weight.dtype)
826
+ embeddings = self.layernorm(embeddings)
827
+ embeddings = self.dropout(embeddings)
828
+ return embeddings
829
+
830
+
831
+ class CheXagentQFormerEncoder(nn.Module):
832
+ def __init__(self, config):
833
+ super().__init__()
834
+ self.config = config
835
+ self.layer = nn.ModuleList(
836
+ [CheXagentQFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
837
+ )
838
+ self.gradient_checkpointing = False
839
+
840
+ def forward(
841
+ self,
842
+ hidden_states,
843
+ attention_mask=None,
844
+ head_mask=None,
845
+ encoder_hidden_states=None,
846
+ encoder_attention_mask=None,
847
+ past_key_values=None,
848
+ use_cache=None,
849
+ output_attentions=False,
850
+ output_hidden_states=False,
851
+ return_dict=True,
852
+ query_length=0,
853
+ ):
854
+ all_hidden_states = () if output_hidden_states else None
855
+ all_self_attentions = () if output_attentions else None
856
+ all_cross_attentions = () if output_attentions else None
857
+
858
+ next_decoder_cache = () if use_cache else None
859
+
860
+ for i in range(self.config.num_hidden_layers):
861
+ layer_module = self.layer[i]
862
+ if output_hidden_states:
863
+ all_hidden_states = all_hidden_states + (hidden_states,)
864
+
865
+ layer_head_mask = head_mask[i] if head_mask is not None else None
866
+ past_key_value = past_key_values[i] if past_key_values is not None else None
867
+
868
+ if getattr(self.config, "gradient_checkpointing", False) and self.training:
869
+ if use_cache:
870
+ logger.warning(
871
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
872
+ )
873
+ use_cache = False
874
+ layer_outputs = self._gradient_checkpointing_func(
875
+ layer_module.__call__,
876
+ hidden_states,
877
+ attention_mask,
878
+ layer_head_mask,
879
+ encoder_hidden_states,
880
+ encoder_attention_mask,
881
+ )
882
+ else:
883
+ layer_outputs = layer_module(
884
+ hidden_states,
885
+ attention_mask,
886
+ layer_head_mask,
887
+ encoder_hidden_states,
888
+ encoder_attention_mask,
889
+ past_key_value,
890
+ output_attentions,
891
+ query_length,
892
+ )
893
+
894
+ hidden_states = layer_outputs[0]
895
+ if use_cache:
896
+ next_decoder_cache += (layer_outputs[-1],)
897
+ if output_attentions:
898
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
899
+ if layer_module.has_cross_attention:
900
+ all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
901
+
902
+ if output_hidden_states:
903
+ all_hidden_states = all_hidden_states + (hidden_states,)
904
+
905
+ if not return_dict:
906
+ return tuple(
907
+ v
908
+ for v in [
909
+ hidden_states,
910
+ next_decoder_cache,
911
+ all_hidden_states,
912
+ all_self_attentions,
913
+ all_cross_attentions,
914
+ ]
915
+ if v is not None
916
+ )
917
+ return BaseModelOutputWithPastAndCrossAttentions(
918
+ last_hidden_state=hidden_states,
919
+ past_key_values=next_decoder_cache,
920
+ hidden_states=all_hidden_states,
921
+ attentions=all_self_attentions,
922
+ cross_attentions=all_cross_attentions,
923
+ )
924
+
925
+
926
+ class CheXagentQFormerModel(CheXagentPreTrainedModel):
927
+ def __init__(self, config: CheXagentQFormerConfig):
928
+ super().__init__(config)
929
+ self.config = config
930
+
931
+ self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
932
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
933
+
934
+ self.encoder = CheXagentQFormerEncoder(config)
935
+
936
+ self.post_init()
937
+
938
+ def get_input_embeddings(self):
939
+ return self.embeddings.word_embeddings
940
+
941
+ def set_input_embeddings(self, value):
942
+ self.embeddings.word_embeddings = value
943
+
944
+ def _prune_heads(self, heads_to_prune):
945
+ for layer, heads in heads_to_prune.items():
946
+ self.encoder.layer[layer].attention.prune_heads(heads)
947
+
948
+ def get_extended_attention_mask(
949
+ self,
950
+ attention_mask: torch.Tensor,
951
+ input_shape: Tuple[int],
952
+ device: torch.device,
953
+ has_query: bool = False,
954
+ ) -> torch.Tensor:
955
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
956
+ # ourselves in which case we just need to make it broadcastable to all heads.
957
+ if attention_mask.dim() == 3:
958
+ extended_attention_mask = attention_mask[:, None, :, :]
959
+ elif attention_mask.dim() == 2:
960
+ # Provided a padding mask of dimensions [batch_size, seq_length]
961
+ # - the model is an encoder, so make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
962
+ extended_attention_mask = attention_mask[:, None, None, :]
963
+ else:
964
+ raise ValueError(
965
+ "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
966
+ input_shape, attention_mask.shape
967
+ )
968
+ )
969
+
970
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
971
+ # masked positions, this operation will create a tensor which is 0.0 for
972
+ # positions we want to attend and -10000.0 for masked positions.
973
+ # Since we are adding it to the raw scores before the softmax, this is
974
+ # effectively the same as removing these entirely.
975
+ extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility
976
+ extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
977
+ return extended_attention_mask
978
+
979
+ def forward(
980
+ self,
981
+ query_embeds: torch.FloatTensor,
982
+ attention_mask: Optional[torch.FloatTensor] = None,
983
+ head_mask: Optional[torch.FloatTensor] = None,
984
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
985
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
986
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
987
+ use_cache: Optional[bool] = None,
988
+ output_attentions: Optional[bool] = None,
989
+ output_hidden_states: Optional[bool] = None,
990
+ return_dict: Optional[bool] = None,
991
+ ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
992
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
993
+ output_hidden_states = (
994
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
995
+ )
996
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
997
+
998
+ # past_key_values_length
999
+ past_key_values_length = (
1000
+ past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0
1001
+ )
1002
+
1003
+ query_length = query_embeds.shape[1] if query_embeds is not None else 0
1004
+
1005
+ embedding_output = self.layernorm(query_embeds)
1006
+ embedding_output = self.dropout(embedding_output)
1007
+
1008
+ input_shape = embedding_output.size()[:-1]
1009
+ batch_size, seq_length = input_shape
1010
+ device = embedding_output.device
1011
+
1012
+ if attention_mask is None:
1013
+ attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
1014
+
1015
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
1016
+ # ourselves in which case we just need to make it broadcastable to all heads.
1017
+ extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device)
1018
+
1019
+ # If a 2D or 3D attention mask is provided for the cross-attention
1020
+ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
1021
+ if encoder_hidden_states is not None:
1022
+ if type(encoder_hidden_states) == list:
1023
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size()
1024
+ else:
1025
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
1026
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
1027
+
1028
+ if type(encoder_attention_mask) == list:
1029
+ encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask]
1030
+ elif encoder_attention_mask is None:
1031
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
1032
+ encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
1033
+ else:
1034
+ encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
1035
+ else:
1036
+ encoder_extended_attention_mask = None
1037
+
1038
+ # Prepare head mask if needed
1039
+ # 1.0 in head_mask indicate we keep the head
1040
+ # attention_probs has shape bsz x n_heads x N x N
1041
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
1042
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
1043
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
1044
+
1045
+ encoder_outputs = self.encoder(
1046
+ embedding_output,
1047
+ attention_mask=extended_attention_mask,
1048
+ head_mask=head_mask,
1049
+ encoder_hidden_states=encoder_hidden_states,
1050
+ encoder_attention_mask=encoder_extended_attention_mask,
1051
+ past_key_values=past_key_values,
1052
+ use_cache=use_cache,
1053
+ output_attentions=output_attentions,
1054
+ output_hidden_states=output_hidden_states,
1055
+ return_dict=return_dict,
1056
+ query_length=query_length,
1057
+ )
1058
+ sequence_output = encoder_outputs[0]
1059
+ pooled_output = sequence_output[:, 0, :]
1060
+
1061
+ if not return_dict:
1062
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
1063
+
1064
+ return BaseModelOutputWithPoolingAndCrossAttentions(
1065
+ last_hidden_state=sequence_output,
1066
+ pooler_output=pooled_output,
1067
+ past_key_values=encoder_outputs.past_key_values,
1068
+ hidden_states=encoder_outputs.hidden_states,
1069
+ attentions=encoder_outputs.attentions,
1070
+ cross_attentions=encoder_outputs.cross_attentions,
1071
+ )
1072
+
1073
+
1074
+ class CheXagentForConditionalGeneration(CheXagentPreTrainedModel):
1075
+ config_class = CheXagentConfig
1076
+ main_input_name = "pixel_values"
1077
+
1078
+ def __init__(self, config: CheXagentConfig):
1079
+ super().__init__(config)
1080
+
1081
+ self.vision_model = CheXagentVisionModel(config.vision_config)
1082
+
1083
+ self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size))
1084
+ self.qformer = CheXagentQFormerModel(config.qformer_config)
1085
+
1086
+ self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size)
1087
+ if config.use_decoder_only_language_model:
1088
+ language_model = AutoModelForCausalLM.from_config(config.text_config)
1089
+ else:
1090
+ language_model = AutoModelForSeq2SeqLM.from_config(config.text_config)
1091
+
1092
+ # Update _tied_weights_keys using the base model used.
1093
+ if language_model._tied_weights_keys is not None:
1094
+ self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys]
1095
+
1096
+ self.language_model = language_model
1097
+
1098
+ # Initialize weights and apply final processing
1099
+ self.post_init()
1100
+
1101
+ def get_input_embeddings(self):
1102
+ return self.language_model.get_input_embeddings()
1103
+
1104
+ def set_input_embeddings(self, value):
1105
+ self.language_model.set_input_embeddings(value)
1106
+
1107
+ def set_output_embeddings(self, new_embeddings):
1108
+ self.language_model.set_output_embeddings(new_embeddings)
1109
+
1110
+ def get_output_embeddings(self) -> nn.Module:
1111
+ return self.language_model.get_output_embeddings()
1112
+
1113
+ def get_encoder(self):
1114
+ return self.language_model.get_encoder()
1115
+
1116
+ def get_decoder(self):
1117
+ return self.language_model.get_decoder()
1118
+
1119
+ def _tie_weights(self):
1120
+ if not self.config.use_decoder_only_language_model:
1121
+ self.language_model.encoder.embed_tokens = self.language_model.shared
1122
+ self.language_model.decoder.embed_tokens = self.language_model.shared
1123
+
1124
+ def _preprocess_accelerate(self):
1125
+ hf_device_map = self.hf_device_map
1126
+ if len(hf_device_map) > 1 and "language_model" not in hf_device_map and torch.cuda.device_count() > 1:
1127
+ # warn users about unexpected behavior when using multi-GPU + BLIP-2 + `accelerate`.
1128
+ logger.warning(
1129
+ "The `language_model` is not in the `hf_device_map` dictionary and you are running your script"
1130
+ " in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`."
1131
+ " Please pass a `device_map` that contains `language_model` to remove this warning."
1132
+ " Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for"
1133
+ " more details on creating a `device_map` for large models.",
1134
+ )
1135
+ if hasattr(self.language_model, "_hf_hook"):
1136
+ self.language_model._hf_hook.io_same_device = True # For `generate` compatibility
1137
+
1138
+ def forward(
1139
+ self,
1140
+ pixel_values: torch.FloatTensor = None,
1141
+ input_ids: torch.FloatTensor = None,
1142
+ attention_mask: Optional[torch.LongTensor] = None,
1143
+ output_attentions: Optional[bool] = None,
1144
+ output_hidden_states: Optional[bool] = None,
1145
+ labels: Optional[torch.LongTensor] = None,
1146
+ return_dict: Optional[bool] = None,
1147
+ ) -> Union[Tuple, CheXagentForConditionalGenerationModelOutput]:
1148
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1149
+
1150
+ vision_outputs, query_outputs = None, None
1151
+ if pixel_values is not None:
1152
+ # step 1: forward the images through the vision encoder,
1153
+ # to get image embeddings of shape (batch_size, seq_len, hidden_size)
1154
+ image_mask = pixel_values.sum(dim=(2, 3, 4)) != 0
1155
+ vision_outputs = self.vision_model(
1156
+ pixel_values=pixel_values[image_mask],
1157
+ output_attentions=output_attentions,
1158
+ output_hidden_states=output_hidden_states,
1159
+ return_dict=return_dict,
1160
+ )
1161
+ tmp = vision_outputs[0]
1162
+ image_embeds = tmp.new_zeros((*image_mask.shape, *tmp.shape[1:]))
1163
+ image_embeds[image_mask] = tmp
1164
+
1165
+ # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
1166
+ image_attention_mask = torch.zeros(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
1167
+ image_attention_mask[image_mask] = 1
1168
+
1169
+ image_embeds = rearrange(image_embeds, "b i n d -> b (i n) d")
1170
+ image_attention_mask = rearrange(image_attention_mask, "b i n -> b (i n)")
1171
+ query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
1172
+ query_outputs = self.qformer(
1173
+ query_embeds=query_tokens,
1174
+ encoder_hidden_states=image_embeds,
1175
+ encoder_attention_mask=image_attention_mask,
1176
+ output_attentions=output_attentions,
1177
+ output_hidden_states=output_hidden_states,
1178
+ return_dict=return_dict,
1179
+ )
1180
+ query_output = query_outputs[0]
1181
+
1182
+ # step 3: project vision to language
1183
+ input_vis = self.language_projection(query_output)
1184
+ vis_atts = torch.ones(input_vis.size()[:-1], dtype=torch.long, device=input_vis.device)
1185
+
1186
+ # step 4: get the embeddings of the prompt
1187
+ inputs_lang = self.language_model.get_input_embeddings()(input_ids)
1188
+ lang_atts = attention_mask
1189
+ if lang_atts is None:
1190
+ lang_atts = torch.ones_like(input_ids)
1191
+
1192
+ # step 5: conditioned on the images and/or prompts
1193
+ if pixel_values is not None:
1194
+ inputs_embeds = torch.cat([input_vis, inputs_lang], dim=1)
1195
+ attention_mask = torch.cat([vis_atts, lang_atts], dim=1)
1196
+ else:
1197
+ inputs_embeds = inputs_lang
1198
+ attention_mask = lang_atts
1199
+
1200
+ outputs = self.language_model(
1201
+ inputs_embeds=inputs_embeds,
1202
+ attention_mask=attention_mask,
1203
+ output_attentions=output_attentions,
1204
+ output_hidden_states=output_hidden_states,
1205
+ return_dict=return_dict
1206
+ )
1207
+ logits = outputs.logits if return_dict else outputs[0]
1208
+
1209
+ loss = None
1210
+ # we compute the loss here since we need to take into account the sequence length of the query embeds
1211
+ if labels is not None:
1212
+ # make target
1213
+ empty_labels = torch.ones(vis_atts.size(), dtype=torch.long, device=input_ids.device).fill_(-100)
1214
+ labels = torch.cat([empty_labels, labels], dim=1)
1215
+ labels = labels.to(logits.device)
1216
+ logits = logits[:, -labels.size(1):, :]
1217
+ # Shift so that tokens < n predict n
1218
+ shift_logits = logits[..., :-1, :].contiguous()
1219
+ shift_labels = labels[..., 1:].contiguous().to(logits.device)
1220
+ # Flatten the tokens
1221
+ loss_fct = CrossEntropyLoss(reduction="mean")
1222
+ loss = loss_fct(shift_logits.view(-1, self.config.text_config.vocab_size), shift_labels.view(-1))
1223
+
1224
+ if not return_dict:
1225
+ output = (logits, vision_outputs, query_outputs, outputs)
1226
+ return ((loss,) + output) if loss is not None else output
1227
+
1228
+ return CheXagentForConditionalGenerationModelOutput(
1229
+ loss=loss,
1230
+ logits=logits,
1231
+ vision_outputs=vision_outputs,
1232
+ qformer_outputs=query_outputs,
1233
+ language_model_outputs=outputs,
1234
+ )
1235
+
1236
+ @torch.no_grad()
1237
+ def generate(
1238
+ self,
1239
+ pixel_values: torch.FloatTensor = None,
1240
+ input_ids: Optional[torch.LongTensor] = None,
1241
+ attention_mask: Optional[torch.LongTensor] = None,
1242
+ **generate_kwargs,
1243
+ ) -> torch.LongTensor:
1244
+ if hasattr(self, "hf_device_map"):
1245
+ # preprocess for `accelerate`
1246
+ self._preprocess_accelerate()
1247
+
1248
+ batch_size = pixel_values.shape[0] if pixel_values is not None else input_ids.shape[0]
1249
+ if pixel_values is not None:
1250
+ # step 1: forward the images through the vision encoder
1251
+ image_mask = pixel_values.sum(dim=(2, 3, 4)) != 0
1252
+ vision_outputs = self.vision_model(pixel_values[image_mask], return_dict=True)
1253
+ tmp = vision_outputs[0]
1254
+ image_embeds = tmp.new_zeros((*image_mask.shape, *tmp.shape[1:]))
1255
+ image_embeds[image_mask] = tmp
1256
+
1257
+ # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
1258
+ image_attention_mask = torch.zeros(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
1259
+ image_attention_mask[image_mask] = 1
1260
+ image_embeds = rearrange(image_embeds, "b i n d -> b (i n) d")
1261
+ image_attention_mask = rearrange(image_attention_mask, "b i n -> b (i n)")
1262
+ query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
1263
+ query_outputs = self.qformer(
1264
+ query_embeds=query_tokens,
1265
+ encoder_hidden_states=image_embeds,
1266
+ encoder_attention_mask=image_attention_mask,
1267
+ return_dict=True,
1268
+ )
1269
+ query_output = query_outputs.last_hidden_state
1270
+
1271
+ # step 3: project vision to language
1272
+ input_vis = self.language_projection(query_output)
1273
+ vis_atts = torch.ones(input_vis.size()[:-1], dtype=torch.long, device=input_vis.device)
1274
+
1275
+ # step 4: get the embeddings of the prompt
1276
+ if input_ids is None:
1277
+ input_ids = (
1278
+ torch.LongTensor([[self.config.text_config.bos_token_id]])
1279
+ .repeat(batch_size, 1)
1280
+ .to(next(self.parameters()).device)
1281
+ )
1282
+ inputs_lang = self.language_model.get_input_embeddings()(input_ids)
1283
+ lang_atts = attention_mask
1284
+ if lang_atts is None:
1285
+ lang_atts = torch.ones_like(input_ids)
1286
+
1287
+ # step 5: conditioned on the images and/or prompts
1288
+ if pixel_values is not None:
1289
+ inputs_embeds = torch.cat([input_vis, inputs_lang], dim=1)
1290
+ attention_mask = torch.cat([vis_atts, lang_atts], dim=1)
1291
+ else:
1292
+ inputs_embeds = inputs_lang
1293
+ attention_mask = lang_atts
1294
+
1295
+ outputs = self.language_model.generate(
1296
+ inputs_embeds=inputs_embeds,
1297
+ attention_mask=attention_mask,
1298
+ **generate_kwargs,
1299
+ )
1300
+ return outputs
preprocessor_config.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": true,
3
+ "do_normalize": true,
4
+ "do_rescale": true,
5
+ "do_resize": true,
6
+ "image_mean": [
7
+ 0.48145466,
8
+ 0.4578275,
9
+ 0.40821073
10
+ ],
11
+ "image_processor_type": "BlipImageProcessor",
12
+ "image_std": [
13
+ 0.26862954,
14
+ 0.26130258,
15
+ 0.27577711
16
+ ],
17
+ "processor_class": "CheXagentProcessor",
18
+ "resample": 3,
19
+ "rescale_factor": 0.00392156862745098,
20
+ "size": {
21
+ "height": 448,
22
+ "width": 448
23
+ },
24
+ "auto_map": {
25
+ "AutoProcessor": "processing_chexagent.CheXagentProcessor"
26
+ }
27
+ }
processing_chexagent.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023 The CheXagent Authors and The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import List, Optional, Union
17
+
18
+ import torch
19
+ from transformers.image_utils import ImageInput
20
+ from transformers.processing_utils import ProcessorMixin
21
+ from transformers.tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput
22
+ from transformers.tokenization_utils_base import TruncationStrategy
23
+ from transformers.utils import TensorType
24
+
25
+
26
+ class CheXagentProcessor(ProcessorMixin):
27
+ attributes = ["image_processor", "tokenizer"]
28
+ image_processor_class = "BlipImageProcessor"
29
+ tokenizer_class = "AutoTokenizer"
30
+
31
+ def __init__(self, image_processor, tokenizer):
32
+ tokenizer.return_token_type_ids = False
33
+ super().__init__(image_processor, tokenizer)
34
+ self.current_processor = self.image_processor
35
+
36
+ def __call__(
37
+ self,
38
+ images: ImageInput = None,
39
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
40
+ add_special_tokens: bool = True,
41
+ padding: Union[bool, str, PaddingStrategy] = False,
42
+ truncation: Union[bool, str, TruncationStrategy] = None,
43
+ max_length: Optional[int] = None,
44
+ stride: int = 0,
45
+ pad_to_multiple_of: Optional[int] = None,
46
+ return_attention_mask: Optional[bool] = None,
47
+ return_overflowing_tokens: bool = False,
48
+ return_special_tokens_mask: bool = False,
49
+ return_offsets_mapping: bool = False,
50
+ return_token_type_ids: bool = False,
51
+ return_length: bool = False,
52
+ verbose: bool = True,
53
+ return_tensors: Optional[Union[str, TensorType]] = None,
54
+ **kwargs,
55
+ ) -> BatchEncoding:
56
+ if images is None and text is None:
57
+ raise ValueError("You have to specify either images or text.")
58
+
59
+ # Get only text
60
+ if images is None:
61
+ self.current_processor = self.tokenizer
62
+ text_encoding = self.tokenizer(
63
+ text=text,
64
+ add_special_tokens=add_special_tokens,
65
+ padding=padding,
66
+ truncation=truncation,
67
+ max_length=max_length,
68
+ stride=stride,
69
+ pad_to_multiple_of=pad_to_multiple_of,
70
+ return_attention_mask=return_attention_mask,
71
+ return_overflowing_tokens=return_overflowing_tokens,
72
+ return_special_tokens_mask=return_special_tokens_mask,
73
+ return_offsets_mapping=return_offsets_mapping,
74
+ return_token_type_ids=return_token_type_ids,
75
+ return_length=return_length,
76
+ verbose=verbose,
77
+ return_tensors=return_tensors,
78
+ **kwargs,
79
+ )
80
+ return text_encoding
81
+
82
+ # add pixel_values
83
+ if images is not None:
84
+ encoding_image_processor = self.image_processor(images, return_tensors=return_tensors)
85
+ encoding_image_processor["pixel_values"] = torch.stack(
86
+ [torch.tensor(pixel_values) for pixel_values in encoding_image_processor["pixel_values"]]
87
+ ).unsqueeze(0)
88
+
89
+ if text is not None:
90
+ text_encoding = self.tokenizer(
91
+ text=text,
92
+ add_special_tokens=add_special_tokens,
93
+ padding=padding,
94
+ truncation=truncation,
95
+ max_length=max_length,
96
+ stride=stride,
97
+ pad_to_multiple_of=pad_to_multiple_of,
98
+ return_attention_mask=return_attention_mask,
99
+ return_overflowing_tokens=return_overflowing_tokens,
100
+ return_special_tokens_mask=return_special_tokens_mask,
101
+ return_offsets_mapping=return_offsets_mapping,
102
+ return_token_type_ids=return_token_type_ids,
103
+ return_length=return_length,
104
+ verbose=verbose,
105
+ return_tensors=return_tensors,
106
+ **kwargs,
107
+ )
108
+ else:
109
+ text_encoding = None
110
+
111
+ if text_encoding is not None:
112
+ encoding_image_processor.update(text_encoding)
113
+
114
+ return encoding_image_processor
115
+
116
+ def batch_decode(self, *args, **kwargs):
117
+ return self.tokenizer.batch_decode(*args, **kwargs)
118
+
119
+ def decode(self, *args, **kwargs):
120
+ return self.tokenizer.decode(*args, **kwargs)
121
+
122
+ @property
123
+ def model_input_names(self):
124
+ tokenizer_input_names = self.tokenizer.model_input_names
125
+ image_processor_input_names = self.image_processor.model_input_names
126
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "unk_token": {
17
+ "content": "<unk>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
3
+ size 493443
tokenizer_config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<unk>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<s>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ }
27
+ },
28
+ "additional_special_tokens": [],
29
+ "bos_token": "<s>",
30
+ "clean_up_tokenization_spaces": false,
31
+ "eos_token": "</s>",
32
+ "legacy": true,
33
+ "model_max_length": 1000000000000000019884624838656,
34
+ "pad_token": null,
35
+ "processor_class": "CheXagentProcessor",
36
+ "sp_model_kwargs": {},
37
+ "spaces_between_special_tokens": false,
38
+ "tokenizer_class": "LlamaTokenizer",
39
+ "unk_token": "<unk>",
40
+ "use_default_system_prompt": false
41
+ }