hyunjun-eun committed (verified)
Commit ae1db2c · 0 Parent(s)

Duplicate from skt/A.X-4.0-VL-Light

Co-authored-by: Hyunjun Eun <hyunjun-eun@users.noreply.huggingface.co>

.gitattributes ADDED
@@ -0,0 +1,36 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.png filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,234 @@
1
+ Copyright (c) 2025 SK Telecom Co., Ltd. All rights reserved.
2
+
3
+ Built with Qwen 2.5 — original model by Alibaba Cloud, and SigLIP 2 — original model by Google DeepMind, licensed under the Apache License 2.0.
4
+
5
+ Unless otherwise stated, all files in this repository (including modified model weights
6
+ and tokenizer files) are distributed under the terms of the Apache License, Version 2.0
7
+ (the "License"). You may obtain a copy of the License at:
8
+
9
+ http://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ Unless required by applicable law or agreed to in writing, software distributed under
12
+ the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
13
+ ANY KIND, either express or implied. See the License for the specific language governing
14
+ permissions and limitations under the License.
15
+
16
+ ================================================================================
17
+ NOTICE (Apache-2.0 §4 d)
18
+ ================================================================================
19
+
20
+ This product is built with Qwen 2.5 developed by Alibaba Cloud and SigLIP 2 developed by Google DeepMind, under the terms of the Apache License 2.0.
21
+
22
+ Source code and documentation for these models are available at:
23
+ • Qwen 2.5: https://github.com/QwenLM
24
+ • SigLIP 2: https://github.com/google-research/big_vision
25
+
26
+ ================================================================================
27
+ TRADEMARK
28
+ ================================================================================
29
+
30
+ "SK Telecom" and associated logos are trademarks of SK Telecom Co., Ltd.
31
+ This License does not grant permission to use these trademarks without prior
32
+ written consent.
33
+
34
+ ================================================================================
35
+ APACHE LICENSE 2.0
36
+ ================================================================================
37
+
38
+ Apache License
39
+ Version 2.0, January 2004
40
+ http://www.apache.org/licenses/
41
+
42
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
43
+
44
+ 1. Definitions.
45
+
46
+ "License" shall mean the terms and conditions for use, reproduction,
47
+ and distribution as defined by Sections 1 through 9 of this document.
48
+
49
+ "Licensor" shall mean the copyright owner or entity authorized by
50
+ the copyright owner that is granting the License.
51
+
52
+ "Legal Entity" shall mean the union of the acting entity and all
53
+ other entities that control, are controlled by, or are under common
54
+ control with that entity. For the purposes of this definition,
55
+ "control" means (i) the power, direct or indirect, to cause the
56
+ direction or management of such entity, whether by contract or
57
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
58
+ outstanding shares, or (iii) beneficial ownership of such entity.
59
+
60
+ "You" (or "Your") shall mean an individual or Legal Entity
61
+ exercising permissions granted by this License.
62
+
63
+ "Source" form shall mean the preferred form for making modifications,
64
+ including but not limited to software source code, documentation
65
+ source, and configuration files.
66
+
67
+ "Object" form shall mean any form resulting from mechanical
68
+ transformation or translation of a Source form, including but
69
+ not limited to compiled object code, generated documentation,
70
+ and conversions to other media types.
71
+
72
+ "Work" shall mean the work of authorship, whether in Source or
73
+ Object form, made available under the License, as indicated by a
74
+ copyright notice that is included in or attached to the work
75
+ (an example is provided in the Appendix below).
76
+
77
+ "Derivative Works" shall mean any work, whether in Source or Object
78
+ form, that is based on (or derived from) the Work and for which the
79
+ editorial revisions, annotations, elaborations, or other modifications
80
+ represent, as a whole, an original work of authorship. For the purposes
81
+ of this License, Derivative Works shall not include works that remain
82
+ separable from, or merely link (or bind by name) to the interfaces of,
83
+ the Work and Derivative Works thereof.
84
+
85
+ "Contribution" shall mean any work of authorship, including
86
+ the original version of the Work and any modifications or additions
87
+ to that Work or Derivative Works thereof, that is intentionally
88
+ submitted to Licensor for inclusion in the Work by the copyright owner
89
+ or by an individual or Legal Entity authorized to submit on behalf of
90
+ the copyright owner. For the purposes of this definition, "submitted"
91
+ means any form of electronic, verbal, or written communication sent
92
+ to the Licensor or its representatives, including but not limited to
93
+ communication on electronic mailing lists, source code control systems,
94
+ and issue tracking systems that are managed by, or on behalf of, the
95
+ Licensor for the purpose of discussing and improving the Work, but
96
+ excluding communication that is conspicuously marked or otherwise
97
+ designated in writing by the copyright owner as "Not a Contribution."
98
+
99
+ "Contributor" shall mean Licensor and any individual or Legal Entity
100
+ on behalf of whom a Contribution has been received by Licensor and
101
+ subsequently incorporated within the Work.
102
+
103
+ 2. Grant of Copyright License. Subject to the terms and conditions of
104
+ this License, each Contributor hereby grants to You a perpetual,
105
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
106
+ copyright license to reproduce, prepare Derivative Works of,
107
+ publicly display, publicly perform, sublicense, and distribute the
108
+ Work and such Derivative Works in Source or Object form.
109
+
110
+ 3. Grant of Patent License. Subject to the terms and conditions of
111
+ this License, each Contributor hereby grants to You a perpetual,
112
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
113
+ (except as stated in this section) patent license to make, have made,
114
+ use, offer to sell, sell, import, and otherwise transfer the Work,
115
+ where such license applies only to those patent claims licensable
116
+ by such Contributor that are necessarily infringed by their
117
+ Contribution(s) alone or by combination of their Contribution(s)
118
+ with the Work to which such Contribution(s) was submitted. If You
119
+ institute patent litigation against any entity (including a
120
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
121
+ or a Contribution incorporated within the Work constitutes direct
122
+ or contributory patent infringement, then any patent licenses
123
+ granted to You under this License for that Work shall terminate
124
+ as of the date such litigation is filed.
125
+
126
+ 4. Redistribution. You may reproduce and distribute copies of the
127
+ Work or Derivative Works thereof in any medium, with or without
128
+ modifications, and in Source or Object form, provided that You
129
+ meet the following conditions:
130
+
131
+ (a) You must give any other recipients of the Work or
132
+ Derivative Works a copy of this License; and
133
+
134
+ (b) You must cause any modified files to carry prominent notices
135
+ stating that You changed the files; and
136
+
137
+ (c) You must retain, in the Source form of any Derivative Works
138
+ that You distribute, all copyright, patent, trademark, and
139
+ attribution notices from the Source form of the Work,
140
+ excluding those notices that do not pertain to any part of
141
+ the Derivative Works; and
142
+
143
+ (d) If the Work includes a "NOTICE" text file as part of its
144
+ distribution, then any Derivative Works that You distribute must
145
+ include a readable copy of the attribution notices contained
146
+ within such NOTICE file, excluding those notices that do not
147
+ pertain to any part of the Derivative Works, in at least one
148
+ of the following places: within a NOTICE text file distributed
149
+ as part of the Derivative Works; within the Source form or
150
+ documentation, if provided along with the Derivative Works; or,
151
+ within a display generated by the Derivative Works, if and
152
+ wherever such third-party notices normally appear. The contents
153
+ of the NOTICE file are for informational purposes only and
154
+ do not modify the License. You may add Your own attribution
155
+ notices within Derivative Works that You distribute, alongside
156
+ or as an addendum to the NOTICE text from the Work, provided
157
+ that such additional attribution notices cannot be construed
158
+ as modifying the License.
159
+
160
+ You may add Your own copyright statement to Your modifications and
161
+ may provide additional or different license terms and conditions
162
+ for use, reproduction, or distribution of Your modifications, or
163
+ for any such Derivative Works as a whole, provided Your use,
164
+ reproduction, and distribution of the Work otherwise complies with
165
+ the conditions stated in this License.
166
+
167
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
168
+ any Contribution intentionally submitted for inclusion in the Work
169
+ by You to the Licensor shall be under the terms and conditions of
170
+ this License, without any additional terms or conditions.
171
+ Notwithstanding the above, nothing herein shall supersede or modify
172
+ the terms of any separate license agreement you may have executed
173
+ with Licensor regarding such Contributions.
174
+
175
+ 6. Trademarks. This License does not grant permission to use the trade
176
+ names, trademarks, service marks, or product names of the Licensor,
177
+ except as required for reasonable and customary use in describing the
178
+ origin of the Work and reproducing the content of the NOTICE file.
179
+
180
+ 7. Disclaimer of Warranty. Unless required by applicable law or
181
+ agreed to in writing, Licensor provides the Work (and each
182
+ Contributor provides its Contributions) on an "AS IS" BASIS,
183
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
184
+ implied, including, without limitation, any warranties or conditions
185
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
186
+ PARTICULAR PURPOSE. You are solely responsible for determining the
187
+ appropriateness of using or redistributing the Work and assume any
188
+ risks associated with Your exercise of permissions under this License.
189
+
190
+ 8. Limitation of Liability. In no event and under no legal theory,
191
+ whether in tort (including negligence), contract, or otherwise,
192
+ unless required by applicable law (such as deliberate and grossly
193
+ negligent acts) or agreed to in writing, shall any Contributor be
194
+ liable to You for damages, including any direct, indirect, special,
195
+ incidental, or consequential damages of any character arising as a
196
+ result of this License or out of the use or inability to use the
197
+ Work (including but not limited to damages for loss of goodwill,
198
+ work stoppage, computer failure or malfunction, or any and all
199
+ other commercial damages or losses), even if such Contributor
200
+ has been advised of the possibility of such damages.
201
+
202
+ 9. Accepting Warranty or Additional Liability. While redistributing
203
+ the Work or Derivative Works thereof, You may choose to offer,
204
+ and charge a fee for, acceptance of support, warranty, indemnity,
205
+ or other liability obligations and/or rights consistent with this
206
+ License. However, in accepting such obligations, You may act only
207
+ on Your own behalf and on Your sole responsibility, not on behalf
208
+ of any other Contributor, and only if You agree to indemnify,
209
+ defend, and hold each Contributor harmless for any liability
210
+ incurred by, or claims asserted against, such Contributor by reason
211
+ of your accepting any such warranty or additional liability.
212
+
213
+ END OF TERMS AND CONDITIONS
214
+
215
+ APPENDIX: How to apply the Apache License to your work.
216
+
217
+ To apply the Apache License to your work, attach the following
218
+ boilerplate notice, with the fields enclosed by brackets "[]"
219
+ replaced with your own identifying information. (Don't include
220
+ the brackets!) The text should be enclosed in the appropriate
221
+ comment syntax for the file format. We also recommend that a
222
+ file or class name and description of purpose be included on the
223
+ same "printed page" as the copyright notice for easier
224
+ identification within third-party archives.
225
+ Copyright 2024 Alibaba Cloud
226
+ Licensed under the Apache License, Version 2.0 (the "License");
227
+ you may not use this file except in compliance with the License.
228
+ You may obtain a copy of the License at
229
+ http://www.apache.org/licenses/LICENSE-2.0
230
+ Unless required by applicable law or agreed to in writing, software
231
+ distributed under the License is distributed on an "AS IS" BASIS,
232
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
233
+ See the License for the specific language governing permissions and
234
+ limitations under the License.
README.md ADDED
@@ -0,0 +1,289 @@
1
+ ---
2
+ license: apache-2.0
3
+ license_link: https://huggingface.co/skt/A.X-4.0-VL-Light/blob/main/LICENSE
4
+ language:
5
+ - en
6
+ - ko
7
+ pipeline_tag: image-text-to-text
8
+ library_name: transformers
9
+ model_id: skt/A.X-4.0-VL-Light
10
+ developers: SKT AI Model Lab
11
+ base_model:
12
+ - skt/A.X-4.0-Light
13
+ ---
14
+
15
+ # A.X 4.0 VL Light
16
+
17
+ <p align="center">
18
+ <picture>
19
+ <img src="./assets/A.X_logo_ko_4x3.png" width="45%" style="margin: 40px auto;">
20
+ </picture>
21
+ </p>
22
+ <p align="center"> <a href="https://huggingface.co/collections/skt/ax-4-68637ebaa63b9cc51925e886">🤗 Models</a> | <a href="https://github.com/SKT-AI/A.X-4.0-VL-Light">🖥️ Github</a> </p>
23
+
24
+
25
+
26
+ ## Highlights
27
+
28
+ **A.X 4.0 VL Light** (pronounced “A dot X”) is a vision-language model (VLM) optimized for Korean vision and language understanding as well as enterprise deployment. Built upon [A.X 4.0 Light](https://huggingface.co/skt/A.X-4.0-Light), A.X 4.0 VL Light has been further trained on diverse multimodal datasets, with a particular focus on large-scale multimodal Korean datasets, to deliver exceptional performance in domestic business applications.
29
+
30
+ - **Superior Korean Proficiency in Vision and Language**: Achieved an average score of 79.4 on Korean image benchmarks, outperforming Qwen2.5-VL-32B (73.4), despite having a significantly smaller model size. On Korean text benchmarks, recorded an average score of 60.2, comparable to VARCO-VISION-2.0-14B (60.4), while using only half the model size.
31
+ - **Deep Cultural Understanding**: Scored 80.2 on K-Viscuit, a multimodal benchmark designed to evaluate cultural and contextual comprehension in Korean, exceeding Qwen2.5-VL-32B (72.3).
32
+ - **Advanced Document Understanding**: Attained a score of 89.8 on KoBizDoc, a benchmark focused on understanding complex document structures, including charts and tables, performing comparably to Qwen2.5-VL-32B (88.8).
33
+ - **Efficient Token Usage**: A.X 4.0 VL Light uses approximately 41% fewer text tokens than Qwen2.5-VL for the same Korean input, enabling significantly more cost-effective and efficient processing (a quick tokenizer comparison sketch follows this list).
34
+
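The token-efficiency claim can be checked directly with the tokenizers. Below is a minimal sketch, assuming the base-model repository `skt/A.X-4.0-Light` and the hub ID `Qwen/Qwen2.5-VL-7B-Instruct` both expose a tokenizer via `AutoTokenizer`; the sample sentence is illustrative only.

```python
from transformers import AutoTokenizer

# Illustrative comparison only; the repository IDs are assumptions, not part of this model card.
text = "에이닷엑스 4.0은 한국어 처리에 최적화된 비전-언어 모델입니다."

ax_tokenizer = AutoTokenizer.from_pretrained("skt/A.X-4.0-Light")
qwen_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")

print("A.X tokens:", len(ax_tokenizer(text)["input_ids"]))
print("Qwen2.5-VL tokens:", len(qwen_tokenizer(text)["input_ids"]))
```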
35
+ A brief comparison on representative benchmarks is as follows:
36
+
37
+
38
+ <p align="center">
39
+ <picture>
40
+ <img src="./assets/benchmark_2x2.png" width="80%" style="margin: 40px auto;">
41
+ </picture>
42
+ </p>
43
+
44
+ ## Performance
45
+
46
+ ### Image Benchmark
47
+ *Korean benchmarks, with K-Viscuit translated into Korean.
48
+
49
+ | Category | Benchmarks | A.X 4.0 VL Light | Qwen2.5-VL-7B | InternVL3-8B | VARCO-VISION-2.0-14B | Qwen2.5-VL-32B |
50
+ |------------------------|---------------------|------------------|---------------|--------------|----------------------|----------------|
51
+ | Document | KoBizDoc* | 89.8 | 84.0 | 73.2 | 83.0 | 88.8 |
52
+ | | K-DTCBench* | 90.0 | 86.7 | 83.8 | 80.8 | 91.7 |
53
+ | | ChartQA | 79.8 | 80.6 | 79.8 | 78.8 | 81.8 |
54
+ | | DocVQA | 94.4 | 95.3 | 92.4 | 91.9 | 94.5 |
55
+ | | InfoVQA | 78.5 | 82.7 | 76.2 | 80.0 | 82.7 |
56
+ | | SEEDBench2-Plus | 69.7 | 71.2 | 69.7 | 71.9 | 73.3 |
57
+ | OCR | OutdoorKorean* | 97.3 | 91.9 | 72.7 | 79.7 | 86.9 |
58
+ | | K-Handwriting* | 84.3 | 85.0 | 43.5 | 55.2 | 60.1 |
59
+ | | TextVQA | 82.0 | 85.4 | 82.1 | 80.3 | 79.8 |
60
+ | Culture | K-Viscuit* | 80.2 | 65.0 | 65.3 | 72.0 | 72.3 |
61
+ | Knowledge | KoEduBench* | 58.1 | 53.9 | 53.9 | 39.4 | 52.4 |
62
+ | | KoCertBench* | 54.9 | 50.1 | 39.4 | 51.4 | 47.5 |
63
+ | | MMMU | 54.1 | 56.3 | 59.4 | 58.3 | 63.6 |
64
+ | | ScienceQA | 95.3 | 87.2 | 97.8 | 92.2 | 92.4 |
65
+ | General | K-LLaVA-W* | 83.2 | 73.0 | 67.0 | 80.0 | 84.3 |
66
+ | | K-SEED* | 76.5 | 76.4 | 76.4 | 76.9 | 77.3 |
67
+ | | SEEDBench_IMG | 76.7 | 77.1 | 77.1 | 78.1 | 77.6 |
68
+ | Hallucination | HallusionBench | 69.6 | 70.2 | 66.3 | 70.4 | 72.0 |
69
+ | IF | MM-IFEval | 53.5 | 51.4 | 51.9 | 50.8 | 59.3 |
70
+
71
+
72
+
73
+ The following in-house benchmarks have been established to rigorously assess model performance on Korean vision-language understanding and the comprehension of Korea-specific knowledge domains:
74
+
75
+ - **KoBizDoc**: A visual question answering (VQA) benchmark designed for understanding Korean business documents.
76
+ - **OutdoorKorean**: A benchmark focused on recognizing Korean text in complex outdoor scenes (provided by AIHub).
77
+ - **K-Handwriting**: A Korean handwriting recognition dataset comprising various handwritten styles (provided by AIHub).
78
+ - **KoEduBench**: A VQA benchmark targeting Korean general academic exams, including GED and CSAT questions, to assess academic reasoning ability.
79
+ - **KoCertBench**: A Korean certification exam-based VQA benchmark, covering domains such as civil service, technical licenses, and professional qualifications.
80
+
81
+ ### Text Benchmark
82
+ *Korean benchmarks.
83
+
84
+ | Category | Benchmarks | A.X 4.0 VL Light | Qwen2.5-VL-7B | InternVL3-8B | VARCO-VISION-2.0-14B |
85
+ |-----------------------|--------------|------------------|---------------|--------------|----------------------|
86
+ | Knowledge | KMMLU* | 60.5 | 45.6 | 50.9 | 58.8 |
87
+ | | MMLU | 72.6 | 71.9 | 77.5 | 80.7 |
88
+ | Math | HRM8K* | 40.6 | 25.4 | 34.6 | 49.5 |
89
+ | | MATH | 56.5 | 61.7 | 65.1 | 71.1 |
90
+ | General | Ko-MT-bench* | 68.9 | 51.5 | 59.5 | 75.9 |
91
+ | | MT-bench | 72.9 | 73.2 | 69.9 | 76.6 |
92
+ | IF | Ko-IFEval* | 71.8 | 55.0 | 46.1 | 57.2 |
93
+ | | IFEval | 81.9 | 66.6 | 67.5 | 75.3 |
94
+
95
+
96
+
97
+
98
+ ## 🚀 Quickstart
99
+
100
+ ### with HuggingFace Transformers
101
+
102
+ - `transformers>=4.49.0` (or a later version) is required to use `skt/A.X-4.0-VL-Light`
103
+
104
+ ```bash
105
+ pip install "transformers>=4.49.0"
106
+ ```
107
+
108
+ #### Example Usage
109
+
110
+ ```python
111
+ import torch
112
+ from transformers import AutoModelForCausalLM, AutoProcessor
113
+ from PIL import Image
114
+ import requests
115
+ from io import BytesIO
116
+
117
+
118
+
119
+ model_name = "skt/A.X-4.0-VL-Light"
120
+ model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.bfloat16).to(device='cuda')
121
+ processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
122
+
123
+ url = "https://huggingface.co/skt/A.X-4.0-VL-Light/resolve/main/assets/image.png"
124
+
125
+ # Image source: Korea National Heritage Portal (국가유산포털, https://www.heritage.go.kr/unisearch/images/national_treasure/thumb/2021042017434700.JPG)
126
+ response = requests.get(url)
127
+ response.raise_for_status()
128
+ image = Image.open(BytesIO(response.content))
129
+
130
+ messages = [
131
+ {
132
+ "role": "user",
133
+ "content": [
134
+ {"type": "image"},
135
+ {"type": "text", "text": "이미지에 대해서 설명해줘."},
136
+ ],
137
+ }
138
+ ]
139
+
140
+ inputs = processor(
141
+ images=[image],
142
+ conversations=[messages],
143
+ padding=True,
144
+ return_tensors="pt",
145
+ ).to("cuda")
146
+
147
+ # Decoding parameters (top_p, temperature, top_k, repetition_penalty) should be tuned depending on the generation task.
148
+ generation_kwargs = {
149
+ "max_new_tokens": 256,
150
+ "top_p": 0.8,
151
+ "temperature": 0.5,
152
+ "top_k": 20,
153
+ "repetition_penalty": 1.05,
154
+ "do_sample": True,
155
+ }
156
+ generated_ids = model.generate(**inputs, **generation_kwargs)
157
+ generated_ids_trimmed = [
158
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
159
+ ]
160
+ response = processor.batch_decode(
161
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
162
+ )
163
+ print(response[0])
164
+ """
165
+ 숭례문은 대한민국 서울에 위치한 국보 제1호로, 조선 시대에 건축된 목조 건축물이다. 이 문은 서울의 남쪽 대문으로, 전통적인 한국 건축 양식을 보여준다. 두 층으로 이루어진 이 문은 기와지붕을 얹고 있으며, 지붕의 곡선이 아름답게 표현되어 있다. 문 아래에는 아치형의 출입구가 있으며, 그 주위로는 견고한 석재로 쌓은 성벽이 이어져 있다. 배경에는 현대적인 고층 빌딩들이 자리잡고 있어, 전통과 현대가 공존하는 서울의 모습을 잘 나타낸다. 숭례문은 역사적, 문화적 가치가 높아 많은 관광객들이 찾는 명소이다.
166
+ """
167
+
168
+ ```
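The snippet above pins the model to a single `cuda` device. As an optional variant not shown in the original card, and assuming the `accelerate` package is installed, the weights can be placed automatically:

```python
# Optional variant (assumes `pip install accelerate`): automatic device placement
# instead of the explicit .to(device='cuda') used above.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "skt/A.X-4.0-VL-Light",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
```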
169
+
170
+
171
+ #### Example for Document Transcription
172
+
173
+
174
+ ```python
175
+ import torch
176
+ from transformers import AutoModelForCausalLM, AutoProcessor
177
+ from PIL import Image
178
+ import requests
179
+ from io import BytesIO
180
+
181
+
182
+
183
+ model_name = "skt/A.X-4.0-VL-Light"
184
+ model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.bfloat16).to(device='cuda')
185
+ processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
186
+
187
+ url = "https://huggingface.co/skt/A.X-4.0-VL-Light/resolve/main/assets/document.png"
188
+
189
+ response = requests.get(url)
190
+ response.raise_for_status()
191
+ image = Image.open(BytesIO(response.content))
192
+
193
+ messages = [
194
+ {
195
+ "role": "user",
196
+ "content": [
197
+ {"type": "image"},
198
+ {"type": "text", "text": "사진에 무엇이 적혀있나요? 다른 설명 없이 적혀있는 텍스트만 결과로 보여줘."},
199
+ ],
200
+ }
201
+ ]
202
+
203
+ inputs = processor(
204
+ images=[image],
205
+ conversations=[messages],
206
+ padding=True,
207
+ return_tensors="pt",
208
+ ).to("cuda")
209
+
210
+
211
+ generation_kwargs = {
212
+ "max_new_tokens": 1024,
213
+ "top_p": 0.95,
214
+ "top_k": 1,
215
+ "temperature": 0.7,
216
+ "repetition_penalty": 1.05,
217
+ "do_sample": True,
218
+ }
219
+ generated_ids = model.generate(**inputs, **generation_kwargs)
220
+ generated_ids_trimmed = [
221
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
222
+ ]
223
+ response = processor.batch_decode(
224
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
225
+ )
226
+ print(response[0])
227
+ """
228
+ # A.X 4.0: 기업용 한국어 특화 대규모 언어 모델
229
+
230
+ View English README
231
+
232
+ SK텔레콤이 한국어 처리 능력과 기업 활용성을 높인 대규모 언어 모델(LLM) A.X 4.0 (에이닷엑스 4.0)을 2025년 4월 30일에 출시하였습니다. A.X 4.0은 오픈소스 모델인 Qwen2.5에 방대한 한국어 데이터를 추가로 학습시켜 국내 비즈니스 환경에 최적화된 성능을 발휘합니다.
233
+
234
+ ## A.X 4.0, 무엇이 다른가요?
235
+
236
+ - 뛰어난 한국어 실력: 대표적인 한국어 능력 평가 벤치마크인 KMMLU에서 78.3점을 기록하여, GPT-40(72.5점)보다 우수한 성능을 보였습니다.
237
+ - 높은 한국 문화 이해도: 한국어 및 한국 문화 벤치마크인 CLiCk에서도 83.5점을 획득해, GPT-40(80.2점)보다 더 높은 이해도를 입증했습니다.
238
+ - 효율적인 토큰 처리: 동일한 한국어 텍스트를 입력해도 A.X 4.0보다 GPT-40가 약 1.5배 많은 토큰을 사용합니다.
239
+ - 방대한 정보 처리: 최대 131,072 토큰에 이르는 긴 문서나 대화도 한 번에 이해하고 처리할 수 있습니다.
240
+ - 도메인 지원: 코딩, 제조업 등 전문 지식이 필요한 분야에서도 활용할 수 있도록 기본 성능을 강화했습니다.
241
+ - 배포 옵션: 720억 개(72B) 매개변수를 갖춘 표준 모델과 70억 개(7B) 매개변수의 경량 모델로 제공되며, 기업 내부 서버에 직접 설치(온프레미스)할 수 있어 데이터 보안에 대한 걱정을 덜 수 있습니다.
242
+
243
+ ## 핵심 기술은?
244
+
245
+ ### 한국어 특화 토크나이저 적용
246
+
247
+ 한국어의 고유한 특성을 잘 이해하도록 최적화된 토크나이저를 사용합니다. 이 토크나이저는 한국어의 다양한 표현과 문맥을 효과적으로 파악하도록 설계되었습니다. 내부 테스트 결과, 같은 한국어 문장을 입력했을 때 GPT-40보다 A.X 4.0이 33.3% 효율적으로 토큰을 사용합니다.
248
+
249
+ 이는 실제 사용 환경에서 다음과 같은 장점이 있습니다.
250
+
251
+ - 같은 조건이라면 대략 1.5배 더 많은 한국어 정보를 처리할 수 있습니다.
252
+ - 토큰 수가 줄어들어 처리 비용을 34% 정도 절감할 수 있습니다.
253
+ - API를 호출할 때 토큰 사용량에 따라 비용이 책정되는 구조에서 유리합니다.
254
+
255
+ 특히 문서 요약이나 검색 증강 생성(RAG) 등 긴 글을 다루는 기업 환경에서, 토큰 효율성은 운영 비용을 크게 절감하는 데 기여합니다.
256
+
257
+ ### 한국어 이해와 생성 능력을 향상시키는 학습 데이터 구성
258
+
259
+ A.X 4.0에 사용된 학습 데이터는 다음과 같은 특징을 갖습니다.
260
+
261
+ - 고품질의 한국어 자료: 웹에서 추출한 고품질 데이터, 전문 서적, 합성 데이터를 포함한 대규모 고품질 데이터셋을 활용했습니다.
262
+ - 체계적인 데이터 분류: 다양한 분야에서 균형있게 높은 성능을 발휘하도록 주제별로 분류된 데이터셋을 구성했습니다.
263
+ - 균형 잡힌 언어 분포: 한국어 42%, 영어 51%, 기타 언어 및 코드 7%로 구성해 언어 간 균형을 유지했습니다.
264
+
265
+ 이러한 데이터 구성은 모델이 한국어의 다양한 표현과 미묘한 문맥까지 깊이 이해하도록 돕습니다.
266
+ """
267
+
268
+ ```
269
+
270
+
271
+
272
+ ## License
273
+
274
+ The `A.X 4.0 VL Light` model is licensed under `Apache License 2.0`.
275
+
276
+ ## Citation
277
+
278
+ ```
279
+ @article{SKTAdotX4VLLight,
280
+ title={A.X 4.0 VL Light},
281
+ author={SKT AI Model Lab},
282
+ year={2025},
283
+ url={https://huggingface.co/skt/A.X-4.0-VL-Light}
284
+ }
285
+ ```
286
+
287
+ ## Contact
288
+
289
+ - Business & Partnership Contact: [a.x@sk.com](mailto:a.x@sk.com)
assets/A.X_logo_ko_4x3.png ADDED

Git LFS Details

  • SHA256: 45eecde59e22d7be0f8a451c2cd7bb7f1526e0bb5b021ed202ce12c0838fc53d
  • Pointer size: 131 Bytes
  • Size of remote file: 183 kB
assets/benchmark_2x2.png ADDED

Git LFS Details

  • SHA256: 19bacc594f905fc048c20cc69083b9e6c3fc1e2c8e4dc0b2f800c6063fb21354
  • Pointer size: 131 Bytes
  • Size of remote file: 449 kB
assets/document.png ADDED

Git LFS Details

  • SHA256: ad6e37488e664007b4fe1b7b9b4710811c27568ef68dbef4d97207542192727b
  • Pointer size: 131 Bytes
  • Size of remote file: 345 kB
assets/image.png ADDED

Git LFS Details

  • SHA256: 9fa0586932428af2123243680896b874aab7e1cee450b3fcd70439597c103e61
  • Pointer size: 131 Bytes
  • Size of remote file: 227 kB
chat_template.json ADDED
@@ -0,0 +1,3 @@
1
+ {
2
+ "chat_template": "{%- if tools is iterable and tools | length > 0 %}\n {{- '<|im_start|><|system|>'}}\n {{- '\ub2f9\uc2e0\uc740 \ub3c4\uad6c \ud638\ucd9c \uae30\ub2a5\uc744 \uac16\ucd98 \uc720\uc6a9\ud55c \ub3c4\uc6b0\ubbf8\uc785\ub2c8\ub2e4. \uc0ac\uc6a9\uc790\uc758 \uc694\uccad\uc744 \ucc98\ub9ac\ud558\uae30 \uc704\ud574\uc11c \ud544\uc694\ud55c \ub3c4\uad6c\uac00 \uc8fc\uc5b4\uc9c4 \ubaa9\ub85d\uc5d0 \uc788\ub294 \uacbd\uc6b0 \ub3c4\uad6c \ud638\ucd9c\ub85c \uc751\ub2f5\ud558\uc138\uc694.\n\ud544\uc694\ud55c \ub3c4\uad6c\uac00 \ubaa9\ub85d\uc5d0 \uc5c6\ub294 \uacbd\uc6b0\uc5d0\ub294 \ub3c4\uad6c \ud638\ucd9c \uc5c6\uc774 \uc0ac\uc6a9\uc790\uac00 \uc694\uad6c\ud55c \uc815\ubcf4\ub97c \uc81c\uacf5\ud558\uc138\uc694.\n\ud544\uc694\ud55c \ub3c4\uad6c\uac00 \ubaa9\ub85d\uc5d0 \uc788\uc9c0\ub9cc \ud574\ub2f9 \ub3c4\uad6c\ub97c \ud638\ucd9c\ud558\ub294\ub370 \ud544\uc694\ud55c argument \uc815\ubcf4\uac00 \ubd80\uc871\ud55c \uacbd\uc6b0 \ud574\ub2f9 \uc815\ubcf4\ub97c \uc0ac\uc6a9\uc790\uc5d0\uac8c \uc694\uccad\ud558\uc138\uc694.\n\uc0ac\uc6a9\uc790\uc758 \uc694\uccad\uc744 \ucc98\ub9ac\ud558\uae30 \uc704\ud574 \uc5ec\ub7ec\ubc88 \ub3c4\uad6c\ub97c \ud638\ucd9c\ud560 \uc218 \uc788\uc5b4\uc57c \ud569\ub2c8\ub2e4.\n\ub3c4\uad6c \ud638\ucd9c \uc774\ud6c4 \ub3c4\uad6c \uc2e4\ud589 \uacb0\uacfc\ub97c \uc785\ub825\uc73c\ub85c \ubc1b\uc73c\uba74 \ud574\ub2f9 \uacb0\uacfc\ub97c \ud65c\uc6a9\ud558\uc5ec \ub2f5\ubcc0\uc744 \uc0dd\uc131\ud558\uc138\uc694.\n\n\ub2e4\uc74c\uc740 \uc811\uadfc\ud560 \uc218 \uc788\ub294 \ub3c4\uad6c\ub4e4\uc758 \ubaa9\ub85d \uc785\ub2c8\ub2e4:\n<tools>\n'}}\n {%- for t in tools %}\n {{- t | tojson }}\n {{- '\n' }}\n {%- endfor %}\n {{- '</tools>' }}\n {{- '\n\n\ub3c4\uad6c\ub97c \ud638\ucd9c\ud558\ub824\uba74 \uc544\ub798\uc758 JSON\uc73c\ub85c \uc751\ub2f5\ud558\uc138\uc694.\n\ub3c4\uad6c \ud638\ucd9c \ud615\uc2dd: <tool_call>{\"name\": \ub3c4\uad6c \uc774\ub984, \"arguments\": dictionary \ud615\ud0dc\uc758 \ub3c4\uad6c \uc778\uc790\uac12}</tool_call>' }}\n \n {%- if messages[0].role == 'system' %}\n {{- '\n\n' + messages[0].content}}\n {% set dummy = messages.pop(0) %}\n {%- endif %} \n {{- '<|im_end|>' }}\n{%- endif %}\n\n{%- for message in messages %}\n {%- if loop.first and message.role != 'system' %}\n {{- '<|im_start|><|system|>\ub2f9\uc2e0\uc740 \uc720\uc6a9\ud55c \uc5b8\uc5b4 \ubc0f \uc2dc\uac01 \ub3c4\uc6b0\ubbf8\uc785\ub2c8\ub2e4. 
\uc0ac\uc6a9\uc790\uac00 \uc81c\uacf5\ud558\ub294 \uc2dc\uac01\uc801 \ucf58\ud150\uce20\ub97c \uc774\ud574\ud560 \uc218 \uc788\uc73c\uba70, \uc790\uc5f0\uc5b4\ub97c \uc0ac\uc6a9\ud558\uc5ec \uc0ac\uc6a9\uc790\uc5d0\uac8c \ub2e4\uc591\ud55c \uc791\uc5c5\uc744 \uc9c0\uc6d0\ud569\ub2c8\ub2e4.<|im_end|>' }}\n {%- endif %}\n\n {%- if message.role == 'system' %}\n {{- '<|im_start|><|system|>' + message.content + '<|im_end|>'}}\n {%- elif message.role == 'user' %}\n {%- if message.content is string %}\n {{- '<|im_start|><|user|>' + message.content + '<|im_end|>'}}\n {%- else %}\n {{- '<|im_start|><|user|>' }}\n {%- for content in message.content %}\n {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}\n {{- '<|extra_id_11|>' }}\n {%- elif 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {{- '<|im_end|>' }}\n {%- endif %}\n {%- elif message.role == 'assistant' %}\n {{- '<|im_start|><|assistant|>'}}\n {%- if message.content is defined %}\n {%- if message.content is string %}\n {{- message.content }}\n {%- else %}\n {%- for content in message.content %}\n {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}\n {{- '<image>' }}\n {%- elif 'text' in content %}\n {{- content.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n {%- endif %}\n {%- if message.tool_calls is defined %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>' }}\n {{- '{' }}\n {{- '\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\"' }}\n {%- if tool_call.arguments is defined %}\n {{- ', ' }}\n {{- '\"arguments\": ' }}\n {{- tool_call.arguments|tojson }}\n {%- endif %}\n {{- '}' }}\n {{- '</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>'}}\n\n {%- elif message.role == 'tool' %}\n {{- '<|im_start|><|extra_id_13|><tool_output>' + message.content + '</tool_output><|im_end|>'}}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|><|assistant|>'}}\n{%- endif %}"
3
+ }
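This template prepends a Korean system prompt, inserts an `<|extra_id_11|>` placeholder for each image, and wraps tool calls in `<tool_call>...</tool_call>` tags. Below is a minimal rendering sketch, assuming the stock `ProcessorMixin.apply_chat_template` path picks up this file and forwards `tools` to the template; the weather tool is hypothetical.

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("skt/A.X-4.0-VL-Light", trust_remote_code=True)

tools = [{  # hypothetical tool definition, only to illustrate the rendered <tools> block
    "name": "get_weather",
    "description": "지정한 도시의 현재 날씨를 조회합니다.",
    "parameters": {"type": "object", "properties": {"city": {"type": "string"}}, "required": ["city"]},
}]
messages = [{"role": "user", "content": "서울 날씨 알려줘."}]

prompt = processor.apply_chat_template(messages, tools=tools, add_generation_prompt=True)
print(prompt)  # rendered system prompt, <tools> list, and the <|im_start|><|assistant|> generation prompt
```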
config.json ADDED
@@ -0,0 +1,152 @@
1
+ {
2
+ "architectures": [
3
+ "AX4VLForConditionalGeneration"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_ax4vl.AX4VLConfig",
7
+ "AutoModelForCausalLM": "modeling_ax4vl.AX4VLForConditionalGeneration",
8
+ "AutoProcessor": "processing_ax4vl.AX4VLProcessor"
9
+ },
10
+ "downsample_ratio": 0.5,
11
+ "dynamic_image_size": true,
12
+ "force_image_size": 384,
13
+ "image_token_index": 22,
14
+ "llm_config": {
15
+ "_attn_implementation_autoset": false,
16
+ "add_cross_attention": false,
17
+ "architectures": [
18
+ "Qwen2ForCausalLM"
19
+ ],
20
+ "attention_dropout": 0.0,
21
+ "attn_implementation": "flash_attention_2",
22
+ "bad_words_ids": null,
23
+ "begin_suppress_tokens": null,
24
+ "bos_token_id": 0,
25
+ "chunk_size_feed_forward": 0,
26
+ "cross_attention_hidden_size": null,
27
+ "decoder_start_token_id": null,
28
+ "diversity_penalty": 0.0,
29
+ "do_sample": false,
30
+ "early_stopping": false,
31
+ "encoder_no_repeat_ngram_size": 0,
32
+ "eos_token_id": 0,
33
+ "exponential_decay_length_penalty": null,
34
+ "finetuning_task": null,
35
+ "forced_bos_token_id": null,
36
+ "forced_eos_token_id": null,
37
+ "hidden_act": "silu",
38
+ "hidden_size": 3584,
39
+ "id2label": {
40
+ "0": "LABEL_0",
41
+ "1": "LABEL_1"
42
+ },
43
+ "initializer_range": 0.02,
44
+ "intermediate_size": 18944,
45
+ "is_decoder": false,
46
+ "is_encoder_decoder": false,
47
+ "label2id": {
48
+ "LABEL_0": 0,
49
+ "LABEL_1": 1
50
+ },
51
+ "length_penalty": 1.0,
52
+ "max_length": 20,
53
+ "max_position_embeddings": 16384,
54
+ "max_window_layers": 28,
55
+ "min_length": 0,
56
+ "model_type": "qwen2",
57
+ "no_repeat_ngram_size": 0,
58
+ "num_attention_heads": 28,
59
+ "num_beam_groups": 1,
60
+ "num_beams": 1,
61
+ "num_hidden_layers": 28,
62
+ "num_key_value_heads": 4,
63
+ "num_return_sequences": 1,
64
+ "output_attentions": false,
65
+ "output_hidden_states": false,
66
+ "output_scores": false,
67
+ "pad_token_id": 1,
68
+ "prefix": null,
69
+ "problem_type": null,
70
+ "pruned_heads": {},
71
+ "remove_invalid_values": false,
72
+ "repetition_penalty": 1.0,
73
+ "return_dict": true,
74
+ "return_dict_in_generate": false,
75
+ "rms_norm_eps": 1e-05,
76
+ "rope_scaling": null,
77
+ "rope_theta": 1000000,
78
+ "sep_token_id": null,
79
+ "sliding_window": null,
80
+ "suppress_tokens": null,
81
+ "task_specific_params": null,
82
+ "temperature": 1.0,
83
+ "tf_legacy_loss": false,
84
+ "tie_encoder_decoder": false,
85
+ "tie_word_embeddings": false,
86
+ "tokenizer_class": null,
87
+ "top_k": 50,
88
+ "top_p": 1.0,
89
+ "torch_dtype": "bfloat16",
90
+ "torchscript": false,
91
+ "typical_p": 1.0,
92
+ "use_bfloat16": false,
93
+ "use_cache": false,
94
+ "use_sliding_window": false,
95
+ "vocab_size": 102400
96
+ },
97
+ "max_dynamic_patch": 12,
98
+ "max_num_tiles": 12,
99
+ "min_dynamic_patch": 1,
100
+ "min_num_tiles": 1,
101
+ "model_type": "a.x-4-vl",
102
+ "pad_token_id": 1,
103
+ "projector_config": {
104
+ "grid_size": 12,
105
+ "in_hidden_size": 1152,
106
+ "model_type": "ldpnetv2_projector",
107
+ "out_hidden_size": 3584,
108
+ "torch_dtype": "bfloat16"
109
+ },
110
+ "ps_version": "v2",
111
+ "select_layer": -1,
112
+ "template": "axvlm",
113
+ "text_config": {
114
+ "architectures": [
115
+ "Qwen2ForCausalLM"
116
+ ],
117
+ "attn_implementation": "flash_attention_2",
118
+ "bos_token_id": 0,
119
+ "eos_token_id": 0,
120
+ "hidden_size": 3584,
121
+ "intermediate_size": 18944,
122
+ "max_position_embeddings": 16384,
123
+ "model_type": "qwen2",
124
+ "num_attention_heads": 28,
125
+ "num_hidden_layers": 28,
126
+ "num_key_value_heads": 4,
127
+ "pad_token_id": 1,
128
+ "rms_norm_eps": 1e-05,
129
+ "rope_theta": 1000000,
130
+ "sliding_window": null,
131
+ "torch_dtype": "bfloat16",
132
+ "use_cache": false,
133
+ "vocab_size": 102400
134
+ },
135
+ "tie_word_embeddings": false,
136
+ "torch_dtype": "bfloat16",
137
+ "transformers_version": "4.49.0",
138
+ "use_thumbnail": true,
139
+ "vision_config": {
140
+ "drop_path_rate": 0.0,
141
+ "hidden_size": 1152,
142
+ "image_size": 384,
143
+ "intermediate_size": 4304,
144
+ "model_type": "siglip_vision_model",
145
+ "num_attention_heads": 16,
146
+ "num_hidden_layers": 27,
147
+ "torch_dtype": "bfloat16",
148
+ "vision_use_head": false
149
+ },
150
+ "vision_feature_layer": 0,
151
+ "vision_feature_select_strategy": "full"
152
+ }
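For reference, the composite config above can be loaded through its `auto_map` entries and inspected; a small sketch follows, with output comments reflecting the values listed in this file.

```python
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("skt/A.X-4.0-VL-Light", trust_remote_code=True)

print(cfg.model_type)                   # "a.x-4-vl"
print(cfg.vision_config.model_type)     # "siglip_vision_model" (hidden_size 1152, image_size 384)
print(cfg.text_config.model_type)       # "qwen2" (hidden_size 3584, 28 layers)
print(cfg.projector_config.model_type)  # "ldpnetv2_projector" (projects 1152 -> 3584)
```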
configuration_ax4vl.py ADDED
@@ -0,0 +1,113 @@
1
+ import transformers
2
+ from transformers.utils import logging
3
+ from transformers.models.auto import CONFIG_MAPPING, AutoConfig
4
+ from transformers.configuration_utils import PretrainedConfig
5
+
6
+ logger = logging.get_logger(__name__)
7
+
8
+ class LDPConfig(PretrainedConfig):
9
+ model_type = "ldpnetv2_projector"
10
+
11
+ def __init__(
12
+ self,
13
+ in_hidden_size=1024,
14
+ out_hidden_size=2048,
15
+ grid_size=12,
16
+ **kwargs
17
+ ):
18
+ self.in_hidden_size = in_hidden_size
19
+ self.out_hidden_size = out_hidden_size
20
+ self.grid_size = grid_size
21
+
22
+ super().__init__(**kwargs)
23
+
24
+ class MLPProjectorConfig(PretrainedConfig):
25
+ model_type = "mlp2x_projector"
26
+
27
+ def __init__(
28
+ self,
29
+ hidden_act="gelu",
30
+ in_hidden_size=1024,
31
+ out_hidden_size=2048,
32
+ bias: bool=True,
33
+ **kwargs
34
+ ):
35
+ self.hidden_act = hidden_act
36
+ self.in_hidden_size = in_hidden_size
37
+ self.out_hidden_size = out_hidden_size
38
+ self.bias = bias
39
+
40
+ super().__init__(**kwargs)
41
+
42
+
43
+
44
+ class AX4VLConfig(PretrainedConfig):
45
+ model_type = "a.x-4-vl"
46
+ sub_configs = {
47
+ "text_config": AutoConfig,
48
+ "projector_config": AutoConfig,
49
+ "vision_config": AutoConfig
50
+ }
51
+
52
+ def __init__(
53
+ self,
54
+ vision_config=None,
55
+ projector_config=None,
56
+ text_config=None,
57
+ image_token_index=102400,
58
+ vision_feature_select_strategy="full",
59
+ vision_feature_layer=0,
60
+ tie_word_embeddings=False,
61
+ **kwargs,
62
+ ):
63
+ self.image_token_index = image_token_index
64
+
65
+ if vision_feature_select_strategy not in ["default", "full"]:
66
+ raise ValueError(
67
+ "vision_feature_select_strategy should be one of 'default', 'full'."
68
+ f"Got: {vision_feature_select_strategy}"
69
+ )
70
+
71
+ self.vision_feature_select_strategy = vision_feature_select_strategy
72
+ self.vision_feature_layer = vision_feature_layer
73
+
74
+ if isinstance(vision_config, dict):
75
+ vision_config["model_type"] = (
76
+ vision_config["model_type"] if "model_type" in vision_config else "siglip_vision_model"
77
+ )
78
+ vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
79
+ elif vision_config is None:
80
+ vision_config = CONFIG_MAPPING["siglip_vision_model"](
81
+ intermediate_size=4304,
82
+ hidden_size=1152,
83
+ patch_size=16,
84
+ image_size=384,
85
+ num_hidden_layers=27,
86
+ num_attention_heads=16,
87
+ vision_use_head=False
88
+ )
89
+ self.vision_config = vision_config
90
+
91
+ if isinstance(projector_config, dict):
92
+ projector_config["model_type"] = (
93
+ projector_config["model_type"] if "model_type" in projector_config else "mlp2x"
94
+ )
95
+ projector_config = CONFIG_MAPPING[projector_config["model_type"]](**projector_config)
96
+ elif projector_config is None:
97
+ projector_config = CONFIG_MAPPING["mlp2x_projector"]()
98
+ self.projector_config = projector_config
99
+
100
+ if isinstance(text_config, dict):
101
+ text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "qwen2"
102
+ text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
103
+ elif text_config is None:
104
+ text_config = CONFIG_MAPPING["qwen2"]()
105
+
106
+ self.text_config = text_config
107
+
108
+ super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
109
+
110
+
111
+ AutoConfig.register(LDPConfig.model_type, LDPConfig)
112
+ AutoConfig.register(MLPProjectorConfig.model_type, MLPProjectorConfig)
113
+ AutoConfig.register(AX4VLConfig.model_type, AX4VLConfig)
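As a quick illustration of how the sub-configs compose, here is a sketch that assumes `configuration_ax4vl.py` has been downloaded locally so it can be imported directly; the values mirror `config.json`.

```python
from configuration_ax4vl import AX4VLConfig, LDPConfig

# vision_config=None falls back to the SigLIP-384 defaults defined in AX4VLConfig.__init__,
# and text_config=None falls back to a stock Qwen2 config.
cfg = AX4VLConfig(
    projector_config=LDPConfig(in_hidden_size=1152, out_hidden_size=3584, grid_size=12),
    image_token_index=22,
)

print(cfg.vision_config.image_size)          # 384
print(cfg.projector_config.out_hidden_size)  # 3584
```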
generation_config.json ADDED
@@ -0,0 +1,12 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": [
5
+ 0,
6
+ 27,
7
+ 1
8
+ ],
9
+ "pad_token_id": 1,
10
+ "transformers_version": "4.49.0",
11
+ "use_cache": false
12
+ }
image_processing_ax4vl.py ADDED
@@ -0,0 +1,497 @@
1
+ """
2
+ Image processor class for Megatron-LM LLaVA.
3
+ """
4
+
5
+ import math
6
+ from typing import Dict, Iterable, List, Optional, Tuple, Union
7
+
8
+ import numpy as np
9
+ from PIL import Image
10
+ from .configuration_ax4vl import AX4VLConfig
11
+
12
+ from transformers.image_utils import (
13
+ OPENAI_CLIP_MEAN,
14
+ OPENAI_CLIP_STD,
15
+ ChannelDimension,
16
+ ImageInput,
17
+ PILImageResampling,
18
+ infer_channel_dimension_format,
19
+ is_scaled_image,
20
+ is_valid_image,
21
+ valid_images,
22
+ make_list_of_images,
23
+ to_numpy_array,
24
+ validate_preprocess_arguments,
25
+ )
26
+ from transformers.image_processing_utils import BatchFeature, get_size_dict, BaseImageProcessor
27
+ from transformers.image_transforms import (
28
+ PaddingMode,
29
+ pad,
30
+ to_channel_dimension_format,
31
+ )
32
+ from transformers.utils import TensorType, logging
33
+ from transformers.models.auto import AutoImageProcessor
34
+
35
+
36
+ logger = logging.get_logger(__name__)
37
+
38
+ def _get_patch_output_size(image, target_resolution):
39
+ original_width, original_height = image.size
40
+ target_width, target_height = target_resolution
41
+
42
+ scale_w = target_width / original_width
43
+ scale_h = target_height / original_height
44
+
45
+ if scale_w < scale_h:
46
+ new_width = target_width
47
+ new_height = min(math.ceil(original_height * scale_w), target_height)
48
+ else:
49
+ new_height = target_height
50
+ new_width = min(math.ceil(original_width * scale_h), target_width)
51
+
52
+ return new_width, new_height
53
+
54
+ # From https://github.com/OpenGVLab/InternVL/blob/c62fa4f7c850165d7386bdc48ac6bc5a6fab0864/internvl_chat/internvl/train/dataset.py#L685
55
+ # Copyright (c) 2023 OpenGVLab.
56
+ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
57
+ best_ratio_diff = float('inf')
58
+ best_ratio = (1, 1)
59
+ area = width * height
60
+ for ratio in target_ratios:
61
+ target_aspect_ratio = ratio[0] / ratio[1]
62
+ ratio_diff = abs(aspect_ratio - target_aspect_ratio)
63
+ if ratio_diff < best_ratio_diff:
64
+ best_ratio_diff = ratio_diff
65
+ best_ratio = ratio
66
+ elif ratio_diff == best_ratio_diff:
67
+ if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
68
+ best_ratio = ratio
69
+ # print(f'width: {width}, height: {height}, best_ratio: {best_ratio}')
70
+ return best_ratio
71
+
72
+ def _pad_for_patching(image, target_resolution, background_color=(0, 0, 0)):
73
+ """
74
+ Pad an image to a target resolution while maintaining aspect ratio.
75
+ """
76
+ target_width, target_height = target_resolution
77
+ new_width, new_height = _get_patch_output_size(image, target_resolution)
78
+
79
+ paste_x = (target_width - new_width) // 2
80
+ paste_y = (target_height - new_height) // 2
81
+
82
+ padded_image = Image.new(image.mode, target_resolution, background_color)
83
+ padded_image.paste(image, (paste_x, paste_y))
84
+ return padded_image
85
+
86
+ def _resize_for_patching(image, target_resolution):
87
+ new_size = _get_patch_output_size(image, target_resolution)
88
+
89
+ # Resize the image
90
+ resized_image = image.resize(new_size)
91
+
92
+ return resized_image
93
+
94
+ def get_target_ratios(image_size, min_num=1, max_num=6, tile_size=384):
95
+ orig_width, orig_height = image_size
96
+ aspect_ratio = orig_width / orig_height
97
+
98
+ target_ratios = set(
99
+ (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
100
+ i * j <= max_num and i * j >= min_num)
101
+ target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
102
+
103
+ return find_closest_aspect_ratio(
104
+ aspect_ratio, target_ratios, orig_width, orig_height, tile_size
105
+ )
106
+
107
+ # From https://github.com/OpenGVLab/InternVL/blob/c62fa4f7c850165d7386bdc48ac6bc5a6fab0864/internvl_chat/internvl/train/dataset.py#L702
108
+ # Copyright (c) 2023 OpenGVLab.
109
+ def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False, padding=False):
110
+ # find the closest aspect ratio to the target
111
+ target_aspect_ratio = get_target_ratios(image.size, min_num=min_num, max_num=max_num, tile_size=image_size)
112
+
113
+ # calculate the target width and height
114
+ target_width = image_size * target_aspect_ratio[0]
115
+ target_height = image_size * target_aspect_ratio[1]
116
+ blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
117
+
118
+ # resize the image
119
+ if padding: # LLaVA-Next tiling strategy
120
+ resized_img = _resize_for_patching(image, (target_width, target_height))
121
+ resized_img = _pad_for_patching(resized_img, (target_width, target_height))
122
+ else: # InternVL tiling strategy
123
+ resized_img = image.resize((target_width, target_height))
124
+ processed_images = []
125
+ for i in range(blocks):
126
+ box = (
127
+ (i % (target_width // image_size)) * image_size,
128
+ (i // (target_width // image_size)) * image_size,
129
+ ((i % (target_width // image_size)) + 1) * image_size,
130
+ ((i // (target_width // image_size)) + 1) * image_size
131
+ )
132
+ # split the image
133
+ split_img = resized_img.crop(box)
134
+ processed_images.append(split_img)
135
+ assert len(processed_images) == blocks
136
+ if use_thumbnail and len(processed_images) != 1:
137
+ thumbnail_img = image.resize((image_size, image_size))
138
+ processed_images.append(thumbnail_img)
139
+ return processed_images
140
+
141
+ class AX4VLImageProcessor(BaseImageProcessor):
142
+
143
+ model_input_names = ["pixel_values"]
144
+
145
+ def __init__(
146
+ self,
147
+ do_resize: bool = True,
148
+ size: Dict[str, int] = None,
149
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
150
+ do_rescale: bool = True,
151
+ rescale_factor: Union[int, float] = 1 / 255,
152
+ do_normalize: bool = True,
153
+ image_mean: Optional[Union[float, List[float]]] = None,
154
+ image_std: Optional[Union[float, List[float]]] = None,
155
+ do_pad: Optional[bool] = True,
156
+ do_tile_pad: Optional[bool] = True,
157
+ do_convert_rgb: bool = True,
158
+ use_thumbnail: bool = True,
159
+ min_num_tiles: int = 1,
160
+ max_num_tiles: int = 6,
161
+ **kwargs,
162
+ ) -> None:
163
+ super().__init__(**kwargs)
164
+ size = dict(size) if size is not None else {"shortest_edge": 224}
165
+ size = get_size_dict(size, default_to_square=False)
166
+
167
+ self.do_resize = do_resize
168
+ self.size = size
169
+ self.resample = resample
170
+ self.do_rescale = do_rescale
171
+ self.rescale_factor = rescale_factor
172
+ self.do_normalize = do_normalize
173
+ self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
174
+ self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
175
+ self.do_pad = do_pad
176
+ self.do_tile_pad = do_tile_pad
177
+ self.do_convert_rgb = do_convert_rgb
178
+ self.use_thumbnail = use_thumbnail
179
+ self.min_num_tiles = min_num_tiles
180
+ self.max_num_tiles = max_num_tiles
181
+
182
+ def pad(
183
+ self,
184
+ image: np.ndarray,
185
+ padding: Union[int, Tuple[int, int], Iterable[Tuple[int, int]]],
186
+ mode: PaddingMode = PaddingMode.CONSTANT,
187
+ constant_values: Union[float, Iterable[float]] = 0.0,
188
+ data_format: Optional[Union[str, ChannelDimension]] = None,
189
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
190
+ ) -> np.ndarray:
191
+ """
192
+ Pads the `image` with the specified `padding` and `mode`. Padding can be in the (`height`, `width`)
193
+ dimension of in the (`num_patches`) dimension. In the second case an iterable if tuples is expected
194
+ as input.
195
+
196
+ Args:
197
+ image (`np.ndarray`):
198
+ The image to pad.
199
+ padding (`int` or `Tuple[int, int]` or `Iterable[Tuple[int, int]]`):
200
+ Padding to apply to the edges of the height, width axes. Can be one of three formats:
201
+ - `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis.
202
+ - `((before, after),)` yields same before and after pad for height and width.
203
+ - `(pad,)` or int is a shortcut for before = after = pad width for all axes.
204
+ mode (`PaddingMode`):
205
+ The padding mode to use. Can be one of:
206
+ - `"constant"`: pads with a constant value.
207
+ - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the
208
+ vector along each axis.
209
+ - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis.
210
+ - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array.
211
+ constant_values (`float` or `Iterable[float]`, *optional*):
212
+ The value to use for the padding if `mode` is `"constant"`.
213
+ data_format (`str` or `ChannelDimension`, *optional*):
214
+ The channel dimension format for the output image. Can be one of:
215
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
216
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
217
+ If unset, will use same as the input image.
218
+ input_data_format (`str` or `ChannelDimension`, *optional*):
219
+ The channel dimension format for the input image. Can be one of:
220
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
221
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
222
+ If unset, will use the inferred format of the input image.
223
+
224
+ Returns:
225
+ `np.ndarray`: The padded image.
226
+
227
+ """
228
+
229
+ # call the general `pad` if padding on `height/width`, otherwise it's the `num_patched` dim
230
+ if isinstance(padding, int) or len(padding) != 4:
231
+ return pad(image, padding, mode, constant_values, data_format, input_data_format)
232
+
233
+ if input_data_format is None:
234
+ input_data_format = infer_channel_dimension_format(image)
235
+ if mode == PaddingMode.CONSTANT:
236
+ image = np.pad(image, padding, mode="constant", constant_values=constant_values)
237
+ elif mode == PaddingMode.REFLECT:
238
+ image = np.pad(image, padding, mode="reflect")
239
+ elif mode == PaddingMode.REPLICATE:
240
+ image = np.pad(image, padding, mode="edge")
241
+ elif mode == PaddingMode.SYMMETRIC:
242
+ image = np.pad(image, padding, mode="symmetric")
243
+ else:
244
+ raise ValueError(f"Invalid padding mode: {mode}")
245
+ image = (
246
+ to_channel_dimension_format(image, data_format, input_data_format) if data_format is not None else image
247
+ )
248
+ return image
249
+
250
+ def _pad_for_batching(
251
+ self,
252
+ pixel_values: List[np.ndarray],
253
+ data_format: Optional[Union[str, ChannelDimension]] = None,
254
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
255
+ ):
256
+ """
257
+ Pads images on the `num_of_patches` dimension with zeros to form a batch of same number of patches.
258
+
259
+ Args:
260
+ pixel_values (`List[np.ndarray]`):
261
+ An array of pixel values of each images of shape (`batch_size`, `num_patches`, `image_in_3D`)
262
+ data_format (`str` or `ChannelDimension`, *optional*):
263
+ The channel dimension format for the output image. Can be one of:
264
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
265
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
266
+ If unset, will use same as the input image.
267
+ input_data_format (`str` or `ChannelDimension`, *optional*):
268
+ The channel dimension format for the input image. Can be one of:
269
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
270
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
271
+ If unset, will use the inferred format of the input image.
272
+
273
+ Returns:
274
+ List[`np.ndarray`]: The padded images.
275
+ """
276
+ max_patch = max(len(x) for x in pixel_values)
277
+ pixel_values = [
278
+ self.pad(
279
+ image,
280
+ padding=((0, max_patch - image.shape[0]), (0, 0), (0, 0), (0, 0)),
281
+ data_format=data_format,
282
+ input_data_format=input_data_format,
283
+ )
284
+ for image in pixel_values
285
+ ]
286
+
287
+ return pixel_values
288
+
289
+ def _preprocess(
290
+ self,
291
+ images: ImageInput,
292
+ do_resize: bool = None,
293
+ size: Dict[str, int] = None,
294
+ resample: PILImageResampling = None,
295
+ do_rescale: bool = None,
296
+ rescale_factor: float = None,
297
+ do_normalize: bool = None,
298
+ image_mean: Optional[Union[float, List[float]]] = None,
299
+ image_std: Optional[Union[float, List[float]]] = None,
300
+ do_convert_rgb: bool = None,
301
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
302
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
303
+ ):
304
+ images = make_list_of_images(images)
305
+
306
+ all_images = []
307
+ for image in images:
308
+ if do_resize:
309
+ image = image.resize((size["shortest_edge"], size["shortest_edge"]), resample)
310
+
311
+ image = to_numpy_array(image)
312
+
313
+ if input_data_format is None:
314
+ # We assume that all images have the same channel dimension format.
315
+ input_data_format = infer_channel_dimension_format(image)
316
+
317
+ if is_scaled_image(image) and do_rescale:
318
+ logger.warning_once(
319
+ "It looks like you are trying to rescale already rescaled images. If the input"
320
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
321
+ )
322
+ if do_rescale:
323
+ image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
324
+
325
+ if do_normalize:
326
+ image = self.normalize(
327
+ image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
328
+ )
329
+
330
+ all_images.append(image)
331
+
332
+ images = [
333
+ to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
334
+ for image in all_images
335
+ ]
336
+
337
+ return images
338
+
339
+ def preprocess(
340
+ self,
341
+ images: ImageInput,
342
+ do_resize: bool = None,
343
+ size: Dict[str, int] = None,
344
+ resample: PILImageResampling = None,
345
+ do_rescale: bool = None,
346
+ rescale_factor: float = None,
347
+ do_normalize: bool = None,
348
+ image_mean: Optional[Union[float, List[float]]] = None,
349
+ image_std: Optional[Union[float, List[float]]] = None,
350
+ do_pad: Optional[bool] = None,
351
+ do_convert_rgb: bool = None,
352
+ return_tensors: Optional[Union[str, TensorType]] = None,
353
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
354
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
355
+ ):
356
+ """
357
+ Args:
358
+ images (`ImageInput`):
359
+ Image to preprocess. Expects a single image or a batch of images with pixel values ranging from 0 to 255.
360
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
361
+ Whether to resize the image.
362
+ size (`Dict[str, int]`, *optional*, defaults to `self.size`):
363
+ Size of each image patch after resizing. Each patch produced by dynamic tiling is resized to a
365
+ square of side size["shortest_edge"].
365
+ resample (`int`, *optional*, defaults to `self.resample`):
366
+ Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
367
+ has an effect if `do_resize` is set to `True`.
368
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
369
+ Whether to normalize the image.
370
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
371
+ Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
372
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
373
+ Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
374
+ `True`.
375
+ do_pad (`bool`, *optional*, defaults to `self.do_pad`):
376
+ Whether to pad the image. If `True`, will pad the patch dimension of the images in the batch to the largest
377
+ number of patches in the batch. Extra zero-filled patches are appended along the patch dimension.
378
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
379
+ Whether to convert the image to RGB.
380
+ return_tensors (`str` or `TensorType`, *optional*):
381
+ The type of tensors to return. Can be one of:
382
+ - Unset: Return a list of `np.ndarray`.
383
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
384
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
385
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
386
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
387
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
388
+ The channel dimension format for the output image. Can be one of:
389
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
390
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
391
+ - Unset: Use the channel dimension format of the input image.
392
+ input_data_format (`ChannelDimension` or `str`, *optional*):
393
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
394
+ from the input image. Can be one of:
395
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
396
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
397
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
398
+
399
+ """
400
+ do_resize = do_resize if do_resize is not None else self.do_resize
401
+ size = size if size is not None else self.size
402
+ size = get_size_dict(size, param_name="size", default_to_square=False)
403
+ resample = resample if resample is not None else self.resample
404
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
405
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
406
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
407
+ image_mean = image_mean if image_mean is not None else self.image_mean
408
+ image_std = image_std if image_std is not None else self.image_std
409
+ do_pad = do_pad if do_pad is not None else self.do_pad
410
+ do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
411
+
412
+ images = make_batched_images(images)
413
+
414
+ if not valid_images(images):
415
+ raise ValueError(
416
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
417
+ "torch.Tensor, tf.Tensor or jax.ndarray."
418
+ )
419
+
420
+ validate_preprocess_arguments(
421
+ do_rescale=do_rescale,
422
+ rescale_factor=rescale_factor,
423
+ do_normalize=do_normalize,
424
+ image_mean=image_mean,
425
+ image_std=image_std,
426
+ do_resize=do_resize,
427
+ size=size,
428
+ resample=resample,
429
+ )
430
+
431
+ new_images, num_tiles = [], []
432
+ image_sizes = [image.size for image in images]
433
+ for image in images:
434
+ if do_convert_rgb and image.mode != "RGB":
435
+ image = image.convert("RGB")
436
+
437
+ image_patches = dynamic_preprocess(
438
+ image,
439
+ min_num=self.min_num_tiles,
440
+ max_num=self.max_num_tiles,
441
+ image_size=self.size["shortest_edge"],
442
+ use_thumbnail=self.use_thumbnail,
443
+ padding=self.do_tile_pad
444
+ )
445
+
446
+ # preprocess patches
447
+ pixel_values = self._preprocess(
448
+ image_patches,
449
+ do_resize=do_resize,
450
+ size=size,
451
+ resample=resample,
452
+ do_rescale=do_rescale,
453
+ rescale_factor=rescale_factor,
454
+ do_normalize=do_normalize,
455
+ image_mean=image_mean,
456
+ image_std=image_std,
457
+ data_format=data_format,
458
+ input_data_format=input_data_format
459
+ )
460
+ pixel_values = np.array(pixel_values)
461
+ new_images.append(pixel_values)
462
+ num_tiles.append(len(image_patches))
463
+
464
+ if do_pad:
465
+ processed_images = self._pad_for_batching(new_images)
466
+ else:
467
+ processed_images = np.concatenate(new_images)
468
+
469
+ return BatchFeature(
470
+ data={"pixel_values": processed_images, "image_sizes": image_sizes, "num_tiles": num_tiles},
471
+ tensor_type=return_tensors
472
+ )
473
+
474
+
475
+ def make_batched_images(images) -> List[List[ImageInput]]:
476
+ """
477
+ Accepts images in list or nested list format, and makes a list of images for preprocessing.
478
+
479
+ Args:
480
+ images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
481
+ The input image.
482
+
483
+ Returns:
484
+ list: A list of images.
485
+ """
486
+ if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
487
+ return [img for img_list in images for img in img_list]
488
+
489
+ elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
490
+ return images
491
+
492
+ elif is_valid_image(images):
493
+ return [images]
494
+
495
+ raise ValueError(f"Could not make batched video from {images}")
496
+
497
+ AutoImageProcessor.register(AX4VLConfig, AX4VLImageProcessor)
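For reference, a minimal usage sketch of the image processor registered above, assuming the repository's preprocessor_config.json maps to AX4VLImageProcessor and that remote code is trusted when loading; the image path and the commented output shape are illustrative:

from PIL import Image
from transformers import AutoImageProcessor

# Assumption: the repo ships this file as custom code and exposes it via auto_map.
processor = AutoImageProcessor.from_pretrained("skt/A.X-4.0-VL-Light", trust_remote_code=True)

image = Image.open("sample.png")  # placeholder path

# do_pad=True pads every image to the largest tile count in the batch,
# so pixel_values can be returned as one stacked tensor.
batch = processor.preprocess([image], do_pad=True, return_tensors="pt")
print(batch["pixel_values"].shape)  # roughly (num_images, max_num_tiles, 3, size, size)
print(batch["num_tiles"], batch["image_sizes"])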
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df2cb01ffd10ce08ecc0bc65bf7574b9b6307255d08223f1f9bb293be670f354
3
+ size 4915685072
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33edd933eba6633abf140bb94f5e2f1b6632cf936a790f58400d07ec2df8b602
3
+ size 4932752832
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d092fc7551ec532351529e9e9f7df911e5748bfbe4083b17114848ddae75194
3
+ size 4796984024
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6263e4575c902f940b9c52a36144a748ba46d92ae0cbfca5689d110594184372
3
+ size 734003344
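The four .safetensors entries above are Git LFS pointer files rather than the shard contents themselves: each pointer stores the LFS spec version, the sha256 object id, and the shard size in bytes, and the actual weights are fetched by LFS at checkout. Below is a minimal sketch of reading such a pointer, assuming the working tree still contains the pointer text (for example after cloning with GIT_LFS_SKIP_SMUDGE=1):

def read_lfs_pointer(path):
    # An LFS pointer is a handful of "key value" lines: version, oid, size.
    fields = {}
    with open(path, "r") as f:
        for line in f:
            if line.strip():
                key, value = line.strip().split(" ", 1)
                fields[key] = value
    return fields["oid"], int(fields["size"])

oid, size = read_lfs_pointer("model-00004-of-00004.safetensors")
print(oid, size)  # expected: sha256:6263e457... 734003344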
model.safetensors.index.json ADDED
@@ -0,0 +1,789 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 15379320288
4
+ },
5
+ "weight_map": {
6
+ "language_model.lm_head.weight": "model-00004-of-00004.safetensors",
7
+ "language_model.model.embed_tokens.weight": "model-00001-of-00004.safetensors",
8
+ "language_model.model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
9
+ "language_model.model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
10
+ "language_model.model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
11
+ "language_model.model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
12
+ "language_model.model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
13
+ "language_model.model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
14
+ "language_model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
15
+ "language_model.model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
16
+ "language_model.model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
17
+ "language_model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
18
+ "language_model.model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
19
+ "language_model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
20
+ "language_model.model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
21
+ "language_model.model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
22
+ "language_model.model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
23
+ "language_model.model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
24
+ "language_model.model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
25
+ "language_model.model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
26
+ "language_model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
27
+ "language_model.model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
28
+ "language_model.model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
29
+ "language_model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
30
+ "language_model.model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
31
+ "language_model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
32
+ "language_model.model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
33
+ "language_model.model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
34
+ "language_model.model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
35
+ "language_model.model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
36
+ "language_model.model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
37
+ "language_model.model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
38
+ "language_model.model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
39
+ "language_model.model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
40
+ "language_model.model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
41
+ "language_model.model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
42
+ "language_model.model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
43
+ "language_model.model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
44
+ "language_model.model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
45
+ "language_model.model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
46
+ "language_model.model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
47
+ "language_model.model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
48
+ "language_model.model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
49
+ "language_model.model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
50
+ "language_model.model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
51
+ "language_model.model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
52
+ "language_model.model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
53
+ "language_model.model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
54
+ "language_model.model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
55
+ "language_model.model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
56
+ "language_model.model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
57
+ "language_model.model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
58
+ "language_model.model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
59
+ "language_model.model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
60
+ "language_model.model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
61
+ "language_model.model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
62
+ "language_model.model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
63
+ "language_model.model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
64
+ "language_model.model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
65
+ "language_model.model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
66
+ "language_model.model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
67
+ "language_model.model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
68
+ "language_model.model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
69
+ "language_model.model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
70
+ "language_model.model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
71
+ "language_model.model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
72
+ "language_model.model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
73
+ "language_model.model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
74
+ "language_model.model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
75
+ "language_model.model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
76
+ "language_model.model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
77
+ "language_model.model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
78
+ "language_model.model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
79
+ "language_model.model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
80
+ "language_model.model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
81
+ "language_model.model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
82
+ "language_model.model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
83
+ "language_model.model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
84
+ "language_model.model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
85
+ "language_model.model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
86
+ "language_model.model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
87
+ "language_model.model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
88
+ "language_model.model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
89
+ "language_model.model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
90
+ "language_model.model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
91
+ "language_model.model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
92
+ "language_model.model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
93
+ "language_model.model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
94
+ "language_model.model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
95
+ "language_model.model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
96
+ "language_model.model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
97
+ "language_model.model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
98
+ "language_model.model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
99
+ "language_model.model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
100
+ "language_model.model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
101
+ "language_model.model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
102
+ "language_model.model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
103
+ "language_model.model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
104
+ "language_model.model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
105
+ "language_model.model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
106
+ "language_model.model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
107
+ "language_model.model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
108
+ "language_model.model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
109
+ "language_model.model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
110
+ "language_model.model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
111
+ "language_model.model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
112
+ "language_model.model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
113
+ "language_model.model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
114
+ "language_model.model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
115
+ "language_model.model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
116
+ "language_model.model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors",
117
+ "language_model.model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
118
+ "language_model.model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
119
+ "language_model.model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
120
+ "language_model.model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
121
+ "language_model.model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
122
+ "language_model.model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
123
+ "language_model.model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
124
+ "language_model.model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
125
+ "language_model.model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
126
+ "language_model.model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
127
+ "language_model.model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
128
+ "language_model.model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
129
+ "language_model.model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
130
+ "language_model.model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
131
+ "language_model.model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
132
+ "language_model.model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
133
+ "language_model.model.layers.18.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
134
+ "language_model.model.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
135
+ "language_model.model.layers.18.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
136
+ "language_model.model.layers.18.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
137
+ "language_model.model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
138
+ "language_model.model.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
139
+ "language_model.model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
140
+ "language_model.model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
141
+ "language_model.model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
142
+ "language_model.model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
143
+ "language_model.model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
144
+ "language_model.model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
145
+ "language_model.model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
146
+ "language_model.model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
147
+ "language_model.model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
148
+ "language_model.model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
149
+ "language_model.model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
150
+ "language_model.model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
151
+ "language_model.model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
152
+ "language_model.model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
153
+ "language_model.model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
154
+ "language_model.model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
155
+ "language_model.model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
156
+ "language_model.model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
157
+ "language_model.model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
158
+ "language_model.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
159
+ "language_model.model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
160
+ "language_model.model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
161
+ "language_model.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
162
+ "language_model.model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
163
+ "language_model.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
164
+ "language_model.model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
165
+ "language_model.model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
166
+ "language_model.model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
167
+ "language_model.model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
168
+ "language_model.model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
169
+ "language_model.model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
170
+ "language_model.model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
171
+ "language_model.model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
172
+ "language_model.model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
173
+ "language_model.model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
174
+ "language_model.model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
175
+ "language_model.model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
176
+ "language_model.model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
177
+ "language_model.model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
178
+ "language_model.model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
179
+ "language_model.model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
180
+ "language_model.model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
181
+ "language_model.model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
182
+ "language_model.model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
183
+ "language_model.model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
184
+ "language_model.model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
185
+ "language_model.model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
186
+ "language_model.model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
187
+ "language_model.model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
188
+ "language_model.model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
189
+ "language_model.model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
190
+ "language_model.model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
191
+ "language_model.model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
192
+ "language_model.model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
193
+ "language_model.model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
194
+ "language_model.model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
195
+ "language_model.model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
196
+ "language_model.model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
197
+ "language_model.model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
198
+ "language_model.model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
199
+ "language_model.model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
200
+ "language_model.model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
201
+ "language_model.model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
202
+ "language_model.model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
203
+ "language_model.model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
204
+ "language_model.model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
205
+ "language_model.model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
206
+ "language_model.model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
207
+ "language_model.model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
208
+ "language_model.model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
209
+ "language_model.model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
210
+ "language_model.model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
211
+ "language_model.model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
212
+ "language_model.model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
213
+ "language_model.model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
214
+ "language_model.model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
215
+ "language_model.model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
216
+ "language_model.model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
217
+ "language_model.model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
218
+ "language_model.model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
219
+ "language_model.model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
220
+ "language_model.model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
221
+ "language_model.model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
222
+ "language_model.model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
223
+ "language_model.model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
224
+ "language_model.model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
225
+ "language_model.model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
226
+ "language_model.model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
227
+ "language_model.model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
228
+ "language_model.model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
229
+ "language_model.model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
230
+ "language_model.model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
231
+ "language_model.model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
232
+ "language_model.model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
233
+ "language_model.model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
234
+ "language_model.model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
235
+ "language_model.model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
236
+ "language_model.model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
237
+ "language_model.model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
238
+ "language_model.model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
239
+ "language_model.model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
240
+ "language_model.model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
241
+ "language_model.model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
242
+ "language_model.model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
243
+ "language_model.model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
244
+ "language_model.model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
245
+ "language_model.model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
246
+ "language_model.model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
247
+ "language_model.model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
248
+ "language_model.model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
249
+ "language_model.model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
250
+ "language_model.model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
251
+ "language_model.model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
252
+ "language_model.model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
253
+ "language_model.model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
254
+ "language_model.model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
255
+ "language_model.model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
256
+ "language_model.model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
257
+ "language_model.model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
258
+ "language_model.model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
259
+ "language_model.model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
260
+ "language_model.model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
261
+ "language_model.model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
262
+ "language_model.model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
263
+ "language_model.model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
264
+ "language_model.model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
265
+ "language_model.model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
266
+ "language_model.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
267
+ "language_model.model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
268
+ "language_model.model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
269
+ "language_model.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
270
+ "language_model.model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
271
+ "language_model.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
272
+ "language_model.model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
273
+ "language_model.model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
274
+ "language_model.model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
275
+ "language_model.model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
276
+ "language_model.model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
277
+ "language_model.model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
278
+ "language_model.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
279
+ "language_model.model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
280
+ "language_model.model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
281
+ "language_model.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
282
+ "language_model.model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
283
+ "language_model.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
284
+ "language_model.model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
285
+ "language_model.model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
286
+ "language_model.model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
287
+ "language_model.model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
288
+ "language_model.model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
289
+ "language_model.model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
290
+ "language_model.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
291
+ "language_model.model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
292
+ "language_model.model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
293
+ "language_model.model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
294
+ "language_model.model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
295
+ "language_model.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
296
+ "language_model.model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
297
+ "language_model.model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
298
+ "language_model.model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
299
+ "language_model.model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
300
+ "language_model.model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
301
+ "language_model.model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
302
+ "language_model.model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
303
+ "language_model.model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
304
+ "language_model.model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
305
+ "language_model.model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
306
+ "language_model.model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
307
+ "language_model.model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
308
+ "language_model.model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
309
+ "language_model.model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
310
+ "language_model.model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
311
+ "language_model.model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
312
+ "language_model.model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
313
+ "language_model.model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
314
+ "language_model.model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
315
+ "language_model.model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
316
+ "language_model.model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
317
+ "language_model.model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
318
+ "language_model.model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
319
+ "language_model.model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
320
+ "language_model.model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
321
+ "language_model.model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
322
+ "language_model.model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
323
+ "language_model.model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
324
+ "language_model.model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
325
+ "language_model.model.layers.8.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
326
+ "language_model.model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
327
+ "language_model.model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
328
+ "language_model.model.layers.8.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
329
+ "language_model.model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
330
+ "language_model.model.layers.8.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
331
+ "language_model.model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
332
+ "language_model.model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
333
+ "language_model.model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
334
+ "language_model.model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
335
+ "language_model.model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
336
+ "language_model.model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
337
+ "language_model.model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
338
+ "language_model.model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
339
+ "language_model.model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
340
+ "language_model.model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
341
+ "language_model.model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
342
+ "language_model.model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
343
+ "language_model.model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
344
+ "language_model.model.norm.weight": "model-00003-of-00004.safetensors",
345
+ "multi_modal_projector.mlp.mlp.0.bias": "model-00001-of-00004.safetensors",
346
+ "multi_modal_projector.mlp.mlp.0.weight": "model-00001-of-00004.safetensors",
347
+ "multi_modal_projector.mlp.mlp.2.bias": "model-00001-of-00004.safetensors",
348
+ "multi_modal_projector.mlp.mlp.2.weight": "model-00001-of-00004.safetensors",
349
+ "multi_modal_projector.peg.peg.0.bias": "model-00001-of-00004.safetensors",
350
+ "multi_modal_projector.peg.peg.0.weight": "model-00001-of-00004.safetensors",
351
+ "vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00004.safetensors",
352
+ "vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00004.safetensors",
353
+ "vision_tower.vision_model.embeddings.position_embedding.weight": "model-00001-of-00004.safetensors",
354
+ "vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00004.safetensors",
355
+ "vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00004.safetensors",
356
+ "vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00004.safetensors",
357
+ "vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00004.safetensors",
358
+ "vision_tower.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00004.safetensors",
359
+ "vision_tower.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00004.safetensors",
360
+ "vision_tower.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00004.safetensors",
361
+ "vision_tower.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00004.safetensors",
362
+ "vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
363
+ "vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
364
+ "vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
365
+ "vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
366
+ "vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
367
+ "vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
368
+ "vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
369
+ "vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
370
+ "vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00004.safetensors",
371
+ "vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00004.safetensors",
372
+ "vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00004.safetensors",
373
+ "vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00004.safetensors",
374
+ "vision_tower.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00004.safetensors",
375
+ "vision_tower.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00004.safetensors",
376
+ "vision_tower.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00004.safetensors",
377
+ "vision_tower.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00004.safetensors",
378
+ "vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
379
+ "vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
380
+ "vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
381
+ "vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
382
+ "vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
383
+ "vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
384
+ "vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
385
+ "vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
386
+ "vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00004.safetensors",
387
+ "vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00004.safetensors",
388
+ "vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00004.safetensors",
389
+ "vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00004.safetensors",
390
+ "vision_tower.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00004.safetensors",
391
+ "vision_tower.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00004.safetensors",
392
+ "vision_tower.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00004.safetensors",
393
+ "vision_tower.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00004.safetensors",
394
+ "vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
395
+ "vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
396
+ "vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
397
+ "vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
398
+ "vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
399
+ "vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
400
+ "vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
401
+ "vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
402
+ "vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00004.safetensors",
403
+ "vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00004.safetensors",
404
+ "vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00004.safetensors",
405
+ "vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00004.safetensors",
406
+ "vision_tower.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00004.safetensors",
407
+ "vision_tower.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00004.safetensors",
408
+ "vision_tower.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00004.safetensors",
409
+ "vision_tower.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00004.safetensors",
410
+ "vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
411
+ "vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
412
+ "vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
413
+ "vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
414
+ "vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
415
+ "vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
416
+ "vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
417
+ "vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
418
+ "vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00004.safetensors",
419
+ "vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00004.safetensors",
420
+ "vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00004.safetensors",
421
+ "vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00004.safetensors",
422
+ "vision_tower.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00004.safetensors",
423
+ "vision_tower.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00004.safetensors",
424
+ "vision_tower.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00004.safetensors",
425
+ "vision_tower.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00004.safetensors",
426
+ "vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
427
+ "vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
428
+ "vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
429
+ "vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
430
+ "vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
431
+ "vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
432
+ "vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
433
+ "vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
434
+ "vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00004.safetensors",
435
+ "vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00004.safetensors",
436
+ "vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00004.safetensors",
437
+ "vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00004.safetensors",
438
+ "vision_tower.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00004.safetensors",
439
+ "vision_tower.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00004.safetensors",
440
+ "vision_tower.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00004.safetensors",
441
+ "vision_tower.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00004.safetensors",
442
+ "vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
443
+ "vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
444
+ "vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
445
+ "vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
446
+ "vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
447
+ "vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
448
+ "vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
449
+ "vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
450
+ "vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00004.safetensors",
451
+ "vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00004.safetensors",
452
+ "vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00004.safetensors",
453
+ "vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00004.safetensors",
454
+ "vision_tower.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00004.safetensors",
455
+ "vision_tower.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00004.safetensors",
456
+ "vision_tower.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00004.safetensors",
457
+ "vision_tower.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00004.safetensors",
458
+ "vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
459
+ "vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
460
+ "vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
461
+ "vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
462
+ "vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
463
+ "vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
464
+ "vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
465
+ "vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
466
+ "vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00004.safetensors",
467
+ "vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00004.safetensors",
468
+ "vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00004.safetensors",
469
+ "vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00004.safetensors",
470
+ "vision_tower.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00004.safetensors",
471
+ "vision_tower.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00004.safetensors",
472
+ "vision_tower.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00004.safetensors",
473
+ "vision_tower.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00004.safetensors",
474
+ "vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
475
+ "vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
476
+ "vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
477
+ "vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
478
+ "vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
479
+ "vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
480
+ "vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
481
+ "vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
482
+ "vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00004.safetensors",
483
+ "vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00004.safetensors",
484
+ "vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00004.safetensors",
485
+ "vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00004.safetensors",
486
+ "vision_tower.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00004.safetensors",
487
+ "vision_tower.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00004.safetensors",
488
+ "vision_tower.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00004.safetensors",
489
+ "vision_tower.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00004.safetensors",
490
+ "vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
491
+ "vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
492
+ "vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
493
+ "vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
494
+ "vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
495
+ "vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
496
+ "vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
497
+ "vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
498
+ "vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00004.safetensors",
499
+ "vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00004.safetensors",
500
+ "vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00004.safetensors",
501
+ "vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00004.safetensors",
502
+ "vision_tower.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00004.safetensors",
503
+ "vision_tower.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00004.safetensors",
504
+ "vision_tower.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00004.safetensors",
505
+ "vision_tower.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00004.safetensors",
506
+ "vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
507
+ "vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
508
+ "vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
509
+ "vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
510
+ "vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
511
+ "vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
512
+ "vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
513
+ "vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
514
+ "vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00004.safetensors",
515
+ "vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00004.safetensors",
516
+ "vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00004.safetensors",
517
+ "vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00004.safetensors",
518
+ "vision_tower.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00004.safetensors",
519
+ "vision_tower.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00004.safetensors",
520
+ "vision_tower.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00004.safetensors",
521
+ "vision_tower.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00004.safetensors",
522
+ "vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
523
+ "vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
524
+ "vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
525
+ "vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
526
+ "vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
527
+ "vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
528
+ "vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
529
+ "vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
530
+ "vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00004.safetensors",
531
+ "vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00004.safetensors",
532
+ "vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00004.safetensors",
533
+ "vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00004.safetensors",
534
+ "vision_tower.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00004.safetensors",
535
+ "vision_tower.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00004.safetensors",
536
+ "vision_tower.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00004.safetensors",
537
+ "vision_tower.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00004.safetensors",
538
+ "vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
539
+ "vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
540
+ "vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
541
+ "vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
542
+ "vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
543
+ "vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
544
+ "vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
545
+ "vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
546
+ "vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00004.safetensors",
547
+ "vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00004.safetensors",
548
+ "vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00004.safetensors",
549
+ "vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00004.safetensors",
550
+ "vision_tower.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00004.safetensors",
551
+ "vision_tower.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00004.safetensors",
552
+ "vision_tower.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00004.safetensors",
553
+ "vision_tower.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00004.safetensors",
554
+ "vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
555
+ "vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
556
+ "vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
557
+ "vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
558
+ "vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
559
+ "vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
560
+ "vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
561
+ "vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
562
+ "vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00004.safetensors",
563
+ "vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00004.safetensors",
564
+ "vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00004.safetensors",
565
+ "vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00004.safetensors",
566
+ "vision_tower.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00004.safetensors",
567
+ "vision_tower.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00004.safetensors",
568
+ "vision_tower.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00004.safetensors",
569
+ "vision_tower.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00004.safetensors",
570
+ "vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
571
+ "vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
572
+ "vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
573
+ "vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
574
+ "vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
575
+ "vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
576
+ "vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
577
+ "vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
578
+ "vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00004.safetensors",
579
+ "vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00004.safetensors",
580
+ "vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00004.safetensors",
581
+ "vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00004.safetensors",
582
+ "vision_tower.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00004.safetensors",
583
+ "vision_tower.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00004.safetensors",
584
+ "vision_tower.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00004.safetensors",
585
+ "vision_tower.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00004.safetensors",
586
+ "vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
587
+ "vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
588
+ "vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
589
+ "vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
590
+ "vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
591
+ "vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
592
+ "vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
593
+ "vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
594
+ "vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00004.safetensors",
595
+ "vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00004.safetensors",
596
+ "vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00004.safetensors",
597
+ "vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00004.safetensors",
598
+ "vision_tower.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00004.safetensors",
599
+ "vision_tower.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00004.safetensors",
600
+ "vision_tower.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00004.safetensors",
601
+ "vision_tower.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00004.safetensors",
602
+ "vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
603
+ "vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
604
+ "vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
605
+ "vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
606
+ "vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
607
+ "vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
608
+ "vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
609
+ "vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
610
+ "vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00004.safetensors",
611
+ "vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00004.safetensors",
612
+ "vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00004.safetensors",
613
+ "vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00004.safetensors",
614
+ "vision_tower.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00004.safetensors",
615
+ "vision_tower.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00004.safetensors",
616
+ "vision_tower.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00004.safetensors",
617
+ "vision_tower.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00004.safetensors",
618
+ "vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
619
+ "vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
620
+ "vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
621
+ "vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
622
+ "vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
623
+ "vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
624
+ "vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
625
+ "vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
626
+ "vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00004.safetensors",
627
+ "vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00004.safetensors",
628
+ "vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00004.safetensors",
629
+ "vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00004.safetensors",
630
+ "vision_tower.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00004.safetensors",
631
+ "vision_tower.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00004.safetensors",
632
+ "vision_tower.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00004.safetensors",
633
+ "vision_tower.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00004.safetensors",
634
+ "vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
635
+ "vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
636
+ "vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
637
+ "vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
638
+ "vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
639
+ "vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
640
+ "vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
641
+ "vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
642
+ "vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00004.safetensors",
643
+ "vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00004.safetensors",
644
+ "vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00004.safetensors",
645
+ "vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00004.safetensors",
646
+ "vision_tower.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00004.safetensors",
647
+ "vision_tower.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00004.safetensors",
648
+ "vision_tower.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00004.safetensors",
649
+ "vision_tower.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00004.safetensors",
650
+ "vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
651
+ "vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
652
+ "vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
653
+ "vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
654
+ "vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
655
+ "vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
656
+ "vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
657
+ "vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
658
+ "vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00004.safetensors",
659
+ "vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00004.safetensors",
660
+ "vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00004.safetensors",
661
+ "vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00004.safetensors",
662
+ "vision_tower.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00004.safetensors",
663
+ "vision_tower.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00004.safetensors",
664
+ "vision_tower.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00004.safetensors",
665
+ "vision_tower.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00004.safetensors",
666
+ "vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
667
+ "vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
668
+ "vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
669
+ "vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
670
+ "vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
671
+ "vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
672
+ "vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
673
+ "vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
674
+ "vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00004.safetensors",
675
+ "vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00004.safetensors",
676
+ "vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00004.safetensors",
677
+ "vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00004.safetensors",
678
+ "vision_tower.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00004.safetensors",
679
+ "vision_tower.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00004.safetensors",
680
+ "vision_tower.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00004.safetensors",
681
+ "vision_tower.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00004.safetensors",
682
+ "vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
683
+ "vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
684
+ "vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
685
+ "vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
686
+ "vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
687
+ "vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
688
+ "vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
689
+ "vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
690
+ "vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00004.safetensors",
691
+ "vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00004.safetensors",
692
+ "vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00004.safetensors",
693
+ "vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00004.safetensors",
694
+ "vision_tower.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00004.safetensors",
695
+ "vision_tower.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00004.safetensors",
696
+ "vision_tower.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00004.safetensors",
697
+ "vision_tower.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00004.safetensors",
698
+ "vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
699
+ "vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
700
+ "vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
701
+ "vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
702
+ "vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
703
+ "vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
704
+ "vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
705
+ "vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
706
+ "vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00004.safetensors",
707
+ "vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00004.safetensors",
708
+ "vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00004.safetensors",
709
+ "vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00004.safetensors",
710
+ "vision_tower.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00004.safetensors",
711
+ "vision_tower.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00004.safetensors",
712
+ "vision_tower.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00004.safetensors",
713
+ "vision_tower.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00004.safetensors",
714
+ "vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
715
+ "vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
716
+ "vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
717
+ "vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
718
+ "vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
719
+ "vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
720
+ "vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
721
+ "vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
722
+ "vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00004.safetensors",
723
+ "vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00004.safetensors",
724
+ "vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00004.safetensors",
725
+ "vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00004.safetensors",
726
+ "vision_tower.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00004.safetensors",
727
+ "vision_tower.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00004.safetensors",
728
+ "vision_tower.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00004.safetensors",
729
+ "vision_tower.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00004.safetensors",
730
+ "vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
731
+ "vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
732
+ "vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
733
+ "vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
734
+ "vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
735
+ "vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
736
+ "vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
737
+ "vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
738
+ "vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00004.safetensors",
739
+ "vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00004.safetensors",
740
+ "vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00004.safetensors",
741
+ "vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00004.safetensors",
742
+ "vision_tower.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00004.safetensors",
743
+ "vision_tower.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00004.safetensors",
744
+ "vision_tower.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00004.safetensors",
745
+ "vision_tower.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00004.safetensors",
746
+ "vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
747
+ "vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
748
+ "vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
749
+ "vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
750
+ "vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
751
+ "vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
752
+ "vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
753
+ "vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
754
+ "vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00004.safetensors",
755
+ "vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00004.safetensors",
756
+ "vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00004.safetensors",
757
+ "vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00004.safetensors",
758
+ "vision_tower.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00004.safetensors",
759
+ "vision_tower.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00004.safetensors",
760
+ "vision_tower.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00004.safetensors",
761
+ "vision_tower.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00004.safetensors",
762
+ "vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
763
+ "vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
764
+ "vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
765
+ "vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
766
+ "vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
767
+ "vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
768
+ "vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
769
+ "vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
770
+ "vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00004.safetensors",
771
+ "vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00004.safetensors",
772
+ "vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00004.safetensors",
773
+ "vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00004.safetensors",
774
+ "vision_tower.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00004.safetensors",
775
+ "vision_tower.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00004.safetensors",
776
+ "vision_tower.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00004.safetensors",
777
+ "vision_tower.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00004.safetensors",
778
+ "vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
779
+ "vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
780
+ "vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00004.safetensors",
781
+ "vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00004.safetensors",
782
+ "vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
783
+ "vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
784
+ "vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
785
+ "vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
786
+ "vision_tower.vision_model.post_layernorm.bias": "model-00001-of-00004.safetensors",
787
+ "vision_tower.vision_model.post_layernorm.weight": "model-00001-of-00004.safetensors"
788
+ }
789
+ }
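Note on the index above: the weight_map in model.safetensors.index.json simply maps every parameter name to the shard file that stores it, and from_pretrained reads it to fetch only the shards it needs. A minimal, standard-library-only sketch of grouping parameters by shard (the file name is the conventional one, not something specific to this commit):

import json
from collections import defaultdict

# Read the sharded-checkpoint index produced by save_pretrained().
with open("model.safetensors.index.json") as f:
    index = json.load(f)

# Group parameter names by the shard file that holds them.
shards = defaultdict(list)
for param_name, shard_file in index["weight_map"].items():
    shards[shard_file].append(param_name)

for shard_file, names in sorted(shards.items()):
    print(f"{shard_file}: {len(names)} tensors")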
modeling_ax4vl.py ADDED
@@ -0,0 +1,385 @@
1
+ """
2
+ base code: LLaVA-Next (transformers==4.49.0)
3
+ """
4
+ from typing import List, Optional, Tuple, Union
5
+ import math
6
+
7
+ import torch
8
+ import torch.utils.checkpoint
9
+ from torch import nn
10
+ from .configuration_ax4vl import LDPConfig, MLPProjectorConfig, AX4VLConfig
11
+
12
+ from transformers.activations import ACT2FN
13
+ from transformers.generation import GenerationMixin
14
+ from transformers.models.auto import AutoModel, AutoModelForCausalLM
15
+ from transformers.utils import (
16
+ is_torchdynamo_compiling,
17
+ logging,
18
+ )
19
+ from transformers.configuration_utils import PretrainedConfig
20
+ from transformers.modeling_utils import PreTrainedModel
21
+ from transformers.modeling_outputs import ModelOutput
22
+ from dataclasses import dataclass
23
+
24
+
25
+
26
+ logger = logging.get_logger(__name__)
27
+
28
+ def build_projector(config):
29
+ if config.model_type == "ldpnetv2_projector":
30
+ return LDPProjector(config)
31
+ else:
32
+ raise ValueError(f"Unknown projector type: {config.model_type}")
33
+
34
+ @dataclass
35
+ class AX4CausalLMOutputWithPast(ModelOutput):
36
+ loss: Optional[torch.FloatTensor] = None
37
+ logits: torch.FloatTensor = None
38
+ past_key_values: Optional[List[torch.FloatTensor]] = None
39
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
40
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
41
+ image_hidden_states: Optional[torch.FloatTensor] = None
42
+
43
+
44
+ class BaseAXPretrainedModel(PreTrainedModel):
45
+ config_class = PretrainedConfig
46
+ base_model_prefix = "model"
47
+ supports_gradient_checkpointing = True
48
+ _no_split_modules = ["AXVisionAttention"]
49
+ _skip_keys_device_placement = "past_key_values"
50
+ _supports_cache_class = True
51
+ _supports_flash_attn_2 = True
52
+ _supports_sdpa = True
53
+ _supports_quantized_cache = True
54
+ _supports_static_cache = True
55
+
56
+ def __init__(self, config: PretrainedConfig):
57
+ super().__init__(config)
58
+
59
+ def _init_weights(self, module):
60
+ # important: this ported version of LlavaNext isn't meant for training from scratch - only
61
+ # inference and fine-tuning - so the proper init weights code has been removed - the original codebase
62
+ # https://github.com/haotian-liu/LLaVA/tree/main/llava_next should serve for that purpose
63
+ std = (
64
+ self.config.initializer_range
65
+ if hasattr(self.config, "initializer_range")
66
+ else self.config.text_config.initializer_range
67
+ )
68
+
69
+ if hasattr(module, "class_embedding"):
70
+ module.class_embedding.data.normal_(mean=0.0, std=std)
71
+
72
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
73
+ module.weight.data.normal_(mean=0.0, std=std)
74
+ if module.bias is not None:
75
+ module.bias.data.zero_()
76
+ elif isinstance(module, nn.Embedding):
77
+ module.weight.data.normal_(mean=0.0, std=std)
78
+ if module.padding_idx is not None:
79
+ module.weight.data[module.padding_idx].zero_()
80
+
81
+
82
+
83
+ class AX4VLForConditionalGeneration(BaseAXPretrainedModel, GenerationMixin):
84
+ config_class = AX4VLConfig
85
+
86
+ def __init__(self, config: AX4VLConfig):
87
+ super().__init__(config)
88
+ self.vision_tower = AutoModel.from_config(config.vision_config)
89
+
90
+ self.multi_modal_projector = build_projector(config.projector_config)
91
+ self.vocab_size = config.text_config.vocab_size
92
+ self.language_model = AutoModelForCausalLM.from_config(config.text_config)
93
+ if self.language_model._tied_weights_keys is not None:
94
+ self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys]
95
+
96
+ self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
97
+ self.post_init()
98
+
99
+ def get_input_embeddings(self):
100
+ return self.language_model.get_input_embeddings()
101
+
102
+ def set_input_embeddings(self, value):
103
+ self.language_model.set_input_embeddings(value)
104
+
105
+ def get_output_embeddings(self):
106
+ return self.language_model.get_output_embeddings()
107
+
108
+ def set_output_embeddings(self, new_embeddings):
109
+ self.language_model.set_output_embeddings(new_embeddings)
110
+
111
+ def set_decoder(self, decoder):
112
+ self.language_model.set_decoder(decoder)
113
+
114
+ def get_decoder(self):
115
+ return self.language_model.get_decoder()
116
+
117
+ def get_image_features(
118
+ self,
119
+ pixel_values: torch.FloatTensor,
120
+ vision_feature_layer: Union[int, List[int]],
121
+ vision_feature_select_strategy: str,
122
+ ):
123
+ if pixel_values.dim() != 4:
124
+ # otherwise has to be stacked from list of (num_patches, num_channels, height, width)
125
+ raise ValueError(f"pixel_values of shape {pixel_values.shape}, expect to be of 4 or 5 dimensions")
126
+
127
+ image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
128
+ # If we have one vision feature layer, return the corresponding hidden states,
129
+ # otherwise, select the hidden states of each feature layer and concatenate them
130
+ if isinstance(vision_feature_layer, int):
131
+ if vision_feature_layer == 0:
132
+ selected_image_feature = image_outputs.last_hidden_state
133
+ else:
134
+ selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
135
+ else:
136
+ hs_pool = [image_outputs.hidden_states[layer_idx] for layer_idx in vision_feature_layer]
137
+ selected_image_feature = torch.cat(hs_pool, dim=-1)
138
+
139
+ if vision_feature_select_strategy == "default":
140
+ selected_image_feature = selected_image_feature[:, 1:]
141
+ elif vision_feature_select_strategy == "full":
142
+ selected_image_feature = selected_image_feature
143
+
144
+ image_features = self.multi_modal_projector(selected_image_feature)
145
+ return image_features
146
+
147
+ def forward(
148
+ self,
149
+ input_ids: torch.LongTensor = None,
150
+ pixel_values: torch.FloatTensor = None,
151
+ attention_mask: Optional[torch.Tensor] = None,
152
+ position_ids: Optional[torch.LongTensor] = None,
153
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
154
+ inputs_embeds: Optional[torch.FloatTensor] = None,
155
+ vision_feature_layer: Optional[Union[int, List[int]]] = None,
156
+ vision_feature_select_strategy: Optional[str] = None,
157
+ labels: Optional[torch.LongTensor] = None,
158
+ use_cache: Optional[bool] = None,
159
+ output_attentions: Optional[bool] = None,
160
+ output_hidden_states: Optional[bool] = None,
161
+ return_dict: Optional[bool] = None,
162
+ cache_position: Optional[torch.LongTensor] = None,
163
+ logits_to_keep: Union[int, torch.Tensor] = 0,
164
+ **lm_kwargs,
165
+ ) -> Union[Tuple, AX4CausalLMOutputWithPast]:
166
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
167
+ output_hidden_states = (
168
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
169
+ )
170
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
171
+ vision_feature_layer = (
172
+ vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
173
+ )
174
+ vision_feature_select_strategy = (
175
+ vision_feature_select_strategy
176
+ if vision_feature_select_strategy is not None
177
+ else self.config.vision_feature_select_strategy
178
+ )
179
+
180
+ if (input_ids is None) ^ (inputs_embeds is not None):
181
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
182
+
183
+ if pixel_values is not None and inputs_embeds is not None:
184
+ raise ValueError(
185
+ "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
186
+ )
187
+
188
+ if inputs_embeds is None:
189
+ inputs_embeds = self.get_input_embeddings()(input_ids)
190
+
191
+ if pixel_values is not None and pixel_values.size(0) > 0:
192
+ image_features = self.get_image_features(
193
+ pixel_values,
194
+ vision_feature_layer=vision_feature_layer,
195
+ vision_feature_select_strategy=vision_feature_select_strategy,
196
+ )
197
+
198
+ special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
199
+ special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
200
+ if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel():
201
+ n_image_tokens = (input_ids == self.config.image_token_index).sum()
202
+ n_image_features = image_features.shape[0]
203
+ raise ValueError(
204
+ f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
205
+ )
206
+ image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
207
+ inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
208
+
209
+ outputs = self.language_model(
210
+ attention_mask=attention_mask,
211
+ position_ids=position_ids,
212
+ past_key_values=past_key_values,
213
+ inputs_embeds=inputs_embeds,
214
+ use_cache=use_cache,
215
+ output_attentions=output_attentions,
216
+ output_hidden_states=output_hidden_states,
217
+ return_dict=return_dict,
218
+ cache_position=cache_position,
219
+ logits_to_keep=logits_to_keep,
220
+ **lm_kwargs,
221
+ )
222
+
223
+ logits = outputs[0]
224
+
225
+ loss = None
226
+ if labels is not None:
227
+ # Shift so that tokens < n predict n
228
+ if attention_mask is not None:
229
+ # we use the input attention mask to shift the logits and labels, because it is 2D.
230
+ # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft
231
+ shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device)
232
+ shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous()
233
+ shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous()
234
+ else:
235
+ shift_logits = logits[..., :-1, :].contiguous()
236
+ shift_labels = labels[..., 1:].contiguous()
237
+ # Flatten the tokens
238
+ loss_fct = nn.CrossEntropyLoss()
239
+ loss = loss_fct(
240
+ shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device)
241
+ )
242
+
243
+ if not return_dict:
244
+ output = (logits,) + outputs[1:]
245
+ return (loss,) + output if loss is not None else output
246
+
247
+ return AX4CausalLMOutputWithPast(
248
+ loss=loss,
249
+ logits=logits,
250
+ past_key_values=outputs.past_key_values,
251
+ hidden_states=outputs.hidden_states,
252
+ attentions=outputs.attentions,
253
+ image_hidden_states=image_features if pixel_values is not None else None,
254
+ )
255
+
256
+ def prepare_inputs_for_generation(
257
+ self,
258
+ input_ids,
259
+ past_key_values=None,
260
+ inputs_embeds=None,
261
+ pixel_values=None,
262
+ image_sizes=None,
263
+ attention_mask=None,
264
+ cache_position=None,
265
+ logits_to_keep=None,
266
+ **kwargs,
267
+ ):
268
+ # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
269
+
270
+ model_inputs = self.language_model.prepare_inputs_for_generation(
271
+ input_ids,
272
+ past_key_values=past_key_values,
273
+ inputs_embeds=inputs_embeds,
274
+ attention_mask=attention_mask,
275
+ cache_position=cache_position,
276
+ logits_to_keep=logits_to_keep,
277
+ **kwargs,
278
+ )
279
+
280
+ # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
281
+ # Otherwise we need pixel values to be passed to model
282
+ if cache_position[0] == 0:
283
+ model_inputs["pixel_values"] = pixel_values
284
+ model_inputs["image_sizes"] = image_sizes
285
+
286
+ return model_inputs
287
+
288
+
289
+
290
+
291
+
292
+ class FeatureIRLayer(nn.Module):
293
+ def __init__(self, in_dim: int, out_dim: int) -> None:
294
+ super().__init__()
295
+ self.mlp = nn.Sequential(
296
+ nn.Linear(in_dim, out_dim), nn.GELU(), nn.Linear(out_dim, out_dim)
297
+ )
298
+
299
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
300
+ return self.mlp(x)
301
+
302
+
303
+ class TokenDownLayer(nn.Module):
304
+ def __init__(self, shape) -> None:
305
+ super().__init__()
306
+ self.dwn = nn.Sequential(
307
+ nn.AdaptiveAvgPool2d(shape)
308
+ )
309
+
310
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
311
+ b, num_tokens, c = x.shape
312
+ h = int(math.sqrt(num_tokens))
313
+ assert h * h == num_tokens
314
+ x = x.permute(0, 2, 1).reshape(b, -1, h, h)
315
+ x = self.dwn(x)
316
+ x = x.flatten(2).transpose(1, 2)
317
+ return x
318
+
319
+
320
+ class PosInjectLayer(nn.Module):
321
+ # https://github.com/Meituan-AutoML/Twins/blob/main/gvt.py
322
+ def __init__(
323
+ self,
324
+ in_dim: int,
325
+ out_dim: int,
326
+ stride: int = 1,
327
+ padding: int = 1,
328
+ shape = None) -> None:
329
+ super().__init__()
330
+ self.peg = nn.Sequential(
331
+ nn.Conv2d(in_dim, out_dim, 3, stride, padding, bias=True, groups=out_dim)
332
+ )
333
+ self.pool = None
334
+ if shape is not None:
335
+ self.pool = nn.Sequential(
336
+ nn.AdaptiveAvgPool2d(shape)
337
+ )
338
+
339
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
340
+ b, num_tokens, c = x.shape
341
+ h = int(math.sqrt(num_tokens))
342
+ assert h * h == num_tokens
343
+ cnn_feat = x.transpose(1, 2).view(b, c, h, h)
344
+ if self.pool is not None:
345
+ x = self.peg(cnn_feat) + self.pool(cnn_feat)
346
+ else:
347
+ x = self.peg(cnn_feat) + cnn_feat
348
+ x = x.flatten(2).transpose(1, 2)
349
+ return x
350
+
351
+ class LDPProjector(PreTrainedModel):
352
+ config_class = LDPConfig
353
+ _no_split_modules = []
354
+
355
+ def __init__(self, config):
356
+ super().__init__(config)
357
+ inc, ouc = config.in_hidden_size, config.out_hidden_size
358
+ grid = config.grid_size
359
+ self.mlp = FeatureIRLayer(inc, ouc)
360
+ self.dwn = TokenDownLayer((grid, grid))
361
+ self.peg = PosInjectLayer(ouc, ouc, stride=1)
362
+
363
+ def forward(self, x):
364
+ x = self.mlp(x)
365
+ x = self.dwn(x)
366
+ x = self.peg(x)
367
+ return x
368
+
369
+ class MLPProjector(PreTrainedModel):
370
+ config_class = MLPProjectorConfig
371
+ _no_split_modules = []
372
+
373
+ def __init__(self, config):
374
+ super().__init__(config)
375
+
376
+ self.linear_1 = nn.Linear(config.in_hidden_size, config.out_hidden_size, bias=config.bias)
377
+ self.act = ACT2FN[config.hidden_act]
378
+ self.linear_2 = nn.Linear(config.out_hidden_size, config.out_hidden_size, bias=config.bias)
379
+
380
+ def forward(self, image_features):
381
+ hidden_states = self.linear_1(image_features)
382
+ hidden_states = self.act(hidden_states)
383
+ hidden_states = self.linear_2(hidden_states)
384
+ return hidden_states
385
+
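For orientation, AX4VLForConditionalGeneration above wires a SigLIP-style vision tower and an LDP projector into a causal language model, replacing image-token embeddings with projected vision features before the language model runs. A minimal, untested loading sketch, assuming the repository's config.json (not part of this excerpt) exposes the class through auto_map and that remote code is trusted; in practice one would normally build the prompt with processor.apply_chat_template rather than raw text:

import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

repo = "skt/A.X-4.0-VL-Light"
processor = AutoProcessor.from_pretrained(repo, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="auto"
)

# "<|extra_id_11|>" is the image placeholder declared in processor_config.json below.
text = "<|extra_id_11|>\nDescribe this image."
image = Image.open("example.jpg")  # illustrative path
inputs = processor(images=image, text=text, return_tensors="pt").to(model.device)

with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=64)
print(processor.decode(output_ids[0], skip_special_tokens=True))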
preprocessor_config.json ADDED
@@ -0,0 +1,32 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "image_processing_ax4vl.AX4VLImageProcessor",
4
+ "AutoProcessor": "processing_ax4vl.AX4VLProcessor"
5
+ },
6
+ "do_convert_rgb": true,
7
+ "do_normalize": true,
8
+ "do_pad": false,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "do_tile_pad": false,
12
+ "image_mean": [
13
+ 0.5,
14
+ 0.5,
15
+ 0.5
16
+ ],
17
+ "image_processor_type": "AX4VLImageProcessor",
18
+ "image_std": [
19
+ 0.5,
20
+ 0.5,
21
+ 0.5
22
+ ],
23
+ "max_num_tiles": 12,
24
+ "min_num_tiles": 1,
25
+ "processor_class": "AX4VLProcessor",
26
+ "resample": 2,
27
+ "rescale_factor": 0.00392156862745098,
28
+ "size": {
29
+ "shortest_edge": 384
30
+ },
31
+ "use_thumbnail": true
32
+ }
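To make the preprocessing arithmetic above explicit: rescale_factor is 1/255, and with mean and std both 0.5 the normalization maps pixel values into [-1, 1]; the shortest image edge is resized to 384, and each image is split into between 1 and 12 tiles (plus a thumbnail when use_thumbnail is true). A tiny check of the value range, purely as illustration:

rescale_factor = 0.00392156862745098   # 1 / 255
mean, std = 0.5, 0.5
for pixel in (0, 255):
    x = pixel * rescale_factor          # -> 0.0 or 1.0
    x = (x - mean) / std                # -> -1.0 or 1.0
    print(pixel, x)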
processing_ax4vl.py ADDED
@@ -0,0 +1,121 @@
1
+ from typing import List, Union
2
+ from .configuration_ax4vl import AX4VLConfig
3
+ from transformers.models.auto import AutoProcessor
4
+ from transformers.feature_extraction_utils import BatchFeature
5
+ from transformers.image_utils import ImageInput
6
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
7
+ from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, _validate_images_text_input_order
8
+
9
+
10
+
11
+ class BaseAXProcessor(ProcessorMixin):
12
+ attributes = ["image_processor", "tokenizer"]
13
+ image_processor_class = "AutoImageProcessor"
14
+ tokenizer_class = "AutoTokenizer"
15
+
16
+
17
+ class AX4VLProcessorKwargs(ProcessingKwargs, total=False):
18
+ _defaults = {
19
+ "text_kwargs": {
20
+ "padding": False,
21
+ },
22
+ "images_kwargs": {
23
+ "do_pad": False,
24
+ },
25
+ }
26
+
27
+
28
+ class AX4VLProcessor(BaseAXProcessor):
29
+ valid_kwargs = [
30
+ "chat_template",
31
+ "patch_size",
32
+ "num_tokens_per_tile",
33
+ "image_token",
34
+ ]
35
+
36
+ def __init__(
37
+ self,
38
+ image_processor=None,
39
+ tokenizer=None,
40
+ patch_size=16,
41
+ num_tokens_per_tile=144,
42
+ image_token="<image>", # set the default and let users change if they have peculiar special tokens in rare cases
43
+ chat_template=None,
44
+ **kwargs
45
+ ):
46
+ self.patch_size = patch_size
47
+ self.num_tokens_per_tile = num_tokens_per_tile
48
+ self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
49
+ super().__init__(image_processor, tokenizer, chat_template=chat_template)
50
+
51
+ def __call__(
52
+ self,
53
+ images: ImageInput = None,
54
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
55
+ conversations: List = None,
56
+ **kwargs
57
+ ) -> BatchFeature:
58
+ if images is None and conversations is None and text is None:
59
+ raise ValueError("You have to specify at least images, text or conversation.")
60
+
61
+ if not text and conversations is not None:
62
+ if isinstance(conversations[0], dict):
63
+ conversations = [conversations]
64
+ text = [self.apply_chat_template(conv, **kwargs) for conv in conversations]
65
+
66
+ images, text = _validate_images_text_input_order(images, text)
67
+
68
+ output_kwargs = self._merge_kwargs(
69
+ AX4VLProcessorKwargs,
70
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
71
+ **kwargs,
72
+ )
73
+
74
+ if images is not None:
75
+ image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
76
+ else:
77
+ image_inputs = {}
78
+
79
+ prompt_strings = text
80
+ if image_inputs:
81
+ num_tiles = iter(image_inputs["num_tiles"])
82
+ prompt_strings = []
83
+ for sample in text:
84
+ while self.image_token in sample:
85
+ num_tile = next(num_tiles)
86
+ num_image_tokens = num_tile * self.num_tokens_per_tile
87
+ sample = sample.replace(self.image_token, "<placeholder>" * num_image_tokens, 1)
88
+ prompt_strings.append(sample)
89
+ prompt_strings = [sample.replace("<placeholder>", self.image_token) for sample in prompt_strings]
90
+
91
+ text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
92
+
93
+ if "num_tiles" in image_inputs:
94
+ del image_inputs["num_tiles"]
95
+ return BatchFeature(data={**text_inputs, **image_inputs})
96
+
97
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
98
+ def batch_decode(self, *args, **kwargs):
99
+ """
100
+ This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
101
+ refer to the docstring of this method for more information.
102
+ """
103
+ return self.tokenizer.batch_decode(*args, **kwargs)
104
+
105
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
106
+ def decode(self, *args, **kwargs):
107
+ """
108
+ This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
109
+ the docstring of this method for more information.
110
+ """
111
+ return self.tokenizer.decode(*args, **kwargs)
112
+
113
+ @property
114
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
115
+ def model_input_names(self):
116
+ tokenizer_input_names = self.tokenizer.model_input_names
117
+ image_processor_input_names = self.image_processor.model_input_names
118
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
119
+
120
+
121
+ AutoProcessor.register(AX4VLConfig, AX4VLProcessor)
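The key step in AX4VLProcessor.__call__ above is the placeholder expansion: every occurrence of the image token in the prompt is replaced by num_tiles * num_tokens_per_tile copies of itself, so the tokenized sequence reserves exactly one position per projected vision embedding (which forward() later fills via masked_scatter). A standalone illustration with hypothetical values:

image_token = "<|extra_id_11|>"        # value taken from processor_config.json
num_tokens_per_tile = 144
num_tiles = 5                          # hypothetical: 4 tiles + 1 thumbnail for one image

text = f"{image_token}\nWhat is shown here?"
expanded = text.replace(image_token, image_token * (num_tiles * num_tokens_per_tile), 1)
print(expanded.count(image_token))     # 720 placeholder tokens for this image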
processor_config.json ADDED
@@ -0,0 +1,9 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_ax4vl.AX4VLProcessor"
4
+ },
5
+ "image_token": "<|extra_id_11|>",
6
+ "num_tokens_per_tile": 144,
7
+ "patch_size": 16,
8
+ "processor_class": "AX4VLProcessor"
9
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,90 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|endoftext|>",
4
+ "<|pad|>",
5
+ "<|unk|>",
6
+ "<|sep|>",
7
+ "<|mask|>",
8
+ "<|cls|>",
9
+ "<|image|>",
10
+ "<|audio|>",
11
+ "<|user|>",
12
+ "<|system|>",
13
+ "<|assistant|>",
14
+ "<|extra_id_0|>",
15
+ "<|extra_id_1|>",
16
+ "<|extra_id_2|>",
17
+ "<|extra_id_3|>",
18
+ "<|extra_id_4|>",
19
+ "<|extra_id_5|>",
20
+ "<|extra_id_6|>",
21
+ "<|extra_id_7|>",
22
+ "<|extra_id_8|>",
23
+ "<|extra_id_9|>",
24
+ "<|extra_id_10|>",
25
+ "<|extra_id_11|>",
26
+ "<|extra_id_12|>",
27
+ "<|extra_id_13|>",
28
+ "<|im_start|>",
29
+ "<|im_sep|>",
30
+ "<|im_end|>",
31
+ "<|resident_reg|>",
32
+ "<|foreigner_reg|>",
33
+ "<|business_reg|>",
34
+ "<|credit_card|>",
35
+ "<|passport|>",
36
+ "<|driver_license|>",
37
+ "<|telephone|>",
38
+ "<|health_insurance|>",
39
+ "<|bank_account|>"
40
+ ],
41
+ "bos_token": {
42
+ "content": "<|endoftext|>",
43
+ "lstrip": false,
44
+ "normalized": false,
45
+ "rstrip": false,
46
+ "single_word": false
47
+ },
48
+ "cls_token": {
49
+ "content": "<|cls|>",
50
+ "lstrip": false,
51
+ "normalized": false,
52
+ "rstrip": false,
53
+ "single_word": false
54
+ },
55
+ "eos_token": {
56
+ "content": "<|im_end|>",
57
+ "lstrip": false,
58
+ "normalized": false,
59
+ "rstrip": false,
60
+ "single_word": false
61
+ },
62
+ "mask_token": {
63
+ "content": "<|mask|>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": false,
67
+ "single_word": false
68
+ },
69
+ "pad_token": {
70
+ "content": "<|pad|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false
75
+ },
76
+ "sep_token": {
77
+ "content": "<|sep|>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false
82
+ },
83
+ "unk_token": {
84
+ "content": "<|unk|>",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false
89
+ }
90
+ }
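
These entries pin the role markers, what appear to be PII placeholder tokens (resident_reg, credit_card, and so on), and the modality tokens as special tokens, so the BPE never splits them. A quick check, assuming the tokenizer files in this repository load through AutoTokenizer (the repository id is taken from this commit):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("skt/A.X-4.0-VL-Light", trust_remote_code=True)

for token in ("<|im_start|>", "<|assistant|>", "<|extra_id_11|>", "<|im_end|>"):
    ids = tokenizer.encode(token, add_special_tokens=False)
    print(token, ids)  # each special token should map to exactly one id

print(tokenizer.eos_token)  # "<|im_end|>" per this file
print(tokenizer.pad_token)  # "<|pad|>"
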
tokenizer_config.json ADDED
@@ -0,0 +1,395 @@
+ {
+   "add_bos_token": false,
+   "add_eos_token": false,
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<|pad|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "<|unk|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<|sep|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "<|mask|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "5": {
+       "content": "<|cls|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "6": {
+       "content": "<|image|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "7": {
+       "content": "<|audio|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "8": {
+       "content": "<|user|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "9": {
+       "content": "<|system|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "10": {
+       "content": "<|assistant|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "11": {
+       "content": "<|extra_id_0|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "12": {
+       "content": "<|extra_id_1|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "13": {
+       "content": "<|extra_id_2|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "14": {
+       "content": "<|extra_id_3|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "15": {
+       "content": "<|extra_id_4|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "16": {
+       "content": "<|extra_id_5|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "17": {
+       "content": "<|extra_id_6|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "18": {
+       "content": "<|extra_id_7|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "19": {
+       "content": "<|extra_id_8|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "20": {
+       "content": "<|extra_id_9|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "21": {
+       "content": "<|extra_id_10|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "22": {
+       "content": "<|extra_id_11|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "23": {
+       "content": "<|extra_id_12|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "24": {
+       "content": "<|extra_id_13|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "25": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "26": {
+       "content": "<|im_sep|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "27": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "28": {
+       "content": "<|resident_reg|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "29": {
+       "content": "<|foreigner_reg|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "30": {
+       "content": "<|business_reg|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "31": {
+       "content": "<|credit_card|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "32": {
+       "content": "<|passport|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "33": {
+       "content": "<|driver_license|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "34": {
+       "content": "<|telephone|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "35": {
+       "content": "<|health_insurance|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "36": {
+       "content": "<|bank_account|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "37": {
+       "content": "</tool_output>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "38": {
+       "content": "<tool_output>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "39": {
+       "content": "</tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "40": {
+       "content": "<tool_call>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     }
+   },
+   "additional_special_tokens": [
+     "<|endoftext|>",
+     "<|pad|>",
+     "<|unk|>",
+     "<|sep|>",
+     "<|mask|>",
+     "<|cls|>",
+     "<|image|>",
+     "<|audio|>",
+     "<|user|>",
+     "<|system|>",
+     "<|assistant|>",
+     "<|extra_id_0|>",
+     "<|extra_id_1|>",
+     "<|extra_id_2|>",
+     "<|extra_id_3|>",
+     "<|extra_id_4|>",
+     "<|extra_id_5|>",
+     "<|extra_id_6|>",
+     "<|extra_id_7|>",
+     "<|extra_id_8|>",
+     "<|extra_id_9|>",
+     "<|extra_id_10|>",
+     "<|extra_id_11|>",
+     "<|extra_id_12|>",
+     "<|extra_id_13|>",
+     "<|im_start|>",
+     "<|im_sep|>",
+     "<|im_end|>",
+     "<|resident_reg|>",
+     "<|foreigner_reg|>",
+     "<|business_reg|>",
+     "<|credit_card|>",
+     "<|passport|>",
+     "<|driver_license|>",
+     "<|telephone|>",
+     "<|health_insurance|>",
+     "<|bank_account|>"
+   ],
+   "bos_token": "<|endoftext|>",
+ "chat_template": "{%- if tools is iterable and tools | length > 0 %}\n {{- '<|im_start|><|system|>'}}\n {{- '당신은 도구 호출 기능을 갖춘 유용한 도우미입니다. 사용자의 요청을 처리하기 위해서 필요한 도구가 주어진 목록에 있는 경우 도구 호출로 응답하세요.\n필요한 도구가 목록에 없는 경우에는 도구 호출 없이 사용자가 요구한 정보를 제공하세요.\n필요한 도구가 목록에 있지만 해당 도구를 호출하는데 필요한 argument 정보가 부족한 경우 해당 정보를 사용자에게 요청하세요.\n사용자의 요청을 처리하기 위해 여러번 도구를 호출할 수 있어야 합니다.\n도구 호출 이후 도구 실행 결과를 입력으로 받으면 해당 결과를 활용하여 답변을 생성하세요.\n\n다음은 접근할 수 있는 도구들의 목록 입니다:\n<tools>\n'}}\n {%- for t in tools %}\n {{- t | tojson }}\n {{- '\n' }}\n {%- endfor %}\n {{- '</tools>' }}\n {{- '\n\n도구를 호출하려면 아래의 JSON으로 응답하세요.\n도구 호출 형식: <tool_call>{\"name\": 도구 이름, \"arguments\": dictionary 형태의 도구 인자값}</tool_call>' }}\n \n {%- if messages[0].role == 'system' %}\n {{- '\n\n' + messages[0].content}}\n {% set dummy = messages.pop(0) %}\n {%- endif %} \n {{- '<|im_end|>' }}\n {%- endif %}\n \n {%- for message in messages %}\n {%- if message.role == 'system' %}\n {{- '<|im_start|><|system|>' + message.content + '<|im_end|>'}}\n {%- elif message.role == 'user' %}\n {{- '<|im_start|><|user|>' + message.content + '<|im_end|>'}}\n {%- elif message.role == 'assistant' %}\n {{- '<|im_start|><|assistant|>'}}\n {%- if message.content is defined %}\n {{- message.content}}\n {%- endif %}\n {%- if message.tool_calls is defined %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>' }}\n {{- '{' }}\n {{- '\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\"' }}\n {%- if tool_call.arguments is defined %}\n {{- ', ' }}\n {{- '\"arguments\": ' }}\n {{- tool_call.arguments|tojson }}\n {%- endif %}\n {{- '}' }}\n {{- '</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>'}}\n \n {%- elif message.role == 'tool' %}\n {{- '<|im_start|><|extra_id_13|><tool_output>' + message.content + '</tool_output><|im_end|>'}}\n {%- endif %}\n {%- endfor %}\n {%- if add_generation_prompt %}\n {{- '<|im_start|><|assistant|>'}}\n {%- endif %}",
+   "clean_up_tokenization_spaces": true,
+   "auto_map": {
+     "AutoProcessor": "processing_ax4vl.AX4VLProcessor"
+   },
+   "cls_token": "<|cls|>",
+   "eod_token": "<|endoftext|>",
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "extra_special_tokens": {},
+   "mask_token": "<|mask|>",
+   "model_max_length": 8192,
+   "pad_token": "<|pad|>",
+   "padding_side": "right",
+   "processor_class": "AX4VLProcessor",
+   "sep_token": "<|sep|>",
+   "tokenizer_class": "GPT2Tokenizer",
+   "truncation_side": "left",
+   "unk_token": "<|unk|>",
+   "vocab_size": 102400
+ }
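
The chat_template above wraps each turn in <|im_start|>/<|im_end|>, injects a Korean tool-calling system prompt whenever a tools list is supplied, and serializes tool calls inside <tool_call>...</tool_call> and tool results inside <tool_output>...</tool_output>. A sketch of rendering such a prompt, assuming a transformers version whose apply_chat_template forwards a tools argument; the tool schema and message are invented for illustration:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("skt/A.X-4.0-VL-Light", trust_remote_code=True)

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",
        "description": "Return the current weather for a city",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

messages = [{"role": "user", "content": "What's the weather in Seoul?"}]
prompt = tokenizer.apply_chat_template(
    messages, tools=tools, add_generation_prompt=True, tokenize=False
)
print(prompt)  # begins with "<|im_start|><|system|>" and embeds the tool list between <tools> and </tools>
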
vocab.json ADDED
The diff for this file is too large to render. See raw diff