Snapmap committed on
Commit
5f16a95
·
verified ·
1 Parent(s): f941328

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. LLM/Florence-2-base/.gitattributes +36 -0
  2. LLM/Florence-2-base/CODE_OF_CONDUCT.md +9 -0
  3. LLM/Florence-2-base/LICENSE +21 -0
  4. LLM/Florence-2-base/README.md +259 -0
  5. LLM/Florence-2-base/SECURITY.md +41 -0
  6. LLM/Florence-2-base/SUPPORT.md +25 -0
  7. LLM/Florence-2-base/config.json +85 -0
  8. LLM/Florence-2-base/configuration_florence2.py +340 -0
  9. LLM/Florence-2-base/modeling_florence2.py +0 -0
  10. LLM/Florence-2-base/preprocessor_config.json +39 -0
  11. LLM/Florence-2-base/processing_florence2.py +1148 -0
  12. LLM/Florence-2-base/tokenizer.json +0 -0
  13. LLM/Florence-2-base/tokenizer_config.json +4 -0
  14. LLM/Florence-2-base/vocab.json +0 -0
  15. LLM/Qwen-VL/Qwen3-VL-2B-Thinking-FP8/config.json +319 -0
  16. LLM/Qwen-VL/Qwen3-VL-2B-Thinking-FP8/model.safetensors.index.json +827 -0
  17. LLM/Qwen-VL/Qwen3-VL-2B-Thinking-FP8/preprocessor_config.json +21 -0
  18. LLM/Qwen-VL/Qwen3-VL-2B-Thinking-FP8/tokenizer.json +0 -0
  19. LLM/Qwen-VL/Qwen3-VL-2B-Thinking-FP8/tokenizer_config.json +239 -0
  20. LLM/Qwen-VL/Qwen3-VL-2B-Thinking-FP8/video_preprocessor_config.json +21 -0
  21. LLM/Qwen-VL/Qwen3-VL-2B-Thinking-FP8/vocab.json +0 -0
  22. LLM/Qwen3-VL-4B-Instruct-abliterated-FP8/.gitattributes +36 -0
  23. LLM/Qwen3-VL-4B-Instruct-abliterated-FP8/README.md +117 -0
  24. LLM/Qwen3-VL-4B-Instruct-abliterated-FP8/added_tokens.json +28 -0
  25. LLM/Qwen3-VL-4B-Instruct-abliterated-FP8/chat_template.jinja +120 -0
  26. LLM/Qwen3-VL-4B-Instruct-abliterated-FP8/config.json +320 -0
  27. LLM/Qwen3-VL-4B-Instruct-abliterated-FP8/generation_config.json +13 -0
  28. LLM/Qwen3-VL-4B-Instruct-abliterated-FP8/merges.txt +0 -0
  29. LLM/Qwen3-VL-4B-Instruct-abliterated-FP8/model.safetensors.index.json +973 -0
  30. LLM/Qwen3-VL-4B-Instruct-abliterated-FP8/preprocessor_config.json +39 -0
  31. LLM/Qwen3-VL-4B-Instruct-abliterated-FP8/special_tokens_map.json +31 -0
  32. LLM/Qwen3-VL-4B-Instruct-abliterated-FP8/tokenizer_config.json +240 -0
  33. LLM/Qwen3-VL-4B-Instruct-abliterated-FP8/video_preprocessor_config.json +41 -0
  34. LLM/Qwen3-VL-4B-Instruct-abliterated-FP8/vocab.json +0 -0
  35. RMBG/RMBG-2.0/BiRefNet_config.py +11 -0
  36. RMBG/RMBG-2.0/__pycache__/BiRefNet_config.cpython-312.pyc +0 -0
  37. RMBG/RMBG-2.0/birefnet.py +2244 -0
  38. RMBG/RMBG-2.0/config.json +20 -0
  39. audio_encoders/put_audio_encoder_models_here +0 -0
  40. checkpoints/ComfyUI-Frame-Interpolation +0 -0
  41. checkpoints/Qwen-Rapid-AIO-NSFW-v18.metadata.json +25 -0
  42. checkpoints/SUPIR-v0F.metadata.json +25 -0
  43. checkpoints/analogMadnessSDXL_xl5.metadata.json +25 -0
  44. checkpoints/epicrealismXL_pureFix.metadata.json +25 -0
  45. checkpoints/gonzalomoXLFluxPony_v30FluxDAIO.metadata.json +25 -0
  46. checkpoints/illustriousRealismBy_v10.metadata.json +25 -0
  47. checkpoints/illustriousRealismBy_v10VAE.metadata.json +25 -0
  48. checkpoints/illustriousRealismBy_v10VAE.safetensors +0 -0
  49. checkpoints/intorealismUltra_v10.metadata.json +25 -0
  50. checkpoints/juggernautXL_ragnarokBy.metadata.json +25 -0
LLM/Florence-2-base/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *bin filter=lfs diff=lfs merge=lfs -text
LLM/Florence-2-base/CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # Microsoft Open Source Code of Conduct
2
+
3
+ This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4
+
5
+ Resources:
6
+
7
+ - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8
+ - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9
+ - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
LLM/Florence-2-base/LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) Microsoft Corporation.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE
LLM/Florence-2-base/README.md ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ license_link: https://huggingface.co/microsoft/Florence-2-base/resolve/main/LICENSE
4
+ pipeline_tag: image-text-to-text
5
+ tags:
6
+ - vision
7
+ ---
8
+
9
+ # Florence-2: Advancing a Unified Representation for a Variety of Vision Tasks
10
+
11
+ ## Model Summary
12
+
13
+ This Hub repository contains the HuggingFace `transformers` implementation of the Florence-2 model from Microsoft.
14
+
15
+ Florence-2 is an advanced vision foundation model that uses a prompt-based approach to handle a wide range of vision and vision-language tasks. Florence-2 can interpret simple text prompts to perform tasks like captioning, object detection, and segmentation. It leverages our FLD-5B dataset, containing 5.4 billion annotations across 126 million images, to master multi-task learning. The model's sequence-to-sequence architecture enables it to excel in both zero-shot and fine-tuned settings, proving to be a competitive vision foundation model.
16
+
17
+ Resources and Technical Documentation:
18
+ + [Florence-2 technical report](https://arxiv.org/abs/2311.06242).
19
+ + [Jupyter Notebook for inference and visualization of Florence-2-large model](https://huggingface.co/microsoft/Florence-2-large/blob/main/sample_inference.ipynb)
20
+
21
+ | Model | Model size | Model Description |
22
+ | ------- | ------------- | ------------- |
23
+ | Florence-2-base[[HF]](https://huggingface.co/microsoft/Florence-2-base) | 0.23B | Pretrained model with FLD-5B
24
+ | Florence-2-large[[HF]](https://huggingface.co/microsoft/Florence-2-large) | 0.77B | Pretrained model with FLD-5B
25
+ | Florence-2-base-ft[[HF]](https://huggingface.co/microsoft/Florence-2-base-ft) | 0.23B | Finetuned model on a collection of downstream tasks
26
+ | Florence-2-large-ft[[HF]](https://huggingface.co/microsoft/Florence-2-large-ft) | 0.77B | Finetuned model on a collection of downstream tasks
27
+
28
+ ## How to Get Started with the Model
29
+
30
+ Use the code below to get started with the model. All models are trained with float16.
31
+
32
+ ```python
33
+ import requests
34
+ import torch
35
+ from PIL import Image
36
+ from transformers import AutoProcessor, AutoModelForCausalLM
37
+
38
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
39
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
40
+
41
+ model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-base", torch_dtype=torch_dtype, trust_remote_code=True).to(device)
42
+ processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)
43
+
44
+ prompt = "<OD>"
45
+
46
+ url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
47
+ image = Image.open(requests.get(url, stream=True).raw)
48
+
49
+ inputs = processor(text=prompt, images=image, return_tensors="pt").to(device, torch_dtype)
50
+
51
+ generated_ids = model.generate(
52
+ input_ids=inputs["input_ids"],
53
+ pixel_values=inputs["pixel_values"],
54
+ max_new_tokens=1024,
55
+ do_sample=False,
56
+ num_beams=3,
57
+ )
58
+ generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
59
+
60
+ parsed_answer = processor.post_process_generation(generated_text, task="<OD>", image_size=(image.width, image.height))
61
+
62
+ print(parsed_answer)
63
+
64
+ ```
65
+
66
+
67
+ ## Tasks
68
+
69
+ This model is capable of performing different tasks through changing the prompts.
70
+
71
+ First, let's define a function to run a prompt.
72
+
73
+ <details>
74
+ <summary> Click to expand </summary>
75
+
76
+ ```python
77
+ import requests
78
+ import torch
79
+ from PIL import Image
80
+ from transformers import AutoProcessor, AutoModelForCausalLM
81
+
82
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
83
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
84
+
85
+ model = AutoModelForCausalLM.from_pretrained("microsoft/Florence-2-base", torch_dtype=torch_dtype, trust_remote_code=True).to(device)
86
+ processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base", trust_remote_code=True)
87
+
88
+ url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
89
+ image = Image.open(requests.get(url, stream=True).raw)
90
+
91
+ def run_example(task_prompt, text_input=None):
92
+ if text_input is None:
93
+ prompt = task_prompt
94
+ else:
95
+ prompt = task_prompt + text_input
96
+ inputs = processor(text=prompt, images=image, return_tensors="pt").to(device, torch_dtype)
97
+ generated_ids = model.generate(
98
+ input_ids=inputs["input_ids"],
99
+ pixel_values=inputs["pixel_values"],
100
+ max_new_tokens=1024,
101
+ num_beams=3
102
+ )
103
+ generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
104
+
105
+ parsed_answer = processor.post_process_generation(generated_text, task=task_prompt, image_size=(image.width, image.height))
106
+
107
+ print(parsed_answer)
108
+ ```
109
+ </details>
110
+
111
+ Here are the tasks `Florence-2` could perform:
112
+
113
+ <details>
114
+ <summary> Click to expand </summary>
115
+
116
+
117
+
118
+ ### Caption
119
+ ```python
120
+ prompt = "<CAPTION>"
121
+ run_example(prompt)
122
+ ```
123
+
124
+ ### Detailed Caption
125
+ ```python
126
+ prompt = "<DETAILED_CAPTION>"
127
+ run_example(prompt)
128
+ ```
129
+
130
+ ### More Detailed Caption
131
+ ```python
132
+ prompt = "<MORE_DETAILED_CAPTION>"
133
+ run_example(prompt)
134
+ ```
135
+
136
+ ### Caption to Phrase Grounding
137
+ The caption to phrase grounding task requires additional text input, i.e., a caption.
138
+
139
+ Caption to phrase grounding results format:
140
+ {'\<CAPTION_TO_PHRASE_GROUNDING>': {'bboxes': [[x1, y1, x2, y2], ...], 'labels': ['', '', ...]}}
141
+ ```python
142
+ task_prompt = "<CAPTION_TO_PHRASE_GROUNDING>"
143
+ results = run_example(task_prompt, text_input="A green car parked in front of a yellow building.")
144
+ ```
145
+
146
+ ### Object Detection
147
+
148
+ OD results format:
149
+ {'\<OD>': {'bboxes': [[x1, y1, x2, y2], ...],
150
+ 'labels': ['label1', 'label2', ...]} }
151
+
152
+ ```python
153
+ prompt = "<OD>"
154
+ run_example(prompt)
155
+ ```
156
+
157
+ ### Dense Region Caption
158
+ Dense region caption results format:
159
+ {'\<DENSE_REGION_CAPTION>' : {'bboxes': [[x1, y1, x2, y2], ...],
160
+ 'labels': ['label1', 'label2', ...]} }
161
+ ```python
162
+ prompt = "<DENSE_REGION_CAPTION>"
163
+ run_example(prompt)
164
+ ```
165
+
166
+ ### Region proposal
167
+ Region proposal results format:
168
+ {'\<REGION_PROPOSAL>': {'bboxes': [[x1, y1, x2, y2], ...],
169
+ 'labels': ['', '', ...]}}
170
+ ```python
171
+ prompt = "<REGION_PROPOSAL>"
172
+ run_example(prompt)
173
+ ```
174
+
175
+ ### OCR
176
+
177
+ ```python
178
+ prompt = "<OCR>"
179
+ run_example(prompt)
180
+ ```
181
+
182
+ ### OCR with Region
183
+ OCR with region output format:
184
+ {'\<OCR_WITH_REGION>': {'quad_boxes': [[x1, y1, x2, y2, x3, y3, x4, y4], ...], 'labels': ['text1', ...]}}
185
+ ```python
186
+ prompt = "<OCR_WITH_REGION>"
187
+ run_example(prompt)
188
+ ```
189
+
190
+ For more detailed examples, please refer to the [notebook](https://huggingface.co/microsoft/Florence-2-large/blob/main/sample_inference.ipynb).
191
+ </details>
192
+
193
+ # Benchmarks
194
+
195
+ ## Florence-2 Zero-shot performance
196
+
197
+ The following table presents the zero-shot performance of generalist vision foundation models on image captioning and object detection evaluation tasks. These models have not been exposed to the training data of the evaluation tasks during their training phase.
198
+
199
+ | Method | #params | COCO Cap. test CIDEr | NoCaps val CIDEr | TextCaps val CIDEr | COCO Det. val2017 mAP |
200
+ |--------|---------|----------------------|------------------|--------------------|-----------------------|
201
+ | Flamingo | 80B | 84.3 | - | - | - |
202
+ | Florence-2-base| 0.23B | 133.0 | 118.7 | 70.1 | 34.7 |
203
+ | Florence-2-large| 0.77B | 135.6 | 120.8 | 72.8 | 37.5 |
204
+
205
+
206
+ The following table continues the comparison with performance on other vision-language evaluation tasks.
207
+
208
+ | Method | Flickr30k test R@1 | Refcoco val Accuracy | Refcoco test-A Accuracy | Refcoco test-B Accuracy | Refcoco+ val Accuracy | Refcoco+ test-A Accuracy | Refcoco+ test-B Accuracy | Refcocog val Accuracy | Refcocog test Accuracy | Refcoco RES val mIoU |
209
+ |--------|----------------------|----------------------|-------------------------|-------------------------|-----------------------|--------------------------|--------------------------|-----------------------|------------------------|----------------------|
210
+ | Kosmos-2 | 78.7 | 52.3 | 57.4 | 47.3 | 45.5 | 50.7 | 42.2 | 60.6 | 61.7 | - |
211
+ | Florence-2-base | 83.6 | 53.9 | 58.4 | 49.7 | 51.5 | 56.4 | 47.9 | 66.3 | 65.1 | 34.6 |
212
+ | Florence-2-large | 84.4 | 56.3 | 61.6 | 51.4 | 53.6 | 57.9 | 49.9 | 68.0 | 67.0 | 35.8 |
213
+
214
+
215
+
216
+ ## Florence-2 finetuned performance
217
+
218
+ We finetune Florence-2 models with a collection of downstream tasks, resulting in two generalist models, *Florence-2-base-ft* and *Florence-2-large-ft*, that can conduct a wide range of downstream tasks.
219
+
220
+ The table below compares the performance of specialist and generalist models on various captioning and Visual Question Answering (VQA) tasks. Specialist models are fine-tuned specifically for each task, whereas generalist models are fine-tuned in a task-agnostic manner across all tasks. The symbol "▲" indicates the usage of external OCR as input.
221
+
222
+ | Method | # Params | COCO Caption Karpathy test CIDEr | NoCaps val CIDEr | TextCaps val CIDEr | VQAv2 test-dev Acc | TextVQA test-dev Acc | VizWiz VQA test-dev Acc |
223
+ |----------------|----------|-----------------------------------|------------------|--------------------|--------------------|----------------------|-------------------------|
224
+ | **Specialist Models** | | | | | | | |
225
+ | CoCa | 2.1B | 143.6 | 122.4 | - | 82.3 | - | - |
226
+ | BLIP-2 | 7.8B | 144.5 | 121.6 | - | 82.2 | - | - |
227
+ | GIT2 | 5.1B | 145.0 | 126.9 | 148.6 | 81.7 | 67.3 | 71.0 |
228
+ | Flamingo | 80B | 138.1 | - | - | 82.0 | 54.1 | 65.7 |
229
+ | PaLI | 17B | 149.1 | 127.0 | 160.0▲ | 84.3 | 58.8 / 73.1▲ | 71.6 / 74.4▲ |
230
+ | PaLI-X | 55B | 149.2 | 126.3 | 147.0 / 163.7▲ | 86.0 | 71.4 / 80.8▲ | 70.9 / 74.6▲ |
231
+ | **Generalist Models** | | | | | | | |
232
+ | Unified-IO | 2.9B | - | 100.0 | - | 77.9 | - | 57.4 |
233
+ | Florence-2-base-ft | 0.23B | 140.0 | 116.7 | 143.9 | 79.7 | 63.6 | 63.6 |
234
+ | Florence-2-large-ft | 0.77B | 143.3 | 124.9 | 151.1 | 81.7 | 73.5 | 72.6 |
235
+
236
+
237
+ | Method | # Params | COCO Det. val2017 mAP | Flickr30k test R@1 | RefCOCO val Accuracy | RefCOCO test-A Accuracy | RefCOCO test-B Accuracy | RefCOCO+ val Accuracy | RefCOCO+ test-A Accuracy | RefCOCO+ test-B Accuracy | RefCOCOg val Accuracy | RefCOCOg test Accuracy | RefCOCO RES val mIoU |
238
+ |----------------------|----------|-----------------------|--------------------|----------------------|-------------------------|-------------------------|------------------------|---------------------------|---------------------------|------------------------|-----------------------|------------------------|
239
+ | **Specialist Models** | | | | | | | | | | | | |
240
+ | SeqTR | - | - | - | 83.7 | 86.5 | 81.2 | 71.5 | 76.3 | 64.9 | 74.9 | 74.2 | - |
241
+ | PolyFormer | - | - | - | 90.4 | 92.9 | 87.2 | 85.0 | 89.8 | 78.0 | 85.8 | 85.9 | 76.9 |
242
+ | UNINEXT | 0.74B | 60.6 | - | 92.6 | 94.3 | 91.5 | 85.2 | 89.6 | 79.8 | 88.7 | 89.4 | - |
243
+ | Ferret | 13B | - | - | 89.5 | 92.4 | 84.4 | 82.8 | 88.1 | 75.2 | 85.8 | 86.3 | - |
244
+ | **Generalist Models** | | | | | | | | | | | | |
245
+ | UniTAB | - | - | - | 88.6 | 91.1 | 83.8 | 81.0 | 85.4 | 71.6 | 84.6 | 84.7 | - |
246
+ | Florence-2-base-ft | 0.23B | 41.4 | 84.0 | 92.6 | 94.8 | 91.5 | 86.8 | 91.7 | 82.2 | 89.8 | 82.2 | 78.0 |
247
+ | Florence-2-large-ft| 0.77B | 43.4 | 85.2 | 93.4 | 95.3 | 92.0 | 88.3 | 92.9 | 83.6 | 91.2 | 91.7 | 80.5 |
248
+
249
+
250
+ ## BibTex and citation info
251
+
252
+ ```
253
+ @article{xiao2023florence,
254
+ title={Florence-2: Advancing a unified representation for a variety of vision tasks},
255
+ author={Xiao, Bin and Wu, Haiping and Xu, Weijian and Dai, Xiyang and Hu, Houdong and Lu, Yumao and Zeng, Michael and Liu, Ce and Yuan, Lu},
256
+ journal={arXiv preprint arXiv:2311.06242},
257
+ year={2023}
258
+ }
259
+ ```
LLM/Florence-2-base/SECURITY.md ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!-- BEGIN MICROSOFT SECURITY.MD V0.0.9 BLOCK -->
2
+
3
+ ## Security
4
+
5
+ Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).
6
+
7
+ If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.
8
+
9
+ ## Reporting Security Issues
10
+
11
+ **Please do not report security vulnerabilities through public GitHub issues.**
12
+
13
+ Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).
14
+
15
+ If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).
16
+
17
+ You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
18
+
19
+ Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20
+
21
+ * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22
+ * Full paths of source file(s) related to the manifestation of the issue
23
+ * The location of the affected source code (tag/branch/commit or direct URL)
24
+ * Any special configuration required to reproduce the issue
25
+ * Step-by-step instructions to reproduce the issue
26
+ * Proof-of-concept or exploit code (if possible)
27
+ * Impact of the issue, including how an attacker might exploit the issue
28
+
29
+ This information will help us triage your report more quickly.
30
+
31
+ If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
32
+
33
+ ## Preferred Languages
34
+
35
+ We prefer all communications to be in English.
36
+
37
+ ## Policy
38
+
39
+ Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).
40
+
41
+ <!-- END MICROSOFT SECURITY.MD BLOCK -->
LLM/Florence-2-base/SUPPORT.md ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # TODO: The maintainer of this repo has not yet edited this file
2
+
3
+ **REPO OWNER**: Do you want Customer Service & Support (CSS) support for this product/project?
4
+
5
+ - **No CSS support:** Fill out this template with information about how to file issues and get help.
6
+ - **Yes CSS support:** Fill out an intake form at [aka.ms/onboardsupport](https://aka.ms/onboardsupport). CSS will work with/help you to determine next steps.
7
+ - **Not sure?** Fill out an intake as though the answer were "Yes". CSS will help you decide.
8
+
9
+ *Then remove this first heading from this SUPPORT.MD file before publishing your repo.*
10
+
11
+ # Support
12
+
13
+ ## How to file issues and get help
14
+
15
+ This project uses GitHub Issues to track bugs and feature requests. Please search the existing
16
+ issues before filing new issues to avoid duplicates. For new issues, file your bug or
17
+ feature request as a new Issue.
18
+
19
+ For help and questions about using this project, please **REPO MAINTAINER: INSERT INSTRUCTIONS HERE
20
+ FOR HOW TO ENGAGE REPO OWNERS OR COMMUNITY FOR HELP. COULD BE A STACK OVERFLOW TAG OR OTHER
21
+ CHANNEL. WHERE WILL YOU HELP PEOPLE?**.
22
+
23
+ ## Microsoft Support Policy
24
+
25
+ Support for this **PROJECT or PRODUCT** is limited to the resources listed above.
LLM/Florence-2-base/config.json ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "florence2",
3
+ "architectures": [
4
+ "Florence2ForConditionalGeneration"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_florence2.Florence2Config",
8
+ "AutoModelForCausalLM": "modeling_florence2.Florence2ForConditionalGeneration"
9
+ },
10
+ "bos_token_id": 0,
11
+ "eos_token_id": 2,
12
+ "ignore_index": -100,
13
+ "model_type": "florence2",
14
+ "pad_token_id": 1,
15
+ "projection_dim": 768,
16
+ "text_config": {
17
+ "vocab_size": 51289,
18
+ "activation_dropout": 0.1,
19
+ "activation_function": "gelu",
20
+ "add_bias_logits": false,
21
+ "add_final_layer_norm": false,
22
+ "attention_dropout": 0.1,
23
+ "bos_token_id": 0,
24
+ "classif_dropout": 0.1,
25
+ "classifier_dropout": 0.0,
26
+ "d_model": 768,
27
+ "decoder_attention_heads": 12,
28
+ "decoder_ffn_dim": 3072,
29
+ "decoder_layerdrop": 0.0,
30
+ "decoder_layers": 6,
31
+ "decoder_start_token_id": 2,
32
+ "dropout": 0.1,
33
+ "early_stopping": true,
34
+ "encoder_attention_heads": 12,
35
+ "encoder_ffn_dim": 3072,
36
+ "encoder_layerdrop": 0.0,
37
+ "encoder_layers": 6,
38
+ "eos_token_id": 2,
39
+ "forced_eos_token_id": 2,
40
+ "forced_bos_token_id": 0,
41
+ "gradient_checkpointing": false,
42
+ "init_std": 0.02,
43
+ "is_encoder_decoder": true,
44
+ "label2id": {
45
+ "LABEL_0": 0,
46
+ "LABEL_1": 1,
47
+ "LABEL_2": 2
48
+ },
49
+ "max_position_embeddings": 1024,
50
+ "no_repeat_ngram_size": 3,
51
+ "normalize_before": false,
52
+ "num_hidden_layers": 6,
53
+ "pad_token_id": 1,
54
+ "scale_embedding": false,
55
+ "num_beams": 3
56
+ },
57
+ "vision_config": {
58
+ "model_type": "davit",
59
+ "drop_path_rate": 0.1,
60
+ "patch_size": [7, 3, 3, 3],
61
+ "patch_stride": [4, 2, 2, 2],
62
+ "patch_padding": [3, 1, 1, 1],
63
+ "patch_prenorm": [false, true, true, true],
64
+ "enable_checkpoint": false,
65
+ "dim_embed": [128, 256, 512, 1024],
66
+ "num_heads": [4, 8, 16, 32],
67
+ "num_groups": [4, 8, 16, 32],
68
+ "depths": [1, 1, 9, 1],
69
+ "window_size": 12,
70
+ "projection_dim": 768,
71
+ "visual_temporal_embedding": {
72
+ "type": "COSINE",
73
+ "max_temporal_embeddings": 100
74
+ },
75
+ "image_pos_embed": {
76
+ "type": "learned_abs_2d",
77
+ "max_pos_embeddings": 50
78
+ },
79
+ "image_feature_source": ["spatial_avg_pool", "temporal_avg_pool"]
80
+ },
81
+ "vocab_size": 51289,
82
+ "torch_dtype": "float16",
83
+ "transformers_version": "4.41.0.dev0",
84
+ "is_encoder_decoder": true
85
+ }
LLM/Florence-2-base/configuration_florence2.py ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import warnings
15
+ """ Florence-2 configuration"""
16
+
17
+ from typing import Optional
18
+
19
+ from transformers import AutoConfig
20
+ from transformers.configuration_utils import PretrainedConfig
21
+ from transformers.utils import logging
22
+
23
+ logger = logging.get_logger(__name__)
24
+
25
+ class Florence2VisionConfig(PretrainedConfig):
26
+ r"""
27
+ This is the configuration class to store the configuration of a [`Florence2VisionModel`]. It is used to instantiate a Florence2VisionModel
28
+ according to the specified arguments, defining the model architecture. Instantiating a configuration with the
29
+ defaults will yield a similar configuration to that of the Florence2VisionModel architecture.
30
+
31
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
32
+ documentation from [`PretrainedConfig`] for more information.
33
+
34
+ Args:
35
+ drop_path_rate (`float`, *optional*, defaults to 0.1):
36
+ The dropout rate of the drop path layer.
37
+ patch_size (`List[int]`, *optional*, defaults to [7, 3, 3, 3]):
38
+ The patch size of the image.
39
+ patch_stride (`List[int]`, *optional*, defaults to [4, 2, 2, 2]):
40
+ The patch stride of the image.
41
+ patch_padding (`List[int]`, *optional*, defaults to [3, 1, 1, 1]):
42
+ The patch padding of the image.
43
+ patch_prenorm (`List[bool]`, *optional*, defaults to [false, true, true, true]):
44
+ Whether to apply layer normalization before the patch embedding layer.
45
+ enable_checkpoint (`bool`, *optional*, defaults to False):
46
+ Whether to enable checkpointing.
47
+ dim_embed (`List[int]`, *optional*, defaults to [256, 512, 1024, 2048]):
48
+ The dimension of the embedding layer.
49
+ num_heads (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
50
+ The number of attention heads.
51
+ num_groups (`List[int]`, *optional*, defaults to [8, 16, 32, 64]):
52
+ The number of groups.
53
+ depths (`List[int]`, *optional*, defaults to [1, 1, 9, 1]):
54
+ The depth of the model.
55
+ window_size (`int`, *optional*, defaults to 12):
56
+ The window size of the model.
57
+ projection_dim (`int`, *optional*, defaults to 1024):
58
+ The dimension of the projection layer.
59
+ visual_temporal_embedding (`dict`, *optional*):
60
+ The configuration of the visual temporal embedding.
61
+ image_pos_embed (`dict`, *optional*):
62
+ The configuration of the image position embedding.
63
+ image_feature_source (`List[str]`, *optional*, defaults to ["spatial_avg_pool", "temporal_avg_pool"]):
64
+ The source of the image feature.
65
+ Example:
66
+
67
+ ```python
68
+ >>> from transformers import Florence2VisionConfig, Florence2VisionModel
69
+
70
+ >>> # Initializing a Florence2 Vision style configuration
71
+ >>> configuration = Florence2VisionConfig()
72
+
73
+ >>> # Initializing a model (with random weights)
74
+ >>> model = Florence2VisionModel(configuration)
75
+
76
+ >>> # Accessing the model configuration
77
+ >>> configuration = model.config
78
+ ```"""
79
+
80
+ model_type = "davit"
81
+ keys_to_ignore_at_inference = ["past_key_values"]
82
+
83
+ def __init__(
84
+ self,
85
+ drop_path_rate=0.1,
86
+ patch_size=[7, 3, 3, 3],
87
+ patch_stride=[4, 2, 2, 2],
88
+ patch_padding=[3, 1, 1, 1],
89
+ patch_prenorm=[False, True, True, True],
90
+ enable_checkpoint=False,
91
+ dim_embed=[256, 512, 1024, 2048],
92
+ num_heads=[8, 16, 32, 64],
93
+ num_groups=[8, 16, 32, 64],
94
+ depths=[1, 1, 9, 1],
95
+ window_size=12,
96
+ projection_dim=1024,
97
+ visual_temporal_embedding=None,
98
+ image_pos_embed=None,
99
+ image_feature_source=["spatial_avg_pool", "temporal_avg_pool"],
100
+ **kwargs,
101
+ ):
102
+ self.drop_path_rate = drop_path_rate
103
+ self.patch_size = patch_size
104
+ self.patch_stride = patch_stride
105
+ self.patch_padding = patch_padding
106
+ self.patch_prenorm = patch_prenorm
107
+ self.enable_checkpoint = enable_checkpoint
108
+ self.dim_embed = dim_embed
109
+ self.num_heads = num_heads
110
+ self.num_groups = num_groups
111
+ self.depths = depths
112
+ self.window_size = window_size
113
+ self.projection_dim = projection_dim
114
+ self.visual_temporal_embedding = visual_temporal_embedding
115
+ self.image_pos_embed = image_pos_embed
116
+ self.image_feature_source = image_feature_source
117
+
118
+ super().__init__(**kwargs)
119
+
120
+
121
+
122
class Florence2LanguageConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Florence2LanguagePreTrainedModel`]. It is used to instantiate a BART
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the BART
    [facebook/bart-large](https://huggingface.co/facebook/bart-large) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 51289):
            Vocabulary size of the Florence2Language model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`Florence2LanguageModel`].
        d_model (`int`, *optional*, defaults to 1024):
            Dimensionality of the layers and the pooler layer.
        encoder_layers (`int`, *optional*, defaults to 12):
            Number of encoder layers.
        decoder_layers (`int`, *optional*, defaults to 12):
            Number of decoder layers.
        encoder_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer decoder.
        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"silu"` and `"gelu_new"` are supported.
        dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        activation_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for activations inside the fully connected layer.
        classifier_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for classifier.
        max_position_embeddings (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        init_std (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
            for more details.
        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
            for more details.
        scale_embedding (`bool`, *optional*, defaults to `False`):
            Scale embeddings by multiplying by sqrt(d_model).
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        num_labels (`int`, *optional*, defaults to 3):
            The number of labels to use in [`Florence2LanguageForSequenceClassification`].
        pad_token_id (`int`, *optional*, defaults to 1):
            Padding token id, forwarded to [`PretrainedConfig`].
        bos_token_id (`int`, *optional*, defaults to 0):
            Beginning-of-sequence token id, forwarded to [`PretrainedConfig`].
        eos_token_id (`int`, *optional*, defaults to 2):
            End-of-sequence token id, forwarded to [`PretrainedConfig`].
        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
            Forwarded to [`PretrainedConfig`].
        decoder_start_token_id (`int`, *optional*, defaults to 2):
            Forwarded to [`PretrainedConfig`].
        forced_eos_token_id (`int`, *optional*, defaults to 2):
            The id of the token to force as the last generated token when `max_length` is reached. Usually set to
            `eos_token_id`.

    Example:

    ```python
    >>> from transformers import Florence2LanguageConfig, Florence2LanguageModel

    >>> # Initializing a Florence2 Language style configuration
    >>> configuration = Florence2LanguageConfig()

    >>> # Initializing a model (with random weights)
    >>> model = Florence2LanguageModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "florence2_language"
    keys_to_ignore_at_inference = ["past_key_values"]
    # Lets generic code read `num_attention_heads` / `hidden_size` from this config.
    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}

    def __init__(
        self,
        vocab_size=51289,
        max_position_embeddings=1024,
        encoder_layers=12,
        encoder_ffn_dim=4096,
        encoder_attention_heads=16,
        decoder_layers=12,
        decoder_ffn_dim=4096,
        decoder_attention_heads=16,
        encoder_layerdrop=0.0,
        decoder_layerdrop=0.0,
        activation_function="gelu",
        d_model=1024,
        dropout=0.1,
        attention_dropout=0.0,
        activation_dropout=0.0,
        init_std=0.02,
        classifier_dropout=0.0,
        scale_embedding=False,
        use_cache=True,
        num_labels=3,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        is_encoder_decoder=True,
        decoder_start_token_id=2,
        forced_eos_token_id=2,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.d_model = d_model
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.decoder_ffn_dim = decoder_ffn_dim
        self.decoder_layers = decoder_layers
        self.decoder_attention_heads = decoder_attention_heads
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.activation_function = activation_function
        self.init_std = init_std
        self.encoder_layerdrop = encoder_layerdrop
        self.decoder_layerdrop = decoder_layerdrop
        self.classifier_dropout = classifier_dropout
        self.use_cache = use_cache
        # `num_hidden_layers` mirrors the encoder depth.
        self.num_hidden_layers = encoder_layers
        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True

        super().__init__(
            num_labels=num_labels,
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            decoder_start_token_id=decoder_start_token_id,
            forced_eos_token_id=forced_eos_token_id,
            **kwargs,
        )

        # ensure backward compatibility for BART CNN models
        # `getattr` is used because whether the base class defines `forced_bos_token_id`
        # is not visible here; a missing attribute is treated the same as `None`.
        if getattr(self, "forced_bos_token_id", None) is None and kwargs.get(
            "force_bos_token_to_be_generated", False
        ):
            self.forced_bos_token_id = self.bos_token_id
            warnings.warn(
                f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. "
                "The config can simply be saved and uploaded again to be fixed."
            )
271
+
272
class Florence2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Florence2ForConditionalGeneration`]. It is used to instantiate an
    Florence-2 model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vision_config (`Florence2VisionConfig` or `dict`, *optional*):
            Vision backbone configuration. A mapping is expanded into a [`Florence2VisionConfig`]; a config
            object is stored unchanged.
        text_config (`Florence2LanguageConfig` or `dict`, *optional*):
            Text backbone configuration. A mapping is expanded into a [`Florence2LanguageConfig`]; a config
            object is stored unchanged.
        ignore_index (`int`, *optional*, defaults to -100):
            The ignore index for the loss function.
        vocab_size (`int`, *optional*, defaults to 51289):
            Vocabulary size of the Florence2model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`~Florence2ForConditionalGeneration`]
        projection_dim (`int`, *optional*, defaults to 1024):
            Dimension of the multimodal projection space.

    Example:

    ```python
    >>> from transformers import Florence2ForConditionalGeneration, Florence2Config
    >>> from transformers import Florence2VisionConfig, Florence2LanguageConfig

    >>> # Initializing a Florence-2 vision config
    >>> vision_config = Florence2VisionConfig()

    >>> # Initializing a Florence-2 language config
    >>> text_config = Florence2LanguageConfig()

    >>> # Initializing a Florence-2 configuration
    >>> configuration = Florence2Config(vision_config, text_config)

    >>> # Initializing a model from the florence-2 configuration
    >>> model = Florence2ForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "florence2"
    is_composition = False

    def __init__(
        self,
        vision_config=None,
        text_config=None,
        ignore_index=-100,
        vocab_size=51289,
        projection_dim=1024,
        **kwargs,
    ):
        self.ignore_index = ignore_index
        self.vocab_size = vocab_size
        self.projection_dim = projection_dim

        # Mappings (e.g. the dict loaded from config.json) are expanded into the
        # sub-config class; an already-built config object cannot be `**`-unpacked,
        # so it is kept as-is.
        if vision_config is not None and not isinstance(vision_config, PretrainedConfig):
            vision_config = Florence2VisionConfig(**vision_config)
        self.vision_config = vision_config

        if text_config is not None and not isinstance(text_config, PretrainedConfig):
            text_config = Florence2LanguageConfig(**text_config)
        self.text_config = text_config

        super().__init__(**kwargs)
340
+
LLM/Florence-2-base/modeling_florence2.py ADDED
The diff for this file is too large to render. See raw diff
 
LLM/Florence-2-base/preprocessor_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_florence2.Florence2Processor"
4
+ },
5
+ "_valid_processor_keys": [
6
+ "images",
7
+ "do_resize",
8
+ "size",
9
+ "resample",
10
+ "do_rescale",
11
+ "rescale_factor",
12
+ "do_normalize",
13
+ "image_mean",
14
+ "image_std",
15
+ "return_tensors",
16
+ "data_format",
17
+ "input_data_format",
18
+ "do_convert_rgb"
19
+ ],
20
+ "do_convert_rgb": null,
21
+ "do_normalize": true,
22
+ "do_rescale": true,
23
+ "do_resize": true,
24
+ "do_center_crop": false,
25
+ "image_processor_type": "CLIPImageProcessor",
26
+ "image_seq_length": 577,
27
+ "image_mean": [0.485, 0.456, 0.406],
28
+ "image_std": [0.229, 0.224, 0.225],
29
+ "processor_class": "Florence2Processor",
30
+ "resample": 3,
31
+ "size": {
32
+ "height": 768,
33
+ "width":768
34
+ },
35
+ "crop_size": {
36
+ "height": 768,
37
+ "width": 768
38
+ }
39
+ }
LLM/Florence-2-base/processing_florence2.py ADDED
@@ -0,0 +1,1148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 Microsoft and The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Processor class for Florence-2.
17
+ """
18
+
19
+ import re
20
+ import logging
21
+ from typing import List, Optional, Union
22
+ import numpy as np
23
+ import math
24
+
25
+ import torch
26
+
27
+ from transformers.feature_extraction_utils import BatchFeature
28
+ from transformers.image_utils import ImageInput, is_valid_image
29
+ from transformers.processing_utils import ProcessorMixin
30
+ from transformers.tokenization_utils_base import (
31
+ PaddingStrategy,
32
+ PreTokenizedInput,
33
+ TextInput,
34
+ TruncationStrategy,
35
+ )
36
+ from transformers import BartTokenizer, BartTokenizerFast
37
+ from transformers.utils import TensorType
38
+
39
+
40
+ logger = logging.getLogger(__name__)
41
+
42
+ # Copied from transformers.models.idefics2.processing_idefics2.is_url
43
def is_url(val) -> bool:
    """Return True when `val` is a string that starts with "http"."""
    if not isinstance(val, str):
        return False
    return val.startswith("http")
45
+
46
+ # Copied from transformers.models.idefics2.processing_idefics2.is_image_or_image_url
47
def is_image_or_image_url(elem):
    """Return a truthy value when `elem` is an "http" URL string or passes `is_valid_image`."""
    if is_url(elem):
        return True
    return is_valid_image(elem)
49
+
50
+
51
def _is_str_or_image(elem):
    """Return a truthy value when `elem` is a string, an image URL, or a valid image."""
    if isinstance(elem, str):
        return True
    return is_image_or_image_url(elem)
53
+
54
+
55
class Florence2Processor(ProcessorMixin):
    r"""
    Constructs a Florence2 processor which wraps a Florence2 image processor and a Florence2 tokenizer into a single processor.

    [`Florence2Processor`] offers all the functionalities of [`CLIPImageProcessor`] and [`BartTokenizerFast`]. See the
    [`~Florence2Processor.__call__`] and [`~Florence2Processor.decode`] for more information.

    Args:
        image_processor ([`CLIPImageProcessor`], *optional*):
            The image processor is a required input. It must expose an `image_seq_length` attribute.
        tokenizer ([`BartTokenizerFast`], *optional*):
            The tokenizer is a required input.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "CLIPImageProcessor"
    tokenizer_class = ("BartTokenizer", "BartTokenizerFast")

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
    ):
        """
        Validate the two components, register the Florence-2 special tokens on the tokenizer and
        build the task lookup tables.

        Raises:
            ValueError: if `image_processor` or `tokenizer` is `None`, or if the image processor has
                no `image_seq_length` attribute.
        """
        if image_processor is None:
            raise ValueError("You need to specify an `image_processor`.")
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")
        if not hasattr(image_processor, "image_seq_length"):
            raise ValueError("Image processor is missing an `image_seq_length` attribute.")

        self.image_seq_length = image_processor.image_seq_length

        # Task tags, 1000 location tokens (<loc_0> .. <loc_999>) and region/caption tags are
        # appended to whatever special tokens the tokenizer already carries.
        tokens_to_add = {
            'additional_special_tokens': (
                tokenizer.additional_special_tokens
                + ['<od>', '</od>', '<ocr>', '</ocr>']
                + [f'<loc_{x}>' for x in range(1000)]
                + ['<cap>', '</cap>', '<ncap>', '</ncap>','<dcap>', '</dcap>', '<grounding>', '</grounding>', '<seg>', '</seg>', '<sep>', '<region_cap>', '</region_cap>', '<region_to_desciption>', '</region_to_desciption>', '<proposal>', '</proposal>', '<poly>', '</poly>', '<and>']
            )
        }
        tokenizer.add_special_tokens(tokens_to_add)

        # Task token -> name of the post-processing routine used by `post_process_generation`.
        self.tasks_answer_post_processing_type = {
            '<OCR>': 'pure_text',
            '<OCR_WITH_REGION>': 'ocr',
            '<CAPTION>': 'pure_text',
            '<DETAILED_CAPTION>': 'pure_text',
            '<MORE_DETAILED_CAPTION>': 'pure_text',
            '<OD>': 'description_with_bboxes',
            '<DENSE_REGION_CAPTION>': 'description_with_bboxes',
            '<CAPTION_TO_PHRASE_GROUNDING>': "phrase_grounding",
            '<REFERRING_EXPRESSION_SEGMENTATION>': 'polygons',
            '<REGION_TO_SEGMENTATION>': 'polygons',
            '<OPEN_VOCABULARY_DETECTION>': 'description_with_bboxes_or_polygons',
            '<REGION_TO_CATEGORY>': 'pure_text',
            '<REGION_TO_DESCRIPTION>': 'pure_text',
            '<REGION_TO_OCR>': 'pure_text',
            '<REGION_PROPOSAL>': 'bboxes'
        }

        # Task tokens that must appear alone; the whole text is replaced by the fixed prompt.
        self.task_prompts_without_inputs = {
            '<OCR>': 'What is the text in the image?',
            '<OCR_WITH_REGION>': 'What is the text in the image, with regions?',
            '<CAPTION>': 'What does the image describe?',
            '<DETAILED_CAPTION>': 'Describe in detail what is shown in the image.',
            '<MORE_DETAILED_CAPTION>': 'Describe with a paragraph what is shown in the image.',
            '<OD>': 'Locate the objects with category name in the image.',
            '<DENSE_REGION_CAPTION>': 'Locate the objects in the image, with their descriptions.',
            '<REGION_PROPOSAL>': 'Locate the region proposals in the image.'
        }

        # Task tokens followed by user input; the remainder of the text fills `{input}`.
        self.task_prompts_with_input = {
            '<CAPTION_TO_PHRASE_GROUNDING>': "Locate the phrases in the caption: {input}",
            '<REFERRING_EXPRESSION_SEGMENTATION>': 'Locate {input} in the image with mask',
            '<REGION_TO_SEGMENTATION>': 'What is the polygon mask of region {input}',
            '<OPEN_VOCABULARY_DETECTION>': 'Locate {input} in the image.',
            '<REGION_TO_CATEGORY>': 'What is the region {input}?',
            '<REGION_TO_DESCRIPTION>': 'What does the region {input} describe?',
            '<REGION_TO_OCR>': 'What text is in the region {input}?',
        }

        self.post_processor = Florence2PostProcesser(tokenizer=tokenizer)

        super().__init__(image_processor, tokenizer)

    def _construct_prompts(self, text):
        """
        Replace task tokens in each entry of `text` with the corresponding natural-language prompt.

        Args:
            text (`List[str]`): one prompt per sample.

        Returns:
            `List[str]`: the prompts with task tokens expanded; entries without a known task token
            are returned unchanged.
        """
        # replace the task tokens with the task prompts if task token is in the text
        prompts = []
        for _text in text:
            # 1. fixed task prompts without additional inputs
            for task_token, task_prompt in self.task_prompts_without_inputs.items():
                if task_token in _text:
                    assert _text == task_token, f"Task token {task_token} should be the only token in the text."
                    _text = task_prompt
                    break
            # 2. task prompts with additional inputs
            for task_token, task_prompt in self.task_prompts_with_input.items():
                if task_token in _text:
                    _text = task_prompt.format(input=_text.replace(task_token, ''))
                    break
            prompts.append(_text)
        return prompts

    def __call__(
        self,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        images: ImageInput = None,
        tokenize_newline_separately: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length=None,
        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
        do_resize: bool = None,
        do_normalize: bool = None,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        data_format: Optional["ChannelDimension"] = "channels_first",  # noqa: F821
        input_data_format: Optional[
            Union[str, "ChannelDimension"]  # noqa: F821
        ] = None,
        resample: "PILImageResampling" = None,  # noqa: F821
        do_convert_rgb: bool = None,
        do_thumbnail: bool = None,
        do_align_long_axis: bool = None,
        do_rescale: bool = None,
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to BartTokenizerFast's [`~BartTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

        Args:
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
                When `None`, a warning is logged and an empty prompt is used.
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                number of channels, H and W are image height and width.
            tokenize_newline_separately (`bool`, defaults to `True`):
                Accepted for interface compatibility; not used by this method.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:
                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence if provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above). `image_seq_length`
                is subtracted from it before tokenization.
            truncation (`bool`, *optional*):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.
            do_resize, do_normalize, do_rescale, image_mean, image_std, data_format, input_data_format, resample, do_convert_rgb:
                Forwarded unchanged to the image processor.
            do_thumbnail, do_align_long_axis:
                Accepted for interface compatibility; not used by this method.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`).
            - **pixel_values** -- Pixel values to be fed to a model.

        Raises:
            ValueError: if `images` is `None`, or if fewer images than prompts are supplied as lists.
        """
        if images is None:
            raise ValueError("`images` are expected as arguments to a `Florence2Processor` instance.")
        if text is None:
            # `logger` is a standard-library logger, which has no `warning_once` method.
            logger.warning(
                "You are using Florence-2 without a text prompt."
            )
            text = ""

        if isinstance(text, list) and isinstance(images, list):
            if len(images) < len(text):
                raise ValueError(
                    f"Received {len(images)} images for {len(text)} prompts. Each prompt should be associated with an image."
                )
        # A single prompt is wrapped so the rest of the method always works on a batch.
        if _is_str_or_image(text):
            text = [text]

        pixel_values = self.image_processor(
            images,
            do_resize=do_resize,
            do_normalize=do_normalize,
            do_rescale=do_rescale,
            return_tensors=return_tensors,
            image_mean=image_mean,
            image_std=image_std,
            input_data_format=input_data_format,
            data_format=data_format,
            resample=resample,
            do_convert_rgb=do_convert_rgb,
        )["pixel_values"]

        if max_length is not None:
            max_length -= self.image_seq_length  # max_length has to account for the image tokens

        text = self._construct_prompts(text)

        inputs = self.tokenizer(
            text,
            return_tensors=return_tensors,
            padding=padding,
            max_length=max_length,
            truncation=truncation,
            return_token_type_ids=False,
        )

        return BatchFeature(data={**inputs, "pixel_values": pixel_values})

    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Florence2
    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Florence2
    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to BartTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->Florence2
    def model_input_names(self):
        """Tokenizer input names followed by image-processor input names, de-duplicated in order."""
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))

    def post_process_generation(self, text=None, sequence=None, transition_beam_score=None, task=None, image_size=None):
        """
        Post-process the output of the model to each of the task outputs.

        Args:
            text (`str`): The text to post-process.
            sequence: Forwarded unchanged to the post processor.
            transition_beam_score: Forwarded unchanged to the post processor.
            task (`str`): The task to post-process the text for. Unknown tasks are treated as `'pure_text'`.
            image_size (`Tuple[int, int]`): The size of the image. height x width.

        Returns:
            `dict`: a single-entry dict `{task: answer}`. `answer` is a string for text tasks and a dict
            of boxes/polygons/labels for the other tasks.

        Raises:
            ValueError: if the resolved post-processing type is not handled below.
        """

        task_answer_post_processing_type = self.tasks_answer_post_processing_type.get(task, 'pure_text')
        task_answer = self.post_processor(
            text=text,
            sequence=sequence,
            transition_beam_score=transition_beam_score,
            image_size=image_size,
            parse_tasks=task_answer_post_processing_type,
        )[task_answer_post_processing_type]

        if task_answer_post_processing_type == 'pure_text':
            final_answer = task_answer
            # remove the special tokens
            final_answer = final_answer.replace('<s>', '').replace('</s>', '')
        elif task_answer_post_processing_type in ['od', 'description_with_bboxes', 'bboxes']:
            od_instances = task_answer
            bboxes_od = [_od_instance['bbox'] for _od_instance in od_instances]
            labels_od = [str(_od_instance['cat_name']) for _od_instance in od_instances]
            final_answer = {'bboxes': bboxes_od, 'labels': labels_od}
            # Scores are only reported when the first instance carries one.
            if len(od_instances) and 'score' in od_instances[0]:
                scores_od = [_od_instance['score'] for _od_instance in od_instances]
                final_answer['scores'] = scores_od
        elif task_answer_post_processing_type in ['ocr']:
            bboxes = [_od_instance['quad_box'] for _od_instance in task_answer]
            labels = [str(_od_instance['text']) for _od_instance in task_answer]
            final_answer = {'quad_boxes': bboxes, 'labels': labels}
        elif task_answer_post_processing_type in ['phrase_grounding']:
            bboxes = []
            labels = []
            # One phrase may ground to several boxes; the label is repeated per box.
            for _grounded_phrase in task_answer:
                for _bbox in _grounded_phrase['bbox']:
                    bboxes.append(_bbox)
                    labels.append(_grounded_phrase['cat_name'])
            final_answer = {'bboxes': bboxes, 'labels': labels}
        elif task_answer_post_processing_type in ['description_with_polygons', 'polygons']:
            labels = []
            polygons = []
            for result in task_answer:
                label = result['cat_name']
                _polygons = result['polygons']
                labels.append(label)
                polygons.append(_polygons)
            final_answer = {'polygons': polygons, 'labels': labels}
        elif task_answer_post_processing_type in ['description_with_bboxes_or_polygons']:
            bboxes = []
            bboxes_labels = []
            polygons = []
            polygons_labels = []
            for result in task_answer:
                label = result['cat_name']
                if 'polygons' in result:
                    _polygons = result['polygons']
                    polygons.append(_polygons)
                    polygons_labels.append(label)
                else:
                    _bbox = result['bbox']
                    bboxes.append(_bbox)
                    bboxes_labels.append(label)
            final_answer = {'bboxes': bboxes, 'bboxes_labels': bboxes_labels, 'polygons': polygons, 'polygons_labels': polygons_labels}
        else:
            raise ValueError('Unknown task answer post processing type: {}'.format(task_answer_post_processing_type))

        final_answer = {
            task: final_answer}
        return final_answer
383
+
384
class BoxQuantizer(object):
    """
    Map (xmin, ymin, xmax, ymax) boxes between pixel coordinates and integer bin indices.

    `mode` selects the quantization scheme (only 'floor' is implemented) and `bins` is the
    (width, height) pair giving the number of bins along each axis.
    """

    def __init__(self, mode, bins):
        self.mode = mode
        self.bins = bins

    def quantize(self, boxes: torch.Tensor, size):
        """
        Convert pixel-space boxes into bin indices.

        `size` is the (width, height) of the original image. Each coordinate is divided by the
        per-bin size, floored and clamped to `[0, bins - 1]`; the result is cast to int.
        Raises NotImplementedError for mode 'round' and ValueError for any other unknown mode.
        """
        n_bins_x, n_bins_y = self.bins  # Quantization bins.
        img_w, img_h = size  # Original image size.
        step_x = img_w / n_bins_x
        step_y = img_h / n_bins_y
        x0, y0, x1, y1 = boxes.split(1, dim=-1)  # Shape: 4 * [N, 1].

        if self.mode == 'round':
            raise NotImplementedError()
        if self.mode != 'floor':
            raise ValueError('Incorrect quantization type.')

        per_coordinate = (
            (x0, step_x, n_bins_x),
            (y0, step_y, n_bins_y),
            (x1, step_x, n_bins_x),
            (y1, step_y, n_bins_y),
        )
        columns = tuple(
            (coord / step).floor().clamp(0, n_bins - 1)
            for coord, step, n_bins in per_coordinate
        )
        return torch.cat(columns, dim=-1).int()

    def dequantize(self, boxes: torch.Tensor, size):
        """
        Convert bin indices back into pixel-space coordinates.

        `size` is the (width, height) of the original image. Each index is shifted by 0.5 so the
        returned coordinate is the centre of its bin.
        Raises NotImplementedError for mode 'round' and ValueError for any other unknown mode.
        """
        n_bins_x, n_bins_y = self.bins  # Quantization bins.
        img_w, img_h = size  # Original image size.
        step_x = img_w / n_bins_x
        step_y = img_h / n_bins_y
        x0, y0, x1, y1 = boxes.split(1, dim=-1)  # Shape: 4 * [N, 1].

        if self.mode == 'round':
            raise NotImplementedError()
        if self.mode != 'floor':
            raise ValueError('Incorrect quantization type.')

        # Add 0.5 to use the center position of the bin as the coordinate.
        columns = tuple(
            (index + 0.5) * step
            for index, step in ((x0, step_x), (y0, step_y), (x1, step_x), (y1, step_y))
        )
        return torch.cat(columns, dim=-1)
444
+
445
+
446
class CoordinatesQuantizer(object):
    """
    Convert (x, y) points between pixel coordinates and integer bin indices.

    Points carry two values, (x, y), in their last dimension.

    Args:
        mode: Quantization scheme. Only 'floor' is implemented; 'round'
            raises NotImplementedError and anything else raises ValueError.
        bins: Pair (bins_w, bins_h) giving the number of bins along the
            width and the height.
    """

    def __init__(self, mode, bins):
        self.mode = mode
        self.bins = bins

    def quantize(self, coordinates: torch.Tensor, size):
        """
        Turn pixel-space points into bin indices.

        Args:
            coordinates: Tensor whose last dimension has size 2, holding (x, y).
            size: Pair (width, height) of the original image.

        Returns:
            Tensor of the same shape cast with `.int()`; every index is
            clamped to [0, bins - 1] on its axis.
        """
        num_w, num_h = self.bins      # Number of bins per axis.
        width, height = size          # Original image size.
        step_w = width / num_w
        step_h = height / num_h
        assert coordinates.shape[-1] == 2, 'coordinates should be shape (N, 2)'
        xs, ys = coordinates.split(1, dim=-1)  # Two slices of width 1 along the last dim.

        if self.mode == 'round':
            raise NotImplementedError()
        if self.mode != 'floor':
            raise ValueError('Incorrect quantization type.')

        binned_x = (xs / step_w).floor().clamp(0, num_w - 1)
        binned_y = (ys / step_h).floor().clamp(0, num_h - 1)

        return torch.cat((binned_x, binned_y), dim=-1).int()

    def dequantize(self, coordinates: torch.Tensor, size):
        """
        Turn bin indices back into pixel-space points.

        Args:
            coordinates: Tensor whose last dimension has size 2, holding
                quantized (x, y).
            size: Pair (width, height) of the original image.

        Returns:
            Tensor of the same shape holding the center coordinate of each bin.
        """
        num_w, num_h = self.bins      # Number of bins per axis.
        width, height = size          # Original image size.
        step_w = width / num_w
        step_h = height / num_h
        assert coordinates.shape[-1] == 2, 'coordinates should be shape (N, 2)'
        xs, ys = coordinates.split(1, dim=-1)  # Two slices of width 1 along the last dim.

        if self.mode == 'round':
            raise NotImplementedError()
        if self.mode != 'floor':
            raise ValueError('Incorrect quantization type.')

        # Shift by half a bin so the bin center is reported, not its lower edge.
        center_x = (xs + 0.5) * step_w
        center_y = (ys + 0.5) * step_h

        return torch.cat((center_x, center_y), dim=-1)
503
+
504
+
505
class Florence2PostProcesser(object):
    r"""
    Florence-2 post process for converting text prediction to various tasks results.

    Args:
        tokenizer: A tokenizer for decoding token ids to text spans. It is
            only needed when `__call__` is given `sequence` instead of `text`.

    The parsing configuration is not passed in; it is built by
    `_create_default_config`. It has this general shape:
        # common configs
        NUM_BBOX_HEIGHT_BINS: 1000
        NUM_BBOX_WIDTH_BINS: 1000
        COORDINATES_HEIGHT_BINS: 1000
        COORDINATES_WIDTH_BINS: 1000
        # task specific configs
        PARSE_TASKS:
            - TASK_NAME: 'od'
              PATTERN: r'([a-zA-Z0-9 ]+)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'
              SCORE_MODE: 'avg_loc_scores'
            - TASK_NAME: 'ocr'
              PATTERN: r'(.+?)<loc_(\d+)>...<loc_(\d+)>'
              AREA_THRESHOLD: 0.00

    Returns (from `__call__`):
        parsed_dict (dict): A dict of parsed results.
    """

    def __init__(
        self,
        tokenizer=None
    ):
        config = self._create_default_config()
        parse_tasks = []
        parse_task_configs = {}
        for task in config['PARSE_TASKS']:
            parse_tasks.append(task['TASK_NAME'])
            parse_task_configs[task['TASK_NAME']] = task

        self.config = config
        self.parse_tasks = parse_tasks
        self.parse_tasks_configs = parse_task_configs

        self.tokenizer = tokenizer
        # Always define the attribute so that later reads never fail with an
        # AttributeError when no tokenizer was supplied.
        if self.tokenizer is not None:
            self.all_special_tokens = set(self.tokenizer.all_special_tokens)
        else:
            self.all_special_tokens = set()

        self.init_quantizers()
        self.black_list_of_phrase_grounding = self._create_black_list_of_phrase_grounding()

    def _create_black_list_of_phrase_grounding(self):
        """
        Build the set of phrases that phrase grounding must ignore.

        Returns:
            A set of pronouns and generic phrases when the 'phrase_grounding'
            task is configured with a truthy FILTER_BY_BLACK_LIST; otherwise
            an empty set.
        """
        task_config = self.parse_tasks_configs.get('phrase_grounding', {})
        if 'phrase_grounding' not in self.parse_tasks or not task_config.get('FILTER_BY_BLACK_LIST', False):
            return set()

        return {
            'it', 'I', 'me', 'mine',
            'you', 'your', 'yours',
            'he', 'him', 'his',
            'she', 'her', 'hers',
            'they', 'them', 'their', 'theirs',
            'one', 'oneself',
            'we', 'us', 'our', 'ours',
            'its',
            'myself', 'yourself', 'himself', 'herself', 'itself',
            'ourselves', 'yourselves', 'themselves',
            'this', 'that',
            'these', 'those',
            'who', 'whom', 'whose', 'which', 'what',
            'all', 'another', 'any', 'anybody', 'anyone', 'anything',
            'each', 'everybody', 'everyone', 'everything',
            'few', 'many', 'nobody', 'none', 'several',
            'some', 'somebody', 'someone', 'something',
            'each other', 'one another',
            'the image', 'image', 'images', 'the', 'a', 'an', 'a group',
            'other objects', 'lots', 'a set',
        }

    def _create_default_config(self):
        """
        Return the built-in configuration: quantizer settings plus one entry
        per supported parse task.
        """
        config = {
            'NUM_BBOX_HEIGHT_BINS': 1000,
            'NUM_BBOX_WIDTH_BINS': 1000,
            'BOX_QUANTIZATION_MODE': 'floor',
            'COORDINATES_HEIGHT_BINS': 1000,
            'COORDINATES_WIDTH_BINS': 1000,
            'COORDINATES_QUANTIZATION_MODE': 'floor',
            'PARSE_TASKS': [
                {
                    'TASK_NAME': 'od',
                    # Phrase first, then four <loc_N> tokens. This is a raw
                    # string, so a single backslash is needed for \d; a doubled
                    # one would match a literal backslash.
                    'PATTERN': r'([a-zA-Z0-9 ]+)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>',
                    'SCORE_MODE': 'avg_loc_scores'
                },
                {
                    'TASK_NAME': 'ocr',
                    'PATTERN': r'(.+?)<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>',
                    'AREA_THRESHOLD': 0.00
                },
                {
                    'TASK_NAME': 'phrase_grounding',
                    'FILTER_BY_BLACK_LIST': True
                },
                {
                    'TASK_NAME': 'pure_text',
                },
                {
                    'TASK_NAME': 'description_with_bboxes',
                    'SCORE_MODE': 'avg_loc_scores'
                },
                {
                    'TASK_NAME': 'description_with_polygons',
                },
                {
                    'TASK_NAME': 'polygons',
                },
                {
                    'TASK_NAME': 'bboxes',
                },
                {
                    'TASK_NAME': 'description_with_bboxes_or_polygons',
                }
            ]
        }

        return config

    def init_quantizers(self):
        """
        Create `self.box_quantizer` and `self.coordinates_quantizer` from the config.

        The coordinates settings fall back to the box settings when the
        COORDINATES_* keys are absent.
        """
        # we have box_quantizer (od, grounding) and coordinates_quantizer (ocr, referring_segmentation)
        box_height_bins = self.config.get('NUM_BBOX_HEIGHT_BINS', 1000)
        box_width_bins = self.config.get('NUM_BBOX_WIDTH_BINS', 1000)
        box_mode = self.config.get('BOX_QUANTIZATION_MODE', 'floor')
        self.box_quantizer = BoxQuantizer(
            box_mode,
            (box_width_bins, box_height_bins),
        )

        coord_height_bins = self.config.get('COORDINATES_HEIGHT_BINS', box_height_bins)
        coord_width_bins = self.config.get('COORDINATES_WIDTH_BINS', box_width_bins)
        coord_mode = self.config.get('COORDINATES_QUANTIZATION_MODE', box_mode)
        self.coordinates_quantizer = CoordinatesQuantizer(
            coord_mode,
            (coord_width_bins, coord_height_bins),
        )

    def decode_with_spans(self, tokenizer, token_ids):
        """
        Decode token ids into text while recording each token's character span.

        Args:
            tokenizer: Must be a BartTokenizer or BartTokenizerFast.
            token_ids: List of token ids.

        Returns:
            (text, spans): the concatenated text and, for every token, its
            [start, end) character range inside that text.

        Raises:
            ValueError: if a non-special token is met and the tokenizer is
                not one of the supported Bart tokenizers.
        """
        filtered_tokens = tokenizer.convert_ids_to_tokens(
            token_ids, skip_special_tokens=False)
        assert len(filtered_tokens) == len(token_ids)

        sub_texts = []
        for token in filtered_tokens:
            if token in self.all_special_tokens:
                sub_texts.append(token)
            else:
                if isinstance(tokenizer, (BartTokenizer, BartTokenizerFast)):
                    sub_text = tokenizer.convert_tokens_to_string([token])
                else:
                    raise ValueError(f'type {type(tokenizer)} not supported')
                sub_texts.append(sub_text)

        text = ''
        spans = []
        for sub_text in sub_texts:
            span = (len(text), len(text) + len(sub_text))  # [start index, end index).
            text += sub_text
            spans.append(span)

        return text, spans

    def parse_od_from_text_and_spans(
        self,
        text,
        pattern,
        image_size,
        phrase_centric=False
    ):
        """
        Parse object-detection output with a five-group regex.

        Args:
            text: Model output text.
            pattern: Regex with five groups: four location bins and one phrase.
            image_size: (width, height).
            phrase_centric: When True the phrase is group 1 and the bins are
                groups 2-5; otherwise the bins are groups 1-4 and the phrase
                is group 5.

        Returns:
            List of dicts with 'bbox' (dequantized [xmin, ymin, xmax, ymax])
            and 'cat_name' (lower-cased, stripped phrase).
        """
        bin_groups = range(2, 6) if phrase_centric else range(1, 5)
        name_group = 1 if phrase_centric else 5

        instances = []
        for match in re.finditer(pattern, text):
            bbox_bins = [int(match.group(j)) for j in bin_groups]
            instances.append({
                'bbox': self.box_quantizer.dequantize(
                    boxes=torch.tensor(bbox_bins),
                    size=image_size
                ).tolist(),
                'cat_name': match.group(name_group).lower().strip(),
            })

        return instances

    def parse_ocr_from_text_and_spans(self,
                                      text,
                                      pattern,
                                      image_size,
                                      area_threshold=-1.0,
                                      ):
        """
        Parse OCR-with-region output into text lines with quadrilateral boxes.

        Args:
            text: Model output text.
            pattern: Regex whose first group is the text content and whose
                remaining groups are the location bins of the quad corners.
            image_size: (width, height).
            area_threshold: When > 0, quads whose area is smaller than this
                fraction of the image area are dropped.

        Returns:
            List of dicts with 'quad_box' (flat [x1, y1, x2, y2, ...]) and 'text'.
        """
        text = text.replace('<s>', '')
        # ocr with regions
        parsed = re.findall(pattern, text)
        instances = []
        image_width, image_height = image_size

        for ocr_line in parsed:
            ocr_content = ocr_line[0]
            quad_box = [int(i) for i in ocr_line[1:]]
            quad_box = self.coordinates_quantizer.dequantize(
                torch.tensor(np.array(quad_box).reshape(-1, 2)),
                size=image_size
            ).reshape(-1).tolist()

            if area_threshold > 0:
                x_coords = quad_box[0::2]
                y_coords = quad_box[1::2]
                num_points = len(x_coords)

                # Shoelace formula. The index wraps around so that the edge
                # from the last vertex back to the first one is included.
                area = 0.5 * abs(sum(
                    x_coords[i] * y_coords[(i + 1) % num_points]
                    - x_coords[(i + 1) % num_points] * y_coords[i]
                    for i in range(num_points)
                ))

                if area < (image_width * image_height) * area_threshold:
                    continue

            instances.append({
                'quad_box': quad_box,
                'text': ocr_content,
            })
        return instances

    def parse_phrase_grounding_from_text_and_spans(self, text, pattern, image_size):
        """
        Parse phrase-grounding output: each phrase followed by one or more boxes.

        Args:
            text: Model output text.
            pattern: Unused; the patterns are fixed inside the method. Kept
                for interface compatibility.
            image_size: (width, height).

        Returns:
            List of dicts with 'bbox' (a list of boxes for the phrase) and
            'cat_name' (the phrase with non-ASCII characters removed).
            Phrases found in the black list are skipped.
        """
        # ignore <s> </s> and <pad>
        text = text.replace('<s>', '')
        text = text.replace('</s>', '')
        text = text.replace('<pad>', '')

        phrases = re.findall(r"([^<]+(?:<loc_\d+>){4,})", text)

        # pattern should be text pattern and od pattern
        phrase_pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
        box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'

        instances = []
        for phrase_text in phrases:
            # Strip both markers; the second replace must build on the first.
            phrase_text_strip = phrase_text.replace('<ground>', '', 1).replace('<obj>', '', 1)

            if phrase_text_strip == '':
                continue

            # parse phrase, get string
            phrase = re.search(phrase_pattern, phrase_text_strip)
            if phrase is None:
                continue

            # parse bboxes by box_pattern
            bboxes_parsed = list(re.finditer(box_pattern, phrase_text))
            if len(bboxes_parsed) == 0:
                continue

            # remove leading and trailing spaces
            phrase = phrase.group().strip()

            if phrase in self.black_list_of_phrase_grounding:
                continue

            # a list of list
            bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]

            instances.append({
                'bbox': self.box_quantizer.dequantize(
                    boxes=torch.tensor(bbox_bins),
                    size=image_size
                ).tolist(),
                # exclude non-ascii characters
                'cat_name': phrase.encode('ascii', errors='ignore').decode('ascii'),
            })

        return instances

    def parse_description_with_bboxes_from_text_and_spans(
        self,
        text,
        spans=None,
        scores=None,
        score_mode=None,
        pattern=None,
        image_size=None,
        allow_empty_phrase=False
    ):
        """
        Parse descriptions followed by boxes, optionally attaching scores.

        Args:
            text: Model output text.
            spans: Per-token [start, end) character spans, or None.
            scores: Per-token scores aligned with `spans`, or None. They are
                averaged and passed through `math.exp`.
            score_mode: 'avg_loc_scores', 'avg_cat_name_scores' or None.
            pattern: Unused; the patterns are fixed inside the method. Kept
                for interface compatibility.
            image_size: (width, height).
            allow_empty_phrase: When True, runs of location tokens without a
                leading phrase are parsed too (their 'cat_name' is '').

        Returns:
            List of dicts, one per box, with 'bbox', 'cat_name' and, when
            scores are available, 'score'.

        Raises:
            ValueError: for an unknown `score_mode`.
        """
        def find_matched_token_indices(cur_span, token_spans):
            # Indices of all token spans that overlap cur_span.
            return [
                i for i, token_span in enumerate(token_spans)
                if not (token_span[1] <= cur_span[0] or token_span[0] >= cur_span[1])
            ]

        # Running character offset used to map phrase-local spans onto token spans.
        cur_span = 0
        if text.startswith('<s>'):
            cur_span += 3

        text = text.replace('<s>', '')
        text = text.replace('</s>', '')
        text = text.replace('<pad>', '')

        if allow_empty_phrase:
            phrases_pattern = r"(?:(?:<loc_\d+>){4,})"
        else:
            phrases_pattern = r"([^<]+(?:<loc_\d+>){4,})"
        phrases = re.findall(phrases_pattern, text)

        # pattern should be text pattern and od pattern
        phrase_pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_)'
        box_pattern = r'<loc_(\d+)><loc_(\d+)><loc_(\d+)><loc_(\d+)>'

        instances = []
        for phrase_text in phrases:
            # Strip both markers; the second replace must build on the first.
            phrase_text_strip = phrase_text.replace('<ground>', '', 1).replace('<obj>', '', 1)

            if phrase_text_strip == '' and not allow_empty_phrase:
                cur_span += len(phrase_text)
                continue

            # parse phrase, get string
            phrase = re.search(phrase_pattern, phrase_text_strip)
            if phrase is None:
                cur_span += len(phrase_text)
                continue

            phrase_span = phrase.span()
            # remove leading and trailing spaces
            phrase = phrase.group().strip()

            # parse bboxes by box_pattern
            bboxes_parsed = list(re.finditer(box_pattern, phrase_text))
            if len(bboxes_parsed) == 0:
                cur_span += len(phrase_text)
                continue

            # a list of list
            bbox_bins = [[int(_bboxes_parsed.group(j)) for j in range(1, 5)] for _bboxes_parsed in bboxes_parsed]

            bboxes = self.box_quantizer.dequantize(
                boxes=torch.tensor(bbox_bins),
                size=image_size
            ).tolist()

            if score_mode == 'avg_loc_scores':
                if spans is None or scores is None:
                    all_scores = None
                else:
                    # One score per box: mean over the tokens overlapping that box.
                    all_scores = []
                    for box_match in bboxes_parsed:
                        box_span = box_match.span(0)
                        token_inds = find_matched_token_indices(
                            (box_span[0] + cur_span, box_span[1] + cur_span), spans)
                        loc_scores = [scores[token_i] for token_i in token_inds]
                        all_scores.append(sum(loc_scores) / len(loc_scores))
            elif score_mode == 'avg_cat_name_scores':
                if spans is None or scores is None:
                    all_scores = None
                else:
                    # Same score for every box: mean over the phrase tokens.
                    cat_name_token_inds = find_matched_token_indices(
                        (phrase_span[0] + cur_span, phrase_span[1] + cur_span), spans)
                    cat_name_scores = [scores[token_i] for token_i in cat_name_token_inds]
                    score = sum(cat_name_scores) / len(cat_name_scores)
                    all_scores = [score] * len(bboxes)
            elif score_mode is None:
                all_scores = None
            else:
                raise ValueError('Unknown score mode: {}'.format(score_mode))

            # exclude non-ascii characters
            phrase = phrase.encode('ascii', errors='ignore').decode('ascii')
            for _idx, _bboxes in enumerate(bboxes):
                instance = {'bbox': _bboxes, 'cat_name': phrase}
                if all_scores is not None:
                    instance['score'] = math.exp(all_scores[_idx])
                instances.append(instance)

            cur_span += len(phrase_text)

        return instances

    def parse_description_with_polygons_from_text_and_spans(self, text, pattern, image_size,
                                                            allow_empty_phrase=False,
                                                            polygon_sep_token='<sep>',
                                                            polygon_start_token='<poly>',
                                                            polygon_end_token='</poly>',
                                                            with_box_at_start=False,
                                                            ):
        """
        Parse descriptions followed by polygons.

        Args:
            text: Model output text.
            pattern: Unused; the patterns are built inside the method. Kept
                for interface compatibility.
            image_size: (width, height).
            allow_empty_phrase: When True, polygon runs without a leading
                phrase are parsed too.
            polygon_sep_token: Token separating polygons of one instance.
            polygon_start_token: Token opening one polygons instance.
            polygon_end_token: Token closing one polygons instance.
            with_box_at_start: When True, the first four values of the first
                polygon of an instance are taken as its bounding box.

        Returns:
            List of dicts with 'cat_name', 'polygons' (list of flat
            [x1, y1, x2, y2, ...] lists) and, when a box was taken, 'bbox'.
        """
        # ref_seg format: '<expression><x1><y1><x2><y2><><><sep><><><><>'
        # ignore <s> </s> and <pad>

        text = text.replace('<s>', '')
        text = text.replace('</s>', '')
        text = text.replace('<pad>', '')

        if allow_empty_phrase:
            phrases_pattern = rf"(?:(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
        else:
            # [^<]+: This part matches one or more characters that are not the < symbol.
            # The ^ inside the square brackets [] is a negation, meaning it matches anything except <.
            phrases_pattern = rf"([^<]+(?:<loc_\d+>|{re.escape(polygon_sep_token)}|{re.escape(polygon_start_token)}|{re.escape(polygon_end_token)}){{4,}})"
        phrases = re.findall(phrases_pattern, text)

        phrase_string_pattern = r'^\s*(.*?)(?=<od>|</od>|<box>|</box>|<bbox>|</bbox>|<loc_|<poly>)'
        box_pattern = rf'((?:<loc_\d+>)+)(?:{re.escape(polygon_sep_token)}|$)'

        # one polygons instance is separated by polygon_start_token and polygon_end_token
        polygons_instance_pattern = rf'{re.escape(polygon_start_token)}(.*?){re.escape(polygon_end_token)}'

        instances = []
        for phrase_text in phrases:

            # exclude loc_\d+>
            # need to get span if want to include category score
            phrase_text_strip = re.sub(r'^loc_\d+>', '', phrase_text, count=1)

            if phrase_text_strip == '' and not allow_empty_phrase:
                continue

            # parse phrase, get string
            phrase = re.search(phrase_string_pattern, phrase_text_strip)
            if phrase is None:
                continue
            # remove leading and trailing spaces
            phrase = phrase.group().strip()

            # split by polygon_start_token and polygon_end_token first using polygons_instance_pattern
            if polygon_start_token in phrase_text and polygon_end_token in phrase_text:
                polygons_instances_parsed = list(re.finditer(polygons_instance_pattern, phrase_text))
            else:
                # No explicit delimiters: treat the whole phrase as one instance.
                polygons_instances_parsed = [phrase_text]

            for _polygons_instances_parsed in polygons_instances_parsed:
                if isinstance(_polygons_instances_parsed, str):
                    instance_text = _polygons_instances_parsed
                else:
                    instance_text = _polygons_instances_parsed.group(1)
                polygons_parsed = list(re.finditer(box_pattern, instance_text))
                if len(polygons_parsed) == 0:
                    continue

                # a list of list (polygon)
                bbox = []
                polygons = []
                for _polygon_parsed in polygons_parsed:
                    # group 1: the whole run of <loc_N> tokens of this polygon
                    _polygon = _polygon_parsed.group(1)
                    # parse into list of int
                    _polygon = [int(_loc_parsed.group(1)) for _loc_parsed in re.finditer(r'<loc_(\d+)>', _polygon)]
                    if with_box_at_start and len(bbox) == 0:
                        if len(_polygon) > 4:
                            bbox = _polygon[:4]
                            _polygon = _polygon[4:]
                        else:
                            # no valid bbox prediction
                            bbox = [0, 0, 0, 0]
                    # abandon last element if is not paired
                    if len(_polygon) % 2 == 1:
                        _polygon = _polygon[:-1]

                    # reshape into (n, 2), dequantize, then flatten back
                    _polygon = self.coordinates_quantizer.dequantize(
                        torch.tensor(np.array(_polygon).reshape(-1, 2)),
                        size=image_size
                    ).reshape(-1).tolist()
                    polygons.append(_polygon)

                instance = {'cat_name': phrase, 'polygons': polygons}
                if len(bbox) != 0:
                    instance['bbox'] = self.box_quantizer.dequantize(
                        boxes=torch.tensor([bbox]),
                        size=image_size
                    ).tolist()[0]

                instances.append(instance)

        return instances

    def __call__(
        self,
        text=None,
        sequence=None,
        transition_beam_score=None,
        image_size=None,
        parse_tasks=None,
    ):
        """
        Args:
            text: model outputs
            sequence: token ids of the model output; exactly one of `text`
                and `sequence` must be given. The first id is dropped before
                decoding (presumably the decoder start token; verify against
                the caller).
            transition_beam_score: per-token scores aligned with `sequence`
                after its first element is dropped; only used with `sequence`.
            image_size: (width, height)
            parse_tasks: a list of tasks to parse, if None, parse all tasks.

        Returns:
            dict with key 'text' plus one key per parsed task.
        """
        if parse_tasks is not None:
            if isinstance(parse_tasks, str):
                parse_tasks = [parse_tasks]
            for _parse_task in parse_tasks:
                assert _parse_task in self.parse_tasks, f'parse task {_parse_task} not supported'

        # sequence or text should be provided
        assert sequence is not None or text is not None, 'sequence or text should be provided'
        assert sequence is None or text is None, 'only one of sequence and text should be provided'

        if sequence is not None:
            sequence = sequence.tolist()[1:]
            text, spans = self.decode_with_spans(self.tokenizer, sequence)
            if transition_beam_score is not None:
                transition_beam_score = transition_beam_score.tolist()
                assert len(sequence) == len(transition_beam_score)
        else:
            spans = None
            transition_beam_score = None

        parsed_dict = {
            'text': text
        }

        for task in self.parse_tasks:
            if parse_tasks is not None and task not in parse_tasks:
                continue

            pattern = self.parse_tasks_configs[task].get('PATTERN', None)
            score_mode = self.parse_tasks_configs[task].get('SCORE_MODE', None)

            if task == 'od':
                # The configured OD pattern captures the phrase first and the
                # four location bins after it, i.e. the phrase-centric layout.
                instances = self.parse_od_from_text_and_spans(
                    text,
                    pattern=pattern,
                    image_size=image_size,
                    phrase_centric=True,
                )
                parsed_dict['od'] = instances
            elif task == 'ocr':
                instances = self.parse_ocr_from_text_and_spans(
                    text,
                    pattern=pattern,
                    image_size=image_size,
                    area_threshold=self.parse_tasks_configs[task].get('AREA_THRESHOLD', 0.0),
                )
                parsed_dict['ocr'] = instances
            elif task == 'phrase_grounding':
                instances = self.parse_phrase_grounding_from_text_and_spans(
                    text,
                    pattern=pattern,
                    image_size=image_size,
                )
                parsed_dict['phrase_grounding'] = instances
            elif task == 'pure_text':
                parsed_dict['pure_text'] = text
            elif task == 'description_with_bboxes':
                instances = self.parse_description_with_bboxes_from_text_and_spans(
                    text,
                    spans=spans,
                    scores=transition_beam_score,
                    score_mode=score_mode,
                    pattern=pattern,
                    image_size=image_size,
                )
                parsed_dict['description_with_bboxes'] = instances
            elif task == 'description_with_polygons':
                instances = self.parse_description_with_polygons_from_text_and_spans(
                    text,
                    pattern=pattern,
                    image_size=image_size,
                )
                parsed_dict['description_with_polygons'] = instances
            elif task == 'polygons':
                instances = self.parse_description_with_polygons_from_text_and_spans(
                    text,
                    pattern=pattern,
                    image_size=image_size,
                    allow_empty_phrase=True,
                )
                parsed_dict['polygons'] = instances
            elif task == 'bboxes':
                instances = self.parse_description_with_bboxes_from_text_and_spans(
                    text,
                    pattern=pattern,
                    image_size=image_size,
                    allow_empty_phrase=True,
                )
                parsed_dict['bboxes'] = instances
            elif task == 'description_with_bboxes_or_polygons':
                if '<poly>' in text:
                    # only support either polygons or bboxes, not both at the same time
                    instances = self.parse_description_with_polygons_from_text_and_spans(
                        text,
                        pattern=pattern,
                        image_size=image_size,
                    )
                else:
                    instances = self.parse_description_with_bboxes_from_text_and_spans(
                        text,
                        pattern=pattern,
                        image_size=image_size,
                    )
                parsed_dict['description_with_bboxes_or_polygons'] = instances
            else:
                raise ValueError("task {} is not supported".format(task))

        return parsed_dict
LLM/Florence-2-base/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
LLM/Florence-2-base/tokenizer_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "model_max_length": 1024
3
+ }
4
+
LLM/Florence-2-base/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
LLM/Qwen-VL/Qwen3-VL-2B-Thinking-FP8/config.json ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3VLForConditionalGeneration"
4
+ ],
5
+ "image_token_id": 151655,
6
+ "model_type": "qwen3_vl",
7
+ "text_config": {
8
+ "attention_bias": false,
9
+ "attention_dropout": 0.0,
10
+ "bos_token_id": 151643,
11
+ "dtype": "bfloat16",
12
+ "eos_token_id": 151645,
13
+ "head_dim": 128,
14
+ "hidden_act": "silu",
15
+ "hidden_size": 2048,
16
+ "initializer_range": 0.02,
17
+ "intermediate_size": 6144,
18
+ "max_position_embeddings": 262144,
19
+ "model_type": "qwen3_vl_text",
20
+ "num_attention_heads": 16,
21
+ "num_hidden_layers": 28,
22
+ "num_key_value_heads": 8,
23
+ "rms_norm_eps": 1e-06,
24
+ "rope_scaling": {
25
+ "mrope_interleaved": true,
26
+ "mrope_section": [
27
+ 24,
28
+ 20,
29
+ 20
30
+ ],
31
+ "rope_type": "default"
32
+ },
33
+ "rope_theta": 5000000,
34
+ "tie_word_embeddings": true,
35
+ "use_cache": true,
36
+ "vocab_size": 151936
37
+ },
38
+ "tie_word_embeddings": true,
39
+ "transformers_version": "4.57.0.dev0",
40
+ "video_token_id": 151656,
41
+ "vision_config": {
42
+ "deepstack_visual_indexes": [
43
+ 5,
44
+ 11,
45
+ 17
46
+ ],
47
+ "depth": 24,
48
+ "hidden_act": "gelu_pytorch_tanh",
49
+ "hidden_size": 1024,
50
+ "in_channels": 3,
51
+ "initializer_range": 0.02,
52
+ "intermediate_size": 4096,
53
+ "model_type": "qwen3_vl",
54
+ "num_heads": 16,
55
+ "num_position_embeddings": 2304,
56
+ "out_hidden_size": 2048,
57
+ "patch_size": 16,
58
+ "spatial_merge_size": 2,
59
+ "temporal_patch_size": 2
60
+ },
61
+ "vision_end_token_id": 151653,
62
+ "vision_start_token_id": 151652,
63
+ "quantization_config": {
64
+ "activation_scheme": "dynamic",
65
+ "fmt": "e4m3",
66
+ "quant_method": "fp8",
67
+ "ignored_layers": [
68
+ "lm_head",
69
+ "model.visual.merger.linear_fc1",
70
+ "model.visual.merger.linear_fc2",
71
+ "model.visual.merger.norm",
72
+ "model.visual.patch_embed.proj",
73
+ "model.visual.pos_embed",
74
+ "visual.merger.linear_fc1",
75
+ "visual.merger.linear_fc2",
76
+ "visual.merger.norm",
77
+ "visual.patch_embed.proj",
78
+ "visual.pos_embed",
79
+ "model.visual.blocks.0.attn.proj",
80
+ "model.visual.blocks.0.attn.qkv",
81
+ "model.visual.blocks.0.mlp.linear_fc1",
82
+ "model.visual.blocks.0.mlp.linear_fc2",
83
+ "visual.blocks.0.attn.proj",
84
+ "visual.blocks.0.attn.qkv_proj",
85
+ "visual.blocks.0.mlp.linear_fc1",
86
+ "visual.blocks.0.mlp.linear_fc2",
87
+ "model.visual.blocks.1.attn.proj",
88
+ "model.visual.blocks.1.attn.qkv",
89
+ "model.visual.blocks.1.mlp.linear_fc1",
90
+ "model.visual.blocks.1.mlp.linear_fc2",
91
+ "visual.blocks.1.attn.proj",
92
+ "visual.blocks.1.attn.qkv_proj",
93
+ "visual.blocks.1.mlp.linear_fc1",
94
+ "visual.blocks.1.mlp.linear_fc2",
95
+ "model.visual.blocks.2.attn.proj",
96
+ "model.visual.blocks.2.attn.qkv",
97
+ "model.visual.blocks.2.mlp.linear_fc1",
98
+ "model.visual.blocks.2.mlp.linear_fc2",
99
+ "visual.blocks.2.attn.proj",
100
+ "visual.blocks.2.attn.qkv_proj",
101
+ "visual.blocks.2.mlp.linear_fc1",
102
+ "visual.blocks.2.mlp.linear_fc2",
103
+ "model.visual.blocks.3.attn.proj",
104
+ "model.visual.blocks.3.attn.qkv",
105
+ "model.visual.blocks.3.mlp.linear_fc1",
106
+ "model.visual.blocks.3.mlp.linear_fc2",
107
+ "visual.blocks.3.attn.proj",
108
+ "visual.blocks.3.attn.qkv_proj",
109
+ "visual.blocks.3.mlp.linear_fc1",
110
+ "visual.blocks.3.mlp.linear_fc2",
111
+ "model.visual.blocks.4.attn.proj",
112
+ "model.visual.blocks.4.attn.qkv",
113
+ "model.visual.blocks.4.mlp.linear_fc1",
114
+ "model.visual.blocks.4.mlp.linear_fc2",
115
+ "visual.blocks.4.attn.proj",
116
+ "visual.blocks.4.attn.qkv_proj",
117
+ "visual.blocks.4.mlp.linear_fc1",
118
+ "visual.blocks.4.mlp.linear_fc2",
119
+ "model.visual.blocks.5.attn.proj",
120
+ "model.visual.blocks.5.attn.qkv",
121
+ "model.visual.blocks.5.mlp.linear_fc1",
122
+ "model.visual.blocks.5.mlp.linear_fc2",
123
+ "visual.blocks.5.attn.proj",
124
+ "visual.blocks.5.attn.qkv_proj",
125
+ "visual.blocks.5.mlp.linear_fc1",
126
+ "visual.blocks.5.mlp.linear_fc2",
127
+ "model.visual.blocks.6.attn.proj",
128
+ "model.visual.blocks.6.attn.qkv",
129
+ "model.visual.blocks.6.mlp.linear_fc1",
130
+ "model.visual.blocks.6.mlp.linear_fc2",
131
+ "visual.blocks.6.attn.proj",
132
+ "visual.blocks.6.attn.qkv_proj",
133
+ "visual.blocks.6.mlp.linear_fc1",
134
+ "visual.blocks.6.mlp.linear_fc2",
135
+ "model.visual.blocks.7.attn.proj",
136
+ "model.visual.blocks.7.attn.qkv",
137
+ "model.visual.blocks.7.mlp.linear_fc1",
138
+ "model.visual.blocks.7.mlp.linear_fc2",
139
+ "visual.blocks.7.attn.proj",
140
+ "visual.blocks.7.attn.qkv_proj",
141
+ "visual.blocks.7.mlp.linear_fc1",
142
+ "visual.blocks.7.mlp.linear_fc2",
143
+ "model.visual.blocks.8.attn.proj",
144
+ "model.visual.blocks.8.attn.qkv",
145
+ "model.visual.blocks.8.mlp.linear_fc1",
146
+ "model.visual.blocks.8.mlp.linear_fc2",
147
+ "visual.blocks.8.attn.proj",
148
+ "visual.blocks.8.attn.qkv_proj",
149
+ "visual.blocks.8.mlp.linear_fc1",
150
+ "visual.blocks.8.mlp.linear_fc2",
151
+ "model.visual.blocks.9.attn.proj",
152
+ "model.visual.blocks.9.attn.qkv",
153
+ "model.visual.blocks.9.mlp.linear_fc1",
154
+ "model.visual.blocks.9.mlp.linear_fc2",
155
+ "visual.blocks.9.attn.proj",
156
+ "visual.blocks.9.attn.qkv_proj",
157
+ "visual.blocks.9.mlp.linear_fc1",
158
+ "visual.blocks.9.mlp.linear_fc2",
159
+ "model.visual.blocks.10.attn.proj",
160
+ "model.visual.blocks.10.attn.qkv",
161
+ "model.visual.blocks.10.mlp.linear_fc1",
162
+ "model.visual.blocks.10.mlp.linear_fc2",
163
+ "visual.blocks.10.attn.proj",
164
+ "visual.blocks.10.attn.qkv_proj",
165
+ "visual.blocks.10.mlp.linear_fc1",
166
+ "visual.blocks.10.mlp.linear_fc2",
167
+ "model.visual.blocks.11.attn.proj",
168
+ "model.visual.blocks.11.attn.qkv",
169
+ "model.visual.blocks.11.mlp.linear_fc1",
170
+ "model.visual.blocks.11.mlp.linear_fc2",
171
+ "visual.blocks.11.attn.proj",
172
+ "visual.blocks.11.attn.qkv_proj",
173
+ "visual.blocks.11.mlp.linear_fc1",
174
+ "visual.blocks.11.mlp.linear_fc2",
175
+ "model.visual.blocks.12.attn.proj",
176
+ "model.visual.blocks.12.attn.qkv",
177
+ "model.visual.blocks.12.mlp.linear_fc1",
178
+ "model.visual.blocks.12.mlp.linear_fc2",
179
+ "visual.blocks.12.attn.proj",
180
+ "visual.blocks.12.attn.qkv_proj",
181
+ "visual.blocks.12.mlp.linear_fc1",
182
+ "visual.blocks.12.mlp.linear_fc2",
183
+ "model.visual.blocks.13.attn.proj",
184
+ "model.visual.blocks.13.attn.qkv",
185
+ "model.visual.blocks.13.mlp.linear_fc1",
186
+ "model.visual.blocks.13.mlp.linear_fc2",
187
+ "visual.blocks.13.attn.proj",
188
+ "visual.blocks.13.attn.qkv_proj",
189
+ "visual.blocks.13.mlp.linear_fc1",
190
+ "visual.blocks.13.mlp.linear_fc2",
191
+ "model.visual.blocks.14.attn.proj",
192
+ "model.visual.blocks.14.attn.qkv",
193
+ "model.visual.blocks.14.mlp.linear_fc1",
194
+ "model.visual.blocks.14.mlp.linear_fc2",
195
+ "visual.blocks.14.attn.proj",
196
+ "visual.blocks.14.attn.qkv_proj",
197
+ "visual.blocks.14.mlp.linear_fc1",
198
+ "visual.blocks.14.mlp.linear_fc2",
199
+ "model.visual.blocks.15.attn.proj",
200
+ "model.visual.blocks.15.attn.qkv",
201
+ "model.visual.blocks.15.mlp.linear_fc1",
202
+ "model.visual.blocks.15.mlp.linear_fc2",
203
+ "visual.blocks.15.attn.proj",
204
+ "visual.blocks.15.attn.qkv_proj",
205
+ "visual.blocks.15.mlp.linear_fc1",
206
+ "visual.blocks.15.mlp.linear_fc2",
207
+ "model.visual.blocks.16.attn.proj",
208
+ "model.visual.blocks.16.attn.qkv",
209
+ "model.visual.blocks.16.mlp.linear_fc1",
210
+ "model.visual.blocks.16.mlp.linear_fc2",
211
+ "visual.blocks.16.attn.proj",
212
+ "visual.blocks.16.attn.qkv_proj",
213
+ "visual.blocks.16.mlp.linear_fc1",
214
+ "visual.blocks.16.mlp.linear_fc2",
215
+ "model.visual.blocks.17.attn.proj",
216
+ "model.visual.blocks.17.attn.qkv",
217
+ "model.visual.blocks.17.mlp.linear_fc1",
218
+ "model.visual.blocks.17.mlp.linear_fc2",
219
+ "visual.blocks.17.attn.proj",
220
+ "visual.blocks.17.attn.qkv_proj",
221
+ "visual.blocks.17.mlp.linear_fc1",
222
+ "visual.blocks.17.mlp.linear_fc2",
223
+ "model.visual.blocks.18.attn.proj",
224
+ "model.visual.blocks.18.attn.qkv",
225
+ "model.visual.blocks.18.mlp.linear_fc1",
226
+ "model.visual.blocks.18.mlp.linear_fc2",
227
+ "visual.blocks.18.attn.proj",
228
+ "visual.blocks.18.attn.qkv_proj",
229
+ "visual.blocks.18.mlp.linear_fc1",
230
+ "visual.blocks.18.mlp.linear_fc2",
231
+ "model.visual.blocks.19.attn.proj",
232
+ "model.visual.blocks.19.attn.qkv",
233
+ "model.visual.blocks.19.mlp.linear_fc1",
234
+ "model.visual.blocks.19.mlp.linear_fc2",
235
+ "visual.blocks.19.attn.proj",
236
+ "visual.blocks.19.attn.qkv_proj",
237
+ "visual.blocks.19.mlp.linear_fc1",
238
+ "visual.blocks.19.mlp.linear_fc2",
239
+ "model.visual.blocks.20.attn.proj",
240
+ "model.visual.blocks.20.attn.qkv",
241
+ "model.visual.blocks.20.mlp.linear_fc1",
242
+ "model.visual.blocks.20.mlp.linear_fc2",
243
+ "visual.blocks.20.attn.proj",
244
+ "visual.blocks.20.attn.qkv_proj",
245
+ "visual.blocks.20.mlp.linear_fc1",
246
+ "visual.blocks.20.mlp.linear_fc2",
247
+ "model.visual.blocks.21.attn.proj",
248
+ "model.visual.blocks.21.attn.qkv",
249
+ "model.visual.blocks.21.mlp.linear_fc1",
250
+ "model.visual.blocks.21.mlp.linear_fc2",
251
+ "visual.blocks.21.attn.proj",
252
+ "visual.blocks.21.attn.qkv_proj",
253
+ "visual.blocks.21.mlp.linear_fc1",
254
+ "visual.blocks.21.mlp.linear_fc2",
255
+ "model.visual.blocks.22.attn.proj",
256
+ "model.visual.blocks.22.attn.qkv",
257
+ "model.visual.blocks.22.mlp.linear_fc1",
258
+ "model.visual.blocks.22.mlp.linear_fc2",
259
+ "visual.blocks.22.attn.proj",
260
+ "visual.blocks.22.attn.qkv_proj",
261
+ "visual.blocks.22.mlp.linear_fc1",
262
+ "visual.blocks.22.mlp.linear_fc2",
263
+ "model.visual.blocks.23.attn.proj",
264
+ "model.visual.blocks.23.attn.qkv",
265
+ "model.visual.blocks.23.mlp.linear_fc1",
266
+ "model.visual.blocks.23.mlp.linear_fc2",
267
+ "visual.blocks.23.attn.proj",
268
+ "visual.blocks.23.attn.qkv_proj",
269
+ "visual.blocks.23.mlp.linear_fc1",
270
+ "visual.blocks.23.mlp.linear_fc2",
271
+ "model.visual.blocks.24.attn.proj",
272
+ "model.visual.blocks.24.attn.qkv",
273
+ "model.visual.blocks.24.mlp.linear_fc1",
274
+ "model.visual.blocks.24.mlp.linear_fc2",
275
+ "visual.blocks.24.attn.proj",
276
+ "visual.blocks.24.attn.qkv_proj",
277
+ "visual.blocks.24.mlp.linear_fc1",
278
+ "visual.blocks.24.mlp.linear_fc2",
279
+ "model.visual.blocks.25.attn.proj",
280
+ "model.visual.blocks.25.attn.qkv",
281
+ "model.visual.blocks.25.mlp.linear_fc1",
282
+ "model.visual.blocks.25.mlp.linear_fc2",
283
+ "visual.blocks.25.attn.proj",
284
+ "visual.blocks.25.attn.qkv_proj",
285
+ "visual.blocks.25.mlp.linear_fc1",
286
+ "visual.blocks.25.mlp.linear_fc2",
287
+ "model.visual.blocks.26.attn.proj",
288
+ "model.visual.blocks.26.attn.qkv",
289
+ "model.visual.blocks.26.mlp.linear_fc1",
290
+ "model.visual.blocks.26.mlp.linear_fc2",
291
+ "visual.blocks.26.attn.proj",
292
+ "visual.blocks.26.attn.qkv_proj",
293
+ "visual.blocks.26.mlp.linear_fc1",
294
+ "visual.blocks.26.mlp.linear_fc2",
295
+ "model.visual.deepstack_merger_list.0.linear_fc1",
296
+ "model.visual.deepstack_merger_list.0.linear_fc2",
297
+ "model.visual.deepstack_merger_list.0.norm",
298
+ "visual.deepstack_merger_list.0.linear_fc1",
299
+ "visual.deepstack_merger_list.0.linear_fc2",
300
+ "visual.deepstack_merger_list.0.norm",
301
+ "model.visual.deepstack_merger_list.1.linear_fc1",
302
+ "model.visual.deepstack_merger_list.1.linear_fc2",
303
+ "model.visual.deepstack_merger_list.1.norm",
304
+ "visual.deepstack_merger_list.1.linear_fc1",
305
+ "visual.deepstack_merger_list.1.linear_fc2",
306
+ "visual.deepstack_merger_list.1.norm",
307
+ "model.visual.deepstack_merger_list.2.linear_fc1",
308
+ "model.visual.deepstack_merger_list.2.linear_fc2",
309
+ "model.visual.deepstack_merger_list.2.norm",
310
+ "visual.deepstack_merger_list.2.linear_fc1",
311
+ "visual.deepstack_merger_list.2.linear_fc2",
312
+ "visual.deepstack_merger_list.2.norm"
313
+ ],
314
+ "weight_block_size": [
315
+ 128,
316
+ 128
317
+ ]
318
+ }
319
+ }
LLM/Qwen-VL/Qwen3-VL-2B-Thinking-FP8/model.safetensors.index.json ADDED
@@ -0,0 +1,827 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {},
3
+ "weight_map": {
4
+ "model.language_model.layers.4.input_layernorm.weight": "model-00001-of-00001.safetensors",
5
+ "model.language_model.layers.4.mlp.down_proj.weight": "model-00001-of-00001.safetensors",
6
+ "model.language_model.layers.4.mlp.down_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
7
+ "model.language_model.layers.4.mlp.gate_proj.weight": "model-00001-of-00001.safetensors",
8
+ "model.language_model.layers.4.mlp.gate_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
9
+ "model.language_model.layers.4.mlp.up_proj.weight": "model-00001-of-00001.safetensors",
10
+ "model.language_model.layers.4.mlp.up_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
11
+ "model.language_model.layers.4.post_attention_layernorm.weight": "model-00001-of-00001.safetensors",
12
+ "model.language_model.layers.4.self_attn.k_norm.weight": "model-00001-of-00001.safetensors",
13
+ "model.language_model.layers.4.self_attn.k_proj.weight": "model-00001-of-00001.safetensors",
14
+ "model.language_model.layers.4.self_attn.k_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
15
+ "model.language_model.layers.4.self_attn.o_proj.weight": "model-00001-of-00001.safetensors",
16
+ "model.language_model.layers.4.self_attn.o_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
17
+ "model.language_model.layers.4.self_attn.q_norm.weight": "model-00001-of-00001.safetensors",
18
+ "model.language_model.layers.4.self_attn.q_proj.weight": "model-00001-of-00001.safetensors",
19
+ "model.language_model.layers.4.self_attn.q_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
20
+ "model.language_model.layers.4.self_attn.v_proj.weight": "model-00001-of-00001.safetensors",
21
+ "model.language_model.layers.4.self_attn.v_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
22
+ "model.language_model.layers.0.input_layernorm.weight": "model-00001-of-00001.safetensors",
23
+ "model.language_model.layers.0.mlp.down_proj.weight": "model-00001-of-00001.safetensors",
24
+ "model.language_model.layers.0.mlp.down_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
25
+ "model.language_model.layers.0.mlp.gate_proj.weight": "model-00001-of-00001.safetensors",
26
+ "model.language_model.layers.0.mlp.gate_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
27
+ "model.language_model.layers.0.mlp.up_proj.weight": "model-00001-of-00001.safetensors",
28
+ "model.language_model.layers.0.mlp.up_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
29
+ "model.language_model.layers.0.post_attention_layernorm.weight": "model-00001-of-00001.safetensors",
30
+ "model.language_model.layers.0.self_attn.k_norm.weight": "model-00001-of-00001.safetensors",
31
+ "model.language_model.layers.0.self_attn.k_proj.weight": "model-00001-of-00001.safetensors",
32
+ "model.language_model.layers.0.self_attn.k_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
33
+ "model.language_model.layers.0.self_attn.o_proj.weight": "model-00001-of-00001.safetensors",
34
+ "model.language_model.layers.0.self_attn.o_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
35
+ "model.language_model.layers.0.self_attn.q_norm.weight": "model-00001-of-00001.safetensors",
36
+ "model.language_model.layers.0.self_attn.q_proj.weight": "model-00001-of-00001.safetensors",
37
+ "model.language_model.layers.0.self_attn.q_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
38
+ "model.language_model.layers.0.self_attn.v_proj.weight": "model-00001-of-00001.safetensors",
39
+ "model.language_model.layers.0.self_attn.v_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
40
+ "model.language_model.layers.17.input_layernorm.weight": "model-00001-of-00001.safetensors",
41
+ "model.language_model.layers.17.mlp.down_proj.weight": "model-00001-of-00001.safetensors",
42
+ "model.language_model.layers.17.mlp.down_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
43
+ "model.language_model.layers.17.mlp.gate_proj.weight": "model-00001-of-00001.safetensors",
44
+ "model.language_model.layers.17.mlp.gate_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
45
+ "model.language_model.layers.17.mlp.up_proj.weight": "model-00001-of-00001.safetensors",
46
+ "model.language_model.layers.17.mlp.up_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
47
+ "model.language_model.layers.17.post_attention_layernorm.weight": "model-00001-of-00001.safetensors",
48
+ "model.language_model.layers.17.self_attn.k_norm.weight": "model-00001-of-00001.safetensors",
49
+ "model.language_model.layers.17.self_attn.k_proj.weight": "model-00001-of-00001.safetensors",
50
+ "model.language_model.layers.17.self_attn.k_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
51
+ "model.language_model.layers.17.self_attn.o_proj.weight": "model-00001-of-00001.safetensors",
52
+ "model.language_model.layers.17.self_attn.o_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
53
+ "model.language_model.layers.17.self_attn.q_norm.weight": "model-00001-of-00001.safetensors",
54
+ "model.language_model.layers.17.self_attn.q_proj.weight": "model-00001-of-00001.safetensors",
55
+ "model.language_model.layers.17.self_attn.q_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
56
+ "model.language_model.layers.17.self_attn.v_proj.weight": "model-00001-of-00001.safetensors",
57
+ "model.language_model.layers.17.self_attn.v_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
58
+ "model.language_model.layers.24.input_layernorm.weight": "model-00001-of-00001.safetensors",
59
+ "model.language_model.layers.24.mlp.down_proj.weight": "model-00001-of-00001.safetensors",
60
+ "model.language_model.layers.24.mlp.down_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
61
+ "model.language_model.layers.24.mlp.gate_proj.weight": "model-00001-of-00001.safetensors",
62
+ "model.language_model.layers.24.mlp.gate_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
63
+ "model.language_model.layers.24.mlp.up_proj.weight": "model-00001-of-00001.safetensors",
64
+ "model.language_model.layers.24.mlp.up_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
65
+ "model.language_model.layers.24.post_attention_layernorm.weight": "model-00001-of-00001.safetensors",
66
+ "model.language_model.layers.24.self_attn.k_norm.weight": "model-00001-of-00001.safetensors",
67
+ "model.language_model.layers.24.self_attn.k_proj.weight": "model-00001-of-00001.safetensors",
68
+ "model.language_model.layers.24.self_attn.k_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
69
+ "model.language_model.layers.24.self_attn.o_proj.weight": "model-00001-of-00001.safetensors",
70
+ "model.language_model.layers.24.self_attn.o_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
71
+ "model.language_model.layers.24.self_attn.q_norm.weight": "model-00001-of-00001.safetensors",
72
+ "model.language_model.layers.24.self_attn.q_proj.weight": "model-00001-of-00001.safetensors",
73
+ "model.language_model.layers.24.self_attn.q_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
74
+ "model.language_model.layers.24.self_attn.v_proj.weight": "model-00001-of-00001.safetensors",
75
+ "model.language_model.layers.24.self_attn.v_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
76
+ "model.language_model.layers.1.input_layernorm.weight": "model-00001-of-00001.safetensors",
77
+ "model.language_model.layers.1.mlp.down_proj.weight": "model-00001-of-00001.safetensors",
78
+ "model.language_model.layers.1.mlp.down_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
79
+ "model.language_model.layers.1.mlp.gate_proj.weight": "model-00001-of-00001.safetensors",
80
+ "model.language_model.layers.1.mlp.gate_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
81
+ "model.language_model.layers.1.mlp.up_proj.weight": "model-00001-of-00001.safetensors",
82
+ "model.language_model.layers.1.mlp.up_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
83
+ "model.language_model.layers.1.post_attention_layernorm.weight": "model-00001-of-00001.safetensors",
84
+ "model.language_model.layers.1.self_attn.k_norm.weight": "model-00001-of-00001.safetensors",
85
+ "model.language_model.layers.1.self_attn.k_proj.weight": "model-00001-of-00001.safetensors",
86
+ "model.language_model.layers.1.self_attn.k_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
87
+ "model.language_model.layers.1.self_attn.o_proj.weight": "model-00001-of-00001.safetensors",
88
+ "model.language_model.layers.1.self_attn.o_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
89
+ "model.language_model.layers.1.self_attn.q_norm.weight": "model-00001-of-00001.safetensors",
90
+ "model.language_model.layers.1.self_attn.q_proj.weight": "model-00001-of-00001.safetensors",
91
+ "model.language_model.layers.1.self_attn.q_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
92
+ "model.language_model.layers.1.self_attn.v_proj.weight": "model-00001-of-00001.safetensors",
93
+ "model.language_model.layers.1.self_attn.v_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
94
+ "model.language_model.layers.7.input_layernorm.weight": "model-00001-of-00001.safetensors",
95
+ "model.language_model.layers.7.mlp.down_proj.weight": "model-00001-of-00001.safetensors",
96
+ "model.language_model.layers.7.mlp.down_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
97
+ "model.language_model.layers.7.mlp.gate_proj.weight": "model-00001-of-00001.safetensors",
98
+ "model.language_model.layers.7.mlp.gate_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
99
+ "model.language_model.layers.7.mlp.up_proj.weight": "model-00001-of-00001.safetensors",
100
+ "model.language_model.layers.7.mlp.up_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
101
+ "model.language_model.layers.7.post_attention_layernorm.weight": "model-00001-of-00001.safetensors",
102
+ "model.language_model.layers.7.self_attn.k_norm.weight": "model-00001-of-00001.safetensors",
103
+ "model.language_model.layers.7.self_attn.k_proj.weight": "model-00001-of-00001.safetensors",
104
+ "model.language_model.layers.7.self_attn.k_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
105
+ "model.language_model.layers.7.self_attn.o_proj.weight": "model-00001-of-00001.safetensors",
106
+ "model.language_model.layers.7.self_attn.o_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
107
+ "model.language_model.layers.7.self_attn.q_norm.weight": "model-00001-of-00001.safetensors",
108
+ "model.language_model.layers.7.self_attn.q_proj.weight": "model-00001-of-00001.safetensors",
109
+ "model.language_model.layers.7.self_attn.q_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
110
+ "model.language_model.layers.7.self_attn.v_proj.weight": "model-00001-of-00001.safetensors",
111
+ "model.language_model.layers.7.self_attn.v_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
112
+ "model.language_model.layers.23.input_layernorm.weight": "model-00001-of-00001.safetensors",
113
+ "model.language_model.layers.23.mlp.down_proj.weight": "model-00001-of-00001.safetensors",
114
+ "model.language_model.layers.23.mlp.down_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
115
+ "model.language_model.layers.23.mlp.gate_proj.weight": "model-00001-of-00001.safetensors",
116
+ "model.language_model.layers.23.mlp.gate_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
117
+ "model.language_model.layers.23.mlp.up_proj.weight": "model-00001-of-00001.safetensors",
118
+ "model.language_model.layers.23.mlp.up_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
119
+ "model.language_model.layers.23.post_attention_layernorm.weight": "model-00001-of-00001.safetensors",
120
+ "model.language_model.layers.23.self_attn.k_norm.weight": "model-00001-of-00001.safetensors",
121
+ "model.language_model.layers.23.self_attn.k_proj.weight": "model-00001-of-00001.safetensors",
122
+ "model.language_model.layers.23.self_attn.k_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
123
+ "model.language_model.layers.23.self_attn.o_proj.weight": "model-00001-of-00001.safetensors",
124
+ "model.language_model.layers.23.self_attn.o_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
125
+ "model.language_model.layers.23.self_attn.q_norm.weight": "model-00001-of-00001.safetensors",
126
+ "model.language_model.layers.23.self_attn.q_proj.weight": "model-00001-of-00001.safetensors",
127
+ "model.language_model.layers.23.self_attn.q_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
128
+ "model.language_model.layers.23.self_attn.v_proj.weight": "model-00001-of-00001.safetensors",
129
+ "model.language_model.layers.23.self_attn.v_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
130
+ "model.language_model.layers.13.input_layernorm.weight": "model-00001-of-00001.safetensors",
131
+ "model.language_model.layers.13.mlp.down_proj.weight": "model-00001-of-00001.safetensors",
132
+ "model.language_model.layers.13.mlp.down_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
133
+ "model.language_model.layers.13.mlp.gate_proj.weight": "model-00001-of-00001.safetensors",
134
+ "model.language_model.layers.13.mlp.gate_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
135
+ "model.language_model.layers.13.mlp.up_proj.weight": "model-00001-of-00001.safetensors",
136
+ "model.language_model.layers.13.mlp.up_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
137
+ "model.language_model.layers.13.post_attention_layernorm.weight": "model-00001-of-00001.safetensors",
138
+ "model.language_model.layers.13.self_attn.k_norm.weight": "model-00001-of-00001.safetensors",
139
+ "model.language_model.layers.13.self_attn.k_proj.weight": "model-00001-of-00001.safetensors",
140
+ "model.language_model.layers.13.self_attn.k_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
141
+ "model.language_model.layers.13.self_attn.o_proj.weight": "model-00001-of-00001.safetensors",
142
+ "model.language_model.layers.13.self_attn.o_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
143
+ "model.language_model.layers.13.self_attn.q_norm.weight": "model-00001-of-00001.safetensors",
144
+ "model.language_model.layers.13.self_attn.q_proj.weight": "model-00001-of-00001.safetensors",
145
+ "model.language_model.layers.13.self_attn.q_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
146
+ "model.language_model.layers.13.self_attn.v_proj.weight": "model-00001-of-00001.safetensors",
147
+ "model.language_model.layers.13.self_attn.v_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
148
+ "model.language_model.layers.20.input_layernorm.weight": "model-00001-of-00001.safetensors",
149
+ "model.language_model.layers.20.mlp.down_proj.weight": "model-00001-of-00001.safetensors",
150
+ "model.language_model.layers.20.mlp.down_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
151
+ "model.language_model.layers.20.mlp.gate_proj.weight": "model-00001-of-00001.safetensors",
152
+ "model.language_model.layers.20.mlp.gate_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
153
+ "model.language_model.layers.20.mlp.up_proj.weight": "model-00001-of-00001.safetensors",
154
+ "model.language_model.layers.20.mlp.up_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
155
+ "model.language_model.layers.20.post_attention_layernorm.weight": "model-00001-of-00001.safetensors",
156
+ "model.language_model.layers.20.self_attn.k_norm.weight": "model-00001-of-00001.safetensors",
157
+ "model.language_model.layers.20.self_attn.k_proj.weight": "model-00001-of-00001.safetensors",
158
+ "model.language_model.layers.20.self_attn.k_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
159
+ "model.language_model.layers.20.self_attn.o_proj.weight": "model-00001-of-00001.safetensors",
160
+ "model.language_model.layers.20.self_attn.o_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
161
+ "model.language_model.layers.20.self_attn.q_norm.weight": "model-00001-of-00001.safetensors",
162
+ "model.language_model.layers.20.self_attn.q_proj.weight": "model-00001-of-00001.safetensors",
163
+ "model.language_model.layers.20.self_attn.q_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
164
+ "model.language_model.layers.20.self_attn.v_proj.weight": "model-00001-of-00001.safetensors",
165
+ "model.language_model.layers.20.self_attn.v_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
166
+ "model.language_model.layers.27.input_layernorm.weight": "model-00001-of-00001.safetensors",
167
+ "model.language_model.layers.27.mlp.down_proj.weight": "model-00001-of-00001.safetensors",
168
+ "model.language_model.layers.27.mlp.down_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
169
+ "model.language_model.layers.27.mlp.gate_proj.weight": "model-00001-of-00001.safetensors",
170
+ "model.language_model.layers.27.mlp.gate_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
171
+ "model.language_model.layers.27.mlp.up_proj.weight": "model-00001-of-00001.safetensors",
172
+ "model.language_model.layers.27.mlp.up_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
173
+ "model.language_model.layers.27.post_attention_layernorm.weight": "model-00001-of-00001.safetensors",
174
+ "model.language_model.layers.27.self_attn.k_norm.weight": "model-00001-of-00001.safetensors",
175
+ "model.language_model.layers.27.self_attn.k_proj.weight": "model-00001-of-00001.safetensors",
176
+ "model.language_model.layers.27.self_attn.k_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
177
+ "model.language_model.layers.27.self_attn.o_proj.weight": "model-00001-of-00001.safetensors",
178
+ "model.language_model.layers.27.self_attn.o_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
179
+ "model.language_model.layers.27.self_attn.q_norm.weight": "model-00001-of-00001.safetensors",
180
+ "model.language_model.layers.27.self_attn.q_proj.weight": "model-00001-of-00001.safetensors",
181
+ "model.language_model.layers.27.self_attn.q_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
182
+ "model.language_model.layers.27.self_attn.v_proj.weight": "model-00001-of-00001.safetensors",
183
+ "model.language_model.layers.27.self_attn.v_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
184
+ "model.language_model.layers.14.input_layernorm.weight": "model-00001-of-00001.safetensors",
185
+ "model.language_model.layers.14.mlp.down_proj.weight": "model-00001-of-00001.safetensors",
186
+ "model.language_model.layers.14.mlp.down_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
187
+ "model.language_model.layers.14.mlp.gate_proj.weight": "model-00001-of-00001.safetensors",
188
+ "model.language_model.layers.14.mlp.gate_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
189
+ "model.language_model.layers.14.mlp.up_proj.weight": "model-00001-of-00001.safetensors",
190
+ "model.language_model.layers.14.mlp.up_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
191
+ "model.language_model.layers.14.post_attention_layernorm.weight": "model-00001-of-00001.safetensors",
192
+ "model.language_model.layers.14.self_attn.k_norm.weight": "model-00001-of-00001.safetensors",
193
+ "model.language_model.layers.14.self_attn.k_proj.weight": "model-00001-of-00001.safetensors",
194
+ "model.language_model.layers.14.self_attn.k_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
195
+ "model.language_model.layers.14.self_attn.o_proj.weight": "model-00001-of-00001.safetensors",
196
+ "model.language_model.layers.14.self_attn.o_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
197
+ "model.language_model.layers.14.self_attn.q_norm.weight": "model-00001-of-00001.safetensors",
198
+ "model.language_model.layers.14.self_attn.q_proj.weight": "model-00001-of-00001.safetensors",
199
+ "model.language_model.layers.14.self_attn.q_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
200
+ "model.language_model.layers.14.self_attn.v_proj.weight": "model-00001-of-00001.safetensors",
201
+ "model.language_model.layers.14.self_attn.v_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
202
+ "model.language_model.layers.10.input_layernorm.weight": "model-00001-of-00001.safetensors",
203
+ "model.language_model.layers.10.mlp.down_proj.weight": "model-00001-of-00001.safetensors",
204
+ "model.language_model.layers.10.mlp.down_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
205
+ "model.language_model.layers.10.mlp.gate_proj.weight": "model-00001-of-00001.safetensors",
206
+ "model.language_model.layers.10.mlp.gate_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
207
+ "model.language_model.layers.10.mlp.up_proj.weight": "model-00001-of-00001.safetensors",
208
+ "model.language_model.layers.10.mlp.up_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
209
+ "model.language_model.layers.10.post_attention_layernorm.weight": "model-00001-of-00001.safetensors",
210
+ "model.language_model.layers.10.self_attn.k_norm.weight": "model-00001-of-00001.safetensors",
211
+ "model.language_model.layers.10.self_attn.k_proj.weight": "model-00001-of-00001.safetensors",
212
+ "model.language_model.layers.10.self_attn.k_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
213
+ "model.language_model.layers.10.self_attn.o_proj.weight": "model-00001-of-00001.safetensors",
214
+ "model.language_model.layers.10.self_attn.o_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
215
+ "model.language_model.layers.10.self_attn.q_norm.weight": "model-00001-of-00001.safetensors",
216
+ "model.language_model.layers.10.self_attn.q_proj.weight": "model-00001-of-00001.safetensors",
217
+ "model.language_model.layers.10.self_attn.q_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
218
+ "model.language_model.layers.10.self_attn.v_proj.weight": "model-00001-of-00001.safetensors",
219
+ "model.language_model.layers.10.self_attn.v_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
220
+ "model.language_model.layers.11.input_layernorm.weight": "model-00001-of-00001.safetensors",
221
+ "model.language_model.layers.11.mlp.down_proj.weight": "model-00001-of-00001.safetensors",
222
+ "model.language_model.layers.11.mlp.down_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
223
+ "model.language_model.layers.11.mlp.gate_proj.weight": "model-00001-of-00001.safetensors",
224
+ "model.language_model.layers.11.mlp.gate_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
225
+ "model.language_model.layers.11.mlp.up_proj.weight": "model-00001-of-00001.safetensors",
226
+ "model.language_model.layers.11.mlp.up_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
227
+ "model.language_model.layers.11.post_attention_layernorm.weight": "model-00001-of-00001.safetensors",
228
+ "model.language_model.layers.11.self_attn.k_norm.weight": "model-00001-of-00001.safetensors",
229
+ "model.language_model.layers.11.self_attn.k_proj.weight": "model-00001-of-00001.safetensors",
230
+ "model.language_model.layers.11.self_attn.k_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
231
+ "model.language_model.layers.11.self_attn.o_proj.weight": "model-00001-of-00001.safetensors",
232
+ "model.language_model.layers.11.self_attn.o_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
233
+ "model.language_model.layers.11.self_attn.q_norm.weight": "model-00001-of-00001.safetensors",
234
+ "model.language_model.layers.11.self_attn.q_proj.weight": "model-00001-of-00001.safetensors",
235
+ "model.language_model.layers.11.self_attn.q_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
236
+ "model.language_model.layers.11.self_attn.v_proj.weight": "model-00001-of-00001.safetensors",
237
+ "model.language_model.layers.11.self_attn.v_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
238
+ "model.language_model.layers.21.input_layernorm.weight": "model-00001-of-00001.safetensors",
239
+ "model.language_model.layers.21.mlp.down_proj.weight": "model-00001-of-00001.safetensors",
240
+ "model.language_model.layers.21.mlp.down_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
241
+ "model.language_model.layers.21.mlp.gate_proj.weight": "model-00001-of-00001.safetensors",
242
+ "model.language_model.layers.21.mlp.gate_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
243
+ "model.language_model.layers.21.mlp.up_proj.weight": "model-00001-of-00001.safetensors",
244
+ "model.language_model.layers.21.mlp.up_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
245
+ "model.language_model.layers.21.post_attention_layernorm.weight": "model-00001-of-00001.safetensors",
246
+ "model.language_model.layers.21.self_attn.k_norm.weight": "model-00001-of-00001.safetensors",
247
+ "model.language_model.layers.21.self_attn.k_proj.weight": "model-00001-of-00001.safetensors",
248
+ "model.language_model.layers.21.self_attn.k_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
249
+ "model.language_model.layers.21.self_attn.o_proj.weight": "model-00001-of-00001.safetensors",
250
+ "model.language_model.layers.21.self_attn.o_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
251
+ "model.language_model.layers.21.self_attn.q_norm.weight": "model-00001-of-00001.safetensors",
252
+ "model.language_model.layers.21.self_attn.q_proj.weight": "model-00001-of-00001.safetensors",
253
+ "model.language_model.layers.21.self_attn.q_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
254
+ "model.language_model.layers.21.self_attn.v_proj.weight": "model-00001-of-00001.safetensors",
255
+ "model.language_model.layers.21.self_attn.v_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
256
+ "model.language_model.layers.12.input_layernorm.weight": "model-00001-of-00001.safetensors",
257
+ "model.language_model.layers.12.mlp.down_proj.weight": "model-00001-of-00001.safetensors",
258
+ "model.language_model.layers.12.mlp.down_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
259
+ "model.language_model.layers.12.mlp.gate_proj.weight": "model-00001-of-00001.safetensors",
260
+ "model.language_model.layers.12.mlp.gate_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
261
+ "model.language_model.layers.12.mlp.up_proj.weight": "model-00001-of-00001.safetensors",
262
+ "model.language_model.layers.12.mlp.up_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
263
+ "model.language_model.layers.12.post_attention_layernorm.weight": "model-00001-of-00001.safetensors",
264
+ "model.language_model.layers.12.self_attn.k_norm.weight": "model-00001-of-00001.safetensors",
265
+ "model.language_model.layers.12.self_attn.k_proj.weight": "model-00001-of-00001.safetensors",
266
+ "model.language_model.layers.12.self_attn.k_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
267
+ "model.language_model.layers.12.self_attn.o_proj.weight": "model-00001-of-00001.safetensors",
268
+ "model.language_model.layers.12.self_attn.o_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
269
+ "model.language_model.layers.12.self_attn.q_norm.weight": "model-00001-of-00001.safetensors",
270
+ "model.language_model.layers.12.self_attn.q_proj.weight": "model-00001-of-00001.safetensors",
271
+ "model.language_model.layers.12.self_attn.q_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
272
+ "model.language_model.layers.12.self_attn.v_proj.weight": "model-00001-of-00001.safetensors",
273
+ "model.language_model.layers.12.self_attn.v_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
274
+ "model.language_model.layers.6.input_layernorm.weight": "model-00001-of-00001.safetensors",
275
+ "model.language_model.layers.6.mlp.down_proj.weight": "model-00001-of-00001.safetensors",
276
+ "model.language_model.layers.6.mlp.down_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
277
+ "model.language_model.layers.6.mlp.gate_proj.weight": "model-00001-of-00001.safetensors",
278
+ "model.language_model.layers.6.mlp.gate_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
279
+ "model.language_model.layers.6.mlp.up_proj.weight": "model-00001-of-00001.safetensors",
280
+ "model.language_model.layers.6.mlp.up_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
281
+ "model.language_model.layers.6.post_attention_layernorm.weight": "model-00001-of-00001.safetensors",
282
+ "model.language_model.layers.6.self_attn.k_norm.weight": "model-00001-of-00001.safetensors",
283
+ "model.language_model.layers.6.self_attn.k_proj.weight": "model-00001-of-00001.safetensors",
284
+ "model.language_model.layers.6.self_attn.k_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
285
+ "model.language_model.layers.6.self_attn.o_proj.weight": "model-00001-of-00001.safetensors",
286
+ "model.language_model.layers.6.self_attn.o_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
287
+ "model.language_model.layers.6.self_attn.q_norm.weight": "model-00001-of-00001.safetensors",
288
+ "model.language_model.layers.6.self_attn.q_proj.weight": "model-00001-of-00001.safetensors",
289
+ "model.language_model.layers.6.self_attn.q_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
290
+ "model.language_model.layers.6.self_attn.v_proj.weight": "model-00001-of-00001.safetensors",
291
+ "model.language_model.layers.6.self_attn.v_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
292
+ "model.language_model.layers.2.input_layernorm.weight": "model-00001-of-00001.safetensors",
293
+ "model.language_model.layers.2.mlp.down_proj.weight": "model-00001-of-00001.safetensors",
294
+ "model.language_model.layers.2.mlp.down_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
295
+ "model.language_model.layers.2.mlp.gate_proj.weight": "model-00001-of-00001.safetensors",
296
+ "model.language_model.layers.2.mlp.gate_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
297
+ "model.language_model.layers.2.mlp.up_proj.weight": "model-00001-of-00001.safetensors",
298
+ "model.language_model.layers.2.mlp.up_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
299
+ "model.language_model.layers.2.post_attention_layernorm.weight": "model-00001-of-00001.safetensors",
300
+ "model.language_model.layers.2.self_attn.k_norm.weight": "model-00001-of-00001.safetensors",
301
+ "model.language_model.layers.2.self_attn.k_proj.weight": "model-00001-of-00001.safetensors",
302
+ "model.language_model.layers.2.self_attn.k_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
303
+ "model.language_model.layers.2.self_attn.o_proj.weight": "model-00001-of-00001.safetensors",
304
+ "model.language_model.layers.2.self_attn.o_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
305
+ "model.language_model.layers.2.self_attn.q_norm.weight": "model-00001-of-00001.safetensors",
306
+ "model.language_model.layers.2.self_attn.q_proj.weight": "model-00001-of-00001.safetensors",
307
+ "model.language_model.layers.2.self_attn.q_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
308
+ "model.language_model.layers.2.self_attn.v_proj.weight": "model-00001-of-00001.safetensors",
309
+ "model.language_model.layers.2.self_attn.v_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
310
+ "lm_head.weight": "model-00001-of-00001.safetensors",
311
+ "model.language_model.embed_tokens.weight": "model-00001-of-00001.safetensors",
312
+ "model.language_model.norm.weight": "model-00001-of-00001.safetensors",
313
+ "model.visual.blocks.0.attn.proj.bias": "model-00001-of-00001.safetensors",
314
+ "model.visual.blocks.0.attn.proj.weight": "model-00001-of-00001.safetensors",
315
+ "model.visual.blocks.0.attn.qkv.bias": "model-00001-of-00001.safetensors",
316
+ "model.visual.blocks.0.attn.qkv.weight": "model-00001-of-00001.safetensors",
317
+ "model.visual.blocks.0.mlp.linear_fc1.bias": "model-00001-of-00001.safetensors",
318
+ "model.visual.blocks.0.mlp.linear_fc1.weight": "model-00001-of-00001.safetensors",
319
+ "model.visual.blocks.0.mlp.linear_fc2.bias": "model-00001-of-00001.safetensors",
320
+ "model.visual.blocks.0.mlp.linear_fc2.weight": "model-00001-of-00001.safetensors",
321
+ "model.visual.blocks.0.norm1.bias": "model-00001-of-00001.safetensors",
322
+ "model.visual.blocks.0.norm1.weight": "model-00001-of-00001.safetensors",
323
+ "model.visual.blocks.0.norm2.bias": "model-00001-of-00001.safetensors",
324
+ "model.visual.blocks.0.norm2.weight": "model-00001-of-00001.safetensors",
325
+ "model.visual.blocks.1.attn.proj.bias": "model-00001-of-00001.safetensors",
326
+ "model.visual.blocks.1.attn.proj.weight": "model-00001-of-00001.safetensors",
327
+ "model.visual.blocks.1.attn.qkv.bias": "model-00001-of-00001.safetensors",
328
+ "model.visual.blocks.1.attn.qkv.weight": "model-00001-of-00001.safetensors",
329
+ "model.visual.blocks.1.mlp.linear_fc1.bias": "model-00001-of-00001.safetensors",
330
+ "model.visual.blocks.1.mlp.linear_fc1.weight": "model-00001-of-00001.safetensors",
331
+ "model.visual.blocks.1.mlp.linear_fc2.bias": "model-00001-of-00001.safetensors",
332
+ "model.visual.blocks.1.mlp.linear_fc2.weight": "model-00001-of-00001.safetensors",
333
+ "model.visual.blocks.1.norm1.bias": "model-00001-of-00001.safetensors",
334
+ "model.visual.blocks.1.norm1.weight": "model-00001-of-00001.safetensors",
335
+ "model.visual.blocks.1.norm2.bias": "model-00001-of-00001.safetensors",
336
+ "model.visual.blocks.1.norm2.weight": "model-00001-of-00001.safetensors",
337
+ "model.visual.blocks.10.attn.proj.bias": "model-00001-of-00001.safetensors",
338
+ "model.visual.blocks.10.attn.proj.weight": "model-00001-of-00001.safetensors",
339
+ "model.visual.blocks.10.attn.qkv.bias": "model-00001-of-00001.safetensors",
340
+ "model.visual.blocks.10.attn.qkv.weight": "model-00001-of-00001.safetensors",
341
+ "model.visual.blocks.10.mlp.linear_fc1.bias": "model-00001-of-00001.safetensors",
342
+ "model.visual.blocks.10.mlp.linear_fc1.weight": "model-00001-of-00001.safetensors",
343
+ "model.visual.blocks.10.mlp.linear_fc2.bias": "model-00001-of-00001.safetensors",
344
+ "model.visual.blocks.10.mlp.linear_fc2.weight": "model-00001-of-00001.safetensors",
345
+ "model.visual.blocks.10.norm1.bias": "model-00001-of-00001.safetensors",
346
+ "model.visual.blocks.10.norm1.weight": "model-00001-of-00001.safetensors",
347
+ "model.visual.blocks.10.norm2.bias": "model-00001-of-00001.safetensors",
348
+ "model.visual.blocks.10.norm2.weight": "model-00001-of-00001.safetensors",
349
+ "model.visual.blocks.11.attn.proj.bias": "model-00001-of-00001.safetensors",
350
+ "model.visual.blocks.11.attn.proj.weight": "model-00001-of-00001.safetensors",
351
+ "model.visual.blocks.11.attn.qkv.bias": "model-00001-of-00001.safetensors",
352
+ "model.visual.blocks.11.attn.qkv.weight": "model-00001-of-00001.safetensors",
353
+ "model.visual.blocks.11.mlp.linear_fc1.bias": "model-00001-of-00001.safetensors",
354
+ "model.visual.blocks.11.mlp.linear_fc1.weight": "model-00001-of-00001.safetensors",
355
+ "model.visual.blocks.11.mlp.linear_fc2.bias": "model-00001-of-00001.safetensors",
356
+ "model.visual.blocks.11.mlp.linear_fc2.weight": "model-00001-of-00001.safetensors",
357
+ "model.visual.blocks.11.norm1.bias": "model-00001-of-00001.safetensors",
358
+ "model.visual.blocks.11.norm1.weight": "model-00001-of-00001.safetensors",
359
+ "model.visual.blocks.11.norm2.bias": "model-00001-of-00001.safetensors",
360
+ "model.visual.blocks.11.norm2.weight": "model-00001-of-00001.safetensors",
361
+ "model.visual.blocks.12.attn.proj.bias": "model-00001-of-00001.safetensors",
362
+ "model.visual.blocks.12.attn.proj.weight": "model-00001-of-00001.safetensors",
363
+ "model.visual.blocks.12.attn.qkv.bias": "model-00001-of-00001.safetensors",
364
+ "model.visual.blocks.12.attn.qkv.weight": "model-00001-of-00001.safetensors",
365
+ "model.visual.blocks.12.mlp.linear_fc1.bias": "model-00001-of-00001.safetensors",
366
+ "model.visual.blocks.12.mlp.linear_fc1.weight": "model-00001-of-00001.safetensors",
367
+ "model.visual.blocks.12.mlp.linear_fc2.bias": "model-00001-of-00001.safetensors",
368
+ "model.visual.blocks.12.mlp.linear_fc2.weight": "model-00001-of-00001.safetensors",
369
+ "model.visual.blocks.12.norm1.bias": "model-00001-of-00001.safetensors",
370
+ "model.visual.blocks.12.norm1.weight": "model-00001-of-00001.safetensors",
371
+ "model.visual.blocks.12.norm2.bias": "model-00001-of-00001.safetensors",
372
+ "model.visual.blocks.12.norm2.weight": "model-00001-of-00001.safetensors",
373
+ "model.visual.blocks.13.attn.proj.bias": "model-00001-of-00001.safetensors",
374
+ "model.visual.blocks.13.attn.proj.weight": "model-00001-of-00001.safetensors",
375
+ "model.visual.blocks.13.attn.qkv.bias": "model-00001-of-00001.safetensors",
376
+ "model.visual.blocks.13.attn.qkv.weight": "model-00001-of-00001.safetensors",
377
+ "model.visual.blocks.13.mlp.linear_fc1.bias": "model-00001-of-00001.safetensors",
378
+ "model.visual.blocks.13.mlp.linear_fc1.weight": "model-00001-of-00001.safetensors",
379
+ "model.visual.blocks.13.mlp.linear_fc2.bias": "model-00001-of-00001.safetensors",
380
+ "model.visual.blocks.13.mlp.linear_fc2.weight": "model-00001-of-00001.safetensors",
381
+ "model.visual.blocks.13.norm1.bias": "model-00001-of-00001.safetensors",
382
+ "model.visual.blocks.13.norm1.weight": "model-00001-of-00001.safetensors",
383
+ "model.visual.blocks.13.norm2.bias": "model-00001-of-00001.safetensors",
384
+ "model.visual.blocks.13.norm2.weight": "model-00001-of-00001.safetensors",
385
+ "model.visual.blocks.14.attn.proj.bias": "model-00001-of-00001.safetensors",
386
+ "model.visual.blocks.14.attn.proj.weight": "model-00001-of-00001.safetensors",
387
+ "model.visual.blocks.14.attn.qkv.bias": "model-00001-of-00001.safetensors",
388
+ "model.visual.blocks.14.attn.qkv.weight": "model-00001-of-00001.safetensors",
389
+ "model.visual.blocks.14.mlp.linear_fc1.bias": "model-00001-of-00001.safetensors",
390
+ "model.visual.blocks.14.mlp.linear_fc1.weight": "model-00001-of-00001.safetensors",
391
+ "model.visual.blocks.14.mlp.linear_fc2.bias": "model-00001-of-00001.safetensors",
392
+ "model.visual.blocks.14.mlp.linear_fc2.weight": "model-00001-of-00001.safetensors",
393
+ "model.visual.blocks.14.norm1.bias": "model-00001-of-00001.safetensors",
394
+ "model.visual.blocks.14.norm1.weight": "model-00001-of-00001.safetensors",
395
+ "model.visual.blocks.14.norm2.bias": "model-00001-of-00001.safetensors",
396
+ "model.visual.blocks.14.norm2.weight": "model-00001-of-00001.safetensors",
397
+ "model.visual.blocks.15.attn.proj.bias": "model-00001-of-00001.safetensors",
398
+ "model.visual.blocks.15.attn.proj.weight": "model-00001-of-00001.safetensors",
399
+ "model.visual.blocks.15.attn.qkv.bias": "model-00001-of-00001.safetensors",
400
+ "model.visual.blocks.15.attn.qkv.weight": "model-00001-of-00001.safetensors",
401
+ "model.visual.blocks.15.mlp.linear_fc1.bias": "model-00001-of-00001.safetensors",
402
+ "model.visual.blocks.15.mlp.linear_fc1.weight": "model-00001-of-00001.safetensors",
403
+ "model.visual.blocks.15.mlp.linear_fc2.bias": "model-00001-of-00001.safetensors",
404
+ "model.visual.blocks.15.mlp.linear_fc2.weight": "model-00001-of-00001.safetensors",
405
+ "model.visual.blocks.15.norm1.bias": "model-00001-of-00001.safetensors",
406
+ "model.visual.blocks.15.norm1.weight": "model-00001-of-00001.safetensors",
407
+ "model.visual.blocks.15.norm2.bias": "model-00001-of-00001.safetensors",
408
+ "model.visual.blocks.15.norm2.weight": "model-00001-of-00001.safetensors",
409
+ "model.visual.blocks.16.attn.proj.bias": "model-00001-of-00001.safetensors",
410
+ "model.visual.blocks.16.attn.proj.weight": "model-00001-of-00001.safetensors",
411
+ "model.visual.blocks.16.attn.qkv.bias": "model-00001-of-00001.safetensors",
412
+ "model.visual.blocks.16.attn.qkv.weight": "model-00001-of-00001.safetensors",
413
+ "model.visual.blocks.16.mlp.linear_fc1.bias": "model-00001-of-00001.safetensors",
414
+ "model.visual.blocks.16.mlp.linear_fc1.weight": "model-00001-of-00001.safetensors",
415
+ "model.visual.blocks.16.mlp.linear_fc2.bias": "model-00001-of-00001.safetensors",
416
+ "model.visual.blocks.16.mlp.linear_fc2.weight": "model-00001-of-00001.safetensors",
417
+ "model.visual.blocks.16.norm1.bias": "model-00001-of-00001.safetensors",
418
+ "model.visual.blocks.16.norm1.weight": "model-00001-of-00001.safetensors",
419
+ "model.visual.blocks.16.norm2.bias": "model-00001-of-00001.safetensors",
420
+ "model.visual.blocks.16.norm2.weight": "model-00001-of-00001.safetensors",
421
+ "model.visual.blocks.17.attn.proj.bias": "model-00001-of-00001.safetensors",
422
+ "model.visual.blocks.17.attn.proj.weight": "model-00001-of-00001.safetensors",
423
+ "model.visual.blocks.17.attn.qkv.bias": "model-00001-of-00001.safetensors",
424
+ "model.visual.blocks.17.attn.qkv.weight": "model-00001-of-00001.safetensors",
425
+ "model.visual.blocks.17.mlp.linear_fc1.bias": "model-00001-of-00001.safetensors",
426
+ "model.visual.blocks.17.mlp.linear_fc1.weight": "model-00001-of-00001.safetensors",
427
+ "model.visual.blocks.17.mlp.linear_fc2.bias": "model-00001-of-00001.safetensors",
428
+ "model.visual.blocks.17.mlp.linear_fc2.weight": "model-00001-of-00001.safetensors",
429
+ "model.visual.blocks.17.norm1.bias": "model-00001-of-00001.safetensors",
430
+ "model.visual.blocks.17.norm1.weight": "model-00001-of-00001.safetensors",
431
+ "model.visual.blocks.17.norm2.bias": "model-00001-of-00001.safetensors",
432
+ "model.visual.blocks.17.norm2.weight": "model-00001-of-00001.safetensors",
433
+ "model.visual.blocks.18.attn.proj.bias": "model-00001-of-00001.safetensors",
434
+ "model.visual.blocks.18.attn.proj.weight": "model-00001-of-00001.safetensors",
435
+ "model.visual.blocks.18.attn.qkv.bias": "model-00001-of-00001.safetensors",
436
+ "model.visual.blocks.18.attn.qkv.weight": "model-00001-of-00001.safetensors",
437
+ "model.visual.blocks.18.mlp.linear_fc1.bias": "model-00001-of-00001.safetensors",
438
+ "model.visual.blocks.18.mlp.linear_fc1.weight": "model-00001-of-00001.safetensors",
439
+ "model.visual.blocks.18.mlp.linear_fc2.bias": "model-00001-of-00001.safetensors",
440
+ "model.visual.blocks.18.mlp.linear_fc2.weight": "model-00001-of-00001.safetensors",
441
+ "model.visual.blocks.18.norm1.bias": "model-00001-of-00001.safetensors",
442
+ "model.visual.blocks.18.norm1.weight": "model-00001-of-00001.safetensors",
443
+ "model.visual.blocks.18.norm2.bias": "model-00001-of-00001.safetensors",
444
+ "model.visual.blocks.18.norm2.weight": "model-00001-of-00001.safetensors",
445
+ "model.visual.blocks.19.attn.proj.bias": "model-00001-of-00001.safetensors",
446
+ "model.visual.blocks.19.attn.proj.weight": "model-00001-of-00001.safetensors",
447
+ "model.visual.blocks.19.attn.qkv.bias": "model-00001-of-00001.safetensors",
448
+ "model.visual.blocks.19.attn.qkv.weight": "model-00001-of-00001.safetensors",
449
+ "model.visual.blocks.19.mlp.linear_fc1.bias": "model-00001-of-00001.safetensors",
450
+ "model.visual.blocks.19.mlp.linear_fc1.weight": "model-00001-of-00001.safetensors",
451
+ "model.visual.blocks.19.mlp.linear_fc2.bias": "model-00001-of-00001.safetensors",
452
+ "model.visual.blocks.19.mlp.linear_fc2.weight": "model-00001-of-00001.safetensors",
453
+ "model.visual.blocks.19.norm1.bias": "model-00001-of-00001.safetensors",
454
+ "model.visual.blocks.19.norm1.weight": "model-00001-of-00001.safetensors",
455
+ "model.visual.blocks.19.norm2.bias": "model-00001-of-00001.safetensors",
456
+ "model.visual.blocks.19.norm2.weight": "model-00001-of-00001.safetensors",
457
+ "model.visual.blocks.2.attn.proj.bias": "model-00001-of-00001.safetensors",
458
+ "model.visual.blocks.2.attn.proj.weight": "model-00001-of-00001.safetensors",
459
+ "model.visual.blocks.2.attn.qkv.bias": "model-00001-of-00001.safetensors",
460
+ "model.visual.blocks.2.attn.qkv.weight": "model-00001-of-00001.safetensors",
461
+ "model.visual.blocks.2.mlp.linear_fc1.bias": "model-00001-of-00001.safetensors",
462
+ "model.visual.blocks.2.mlp.linear_fc1.weight": "model-00001-of-00001.safetensors",
463
+ "model.visual.blocks.2.mlp.linear_fc2.bias": "model-00001-of-00001.safetensors",
464
+ "model.visual.blocks.2.mlp.linear_fc2.weight": "model-00001-of-00001.safetensors",
465
+ "model.visual.blocks.2.norm1.bias": "model-00001-of-00001.safetensors",
466
+ "model.visual.blocks.2.norm1.weight": "model-00001-of-00001.safetensors",
467
+ "model.visual.blocks.2.norm2.bias": "model-00001-of-00001.safetensors",
468
+ "model.visual.blocks.2.norm2.weight": "model-00001-of-00001.safetensors",
469
+ "model.visual.blocks.20.attn.proj.bias": "model-00001-of-00001.safetensors",
470
+ "model.visual.blocks.20.attn.proj.weight": "model-00001-of-00001.safetensors",
471
+ "model.visual.blocks.20.attn.qkv.bias": "model-00001-of-00001.safetensors",
472
+ "model.visual.blocks.20.attn.qkv.weight": "model-00001-of-00001.safetensors",
473
+ "model.visual.blocks.20.mlp.linear_fc1.bias": "model-00001-of-00001.safetensors",
474
+ "model.visual.blocks.20.mlp.linear_fc1.weight": "model-00001-of-00001.safetensors",
475
+ "model.visual.blocks.20.mlp.linear_fc2.bias": "model-00001-of-00001.safetensors",
476
+ "model.visual.blocks.20.mlp.linear_fc2.weight": "model-00001-of-00001.safetensors",
477
+ "model.visual.blocks.20.norm1.bias": "model-00001-of-00001.safetensors",
478
+ "model.visual.blocks.20.norm1.weight": "model-00001-of-00001.safetensors",
479
+ "model.visual.blocks.20.norm2.bias": "model-00001-of-00001.safetensors",
480
+ "model.visual.blocks.20.norm2.weight": "model-00001-of-00001.safetensors",
481
+ "model.visual.blocks.21.attn.proj.bias": "model-00001-of-00001.safetensors",
482
+ "model.visual.blocks.21.attn.proj.weight": "model-00001-of-00001.safetensors",
483
+ "model.visual.blocks.21.attn.qkv.bias": "model-00001-of-00001.safetensors",
484
+ "model.visual.blocks.21.attn.qkv.weight": "model-00001-of-00001.safetensors",
485
+ "model.visual.blocks.21.mlp.linear_fc1.bias": "model-00001-of-00001.safetensors",
486
+ "model.visual.blocks.21.mlp.linear_fc1.weight": "model-00001-of-00001.safetensors",
487
+ "model.visual.blocks.21.mlp.linear_fc2.bias": "model-00001-of-00001.safetensors",
488
+ "model.visual.blocks.21.mlp.linear_fc2.weight": "model-00001-of-00001.safetensors",
489
+ "model.visual.blocks.21.norm1.bias": "model-00001-of-00001.safetensors",
490
+ "model.visual.blocks.21.norm1.weight": "model-00001-of-00001.safetensors",
491
+ "model.visual.blocks.21.norm2.bias": "model-00001-of-00001.safetensors",
492
+ "model.visual.blocks.21.norm2.weight": "model-00001-of-00001.safetensors",
493
+ "model.visual.blocks.22.attn.proj.bias": "model-00001-of-00001.safetensors",
494
+ "model.visual.blocks.22.attn.proj.weight": "model-00001-of-00001.safetensors",
495
+ "model.visual.blocks.22.attn.qkv.bias": "model-00001-of-00001.safetensors",
496
+ "model.visual.blocks.22.attn.qkv.weight": "model-00001-of-00001.safetensors",
497
+ "model.visual.blocks.22.mlp.linear_fc1.bias": "model-00001-of-00001.safetensors",
498
+ "model.visual.blocks.22.mlp.linear_fc1.weight": "model-00001-of-00001.safetensors",
499
+ "model.visual.blocks.22.mlp.linear_fc2.bias": "model-00001-of-00001.safetensors",
500
+ "model.visual.blocks.22.mlp.linear_fc2.weight": "model-00001-of-00001.safetensors",
501
+ "model.visual.blocks.22.norm1.bias": "model-00001-of-00001.safetensors",
502
+ "model.visual.blocks.22.norm1.weight": "model-00001-of-00001.safetensors",
503
+ "model.visual.blocks.22.norm2.bias": "model-00001-of-00001.safetensors",
504
+ "model.visual.blocks.22.norm2.weight": "model-00001-of-00001.safetensors",
505
+ "model.visual.blocks.23.attn.proj.bias": "model-00001-of-00001.safetensors",
506
+ "model.visual.blocks.23.attn.proj.weight": "model-00001-of-00001.safetensors",
507
+ "model.visual.blocks.23.attn.qkv.bias": "model-00001-of-00001.safetensors",
508
+ "model.visual.blocks.23.attn.qkv.weight": "model-00001-of-00001.safetensors",
509
+ "model.visual.blocks.23.mlp.linear_fc1.bias": "model-00001-of-00001.safetensors",
510
+ "model.visual.blocks.23.mlp.linear_fc1.weight": "model-00001-of-00001.safetensors",
511
+ "model.visual.blocks.23.mlp.linear_fc2.bias": "model-00001-of-00001.safetensors",
512
+ "model.visual.blocks.23.mlp.linear_fc2.weight": "model-00001-of-00001.safetensors",
513
+ "model.visual.blocks.23.norm1.bias": "model-00001-of-00001.safetensors",
514
+ "model.visual.blocks.23.norm1.weight": "model-00001-of-00001.safetensors",
515
+ "model.visual.blocks.23.norm2.bias": "model-00001-of-00001.safetensors",
516
+ "model.visual.blocks.23.norm2.weight": "model-00001-of-00001.safetensors",
517
+ "model.visual.blocks.3.attn.proj.bias": "model-00001-of-00001.safetensors",
518
+ "model.visual.blocks.3.attn.proj.weight": "model-00001-of-00001.safetensors",
519
+ "model.visual.blocks.3.attn.qkv.bias": "model-00001-of-00001.safetensors",
520
+ "model.visual.blocks.3.attn.qkv.weight": "model-00001-of-00001.safetensors",
521
+ "model.visual.blocks.3.mlp.linear_fc1.bias": "model-00001-of-00001.safetensors",
522
+ "model.visual.blocks.3.mlp.linear_fc1.weight": "model-00001-of-00001.safetensors",
523
+ "model.visual.blocks.3.mlp.linear_fc2.bias": "model-00001-of-00001.safetensors",
524
+ "model.visual.blocks.3.mlp.linear_fc2.weight": "model-00001-of-00001.safetensors",
525
+ "model.visual.blocks.3.norm1.bias": "model-00001-of-00001.safetensors",
526
+ "model.visual.blocks.3.norm1.weight": "model-00001-of-00001.safetensors",
527
+ "model.visual.blocks.3.norm2.bias": "model-00001-of-00001.safetensors",
528
+ "model.visual.blocks.3.norm2.weight": "model-00001-of-00001.safetensors",
529
+ "model.visual.blocks.4.attn.proj.bias": "model-00001-of-00001.safetensors",
530
+ "model.visual.blocks.4.attn.proj.weight": "model-00001-of-00001.safetensors",
531
+ "model.visual.blocks.4.attn.qkv.bias": "model-00001-of-00001.safetensors",
532
+ "model.visual.blocks.4.attn.qkv.weight": "model-00001-of-00001.safetensors",
533
+ "model.visual.blocks.4.mlp.linear_fc1.bias": "model-00001-of-00001.safetensors",
534
+ "model.visual.blocks.4.mlp.linear_fc1.weight": "model-00001-of-00001.safetensors",
535
+ "model.visual.blocks.4.mlp.linear_fc2.bias": "model-00001-of-00001.safetensors",
536
+ "model.visual.blocks.4.mlp.linear_fc2.weight": "model-00001-of-00001.safetensors",
537
+ "model.visual.blocks.4.norm1.bias": "model-00001-of-00001.safetensors",
538
+ "model.visual.blocks.4.norm1.weight": "model-00001-of-00001.safetensors",
539
+ "model.visual.blocks.4.norm2.bias": "model-00001-of-00001.safetensors",
540
+ "model.visual.blocks.4.norm2.weight": "model-00001-of-00001.safetensors",
541
+ "model.visual.blocks.5.attn.proj.bias": "model-00001-of-00001.safetensors",
542
+ "model.visual.blocks.5.attn.proj.weight": "model-00001-of-00001.safetensors",
543
+ "model.visual.blocks.5.attn.qkv.bias": "model-00001-of-00001.safetensors",
544
+ "model.visual.blocks.5.attn.qkv.weight": "model-00001-of-00001.safetensors",
545
+ "model.visual.blocks.5.mlp.linear_fc1.bias": "model-00001-of-00001.safetensors",
546
+ "model.visual.blocks.5.mlp.linear_fc1.weight": "model-00001-of-00001.safetensors",
547
+ "model.visual.blocks.5.mlp.linear_fc2.bias": "model-00001-of-00001.safetensors",
548
+ "model.visual.blocks.5.mlp.linear_fc2.weight": "model-00001-of-00001.safetensors",
549
+ "model.visual.blocks.5.norm1.bias": "model-00001-of-00001.safetensors",
550
+ "model.visual.blocks.5.norm1.weight": "model-00001-of-00001.safetensors",
551
+ "model.visual.blocks.5.norm2.bias": "model-00001-of-00001.safetensors",
552
+ "model.visual.blocks.5.norm2.weight": "model-00001-of-00001.safetensors",
553
+ "model.visual.blocks.6.attn.proj.bias": "model-00001-of-00001.safetensors",
554
+ "model.visual.blocks.6.attn.proj.weight": "model-00001-of-00001.safetensors",
555
+ "model.visual.blocks.6.attn.qkv.bias": "model-00001-of-00001.safetensors",
556
+ "model.visual.blocks.6.attn.qkv.weight": "model-00001-of-00001.safetensors",
557
+ "model.visual.blocks.6.mlp.linear_fc1.bias": "model-00001-of-00001.safetensors",
558
+ "model.visual.blocks.6.mlp.linear_fc1.weight": "model-00001-of-00001.safetensors",
559
+ "model.visual.blocks.6.mlp.linear_fc2.bias": "model-00001-of-00001.safetensors",
560
+ "model.visual.blocks.6.mlp.linear_fc2.weight": "model-00001-of-00001.safetensors",
561
+ "model.visual.blocks.6.norm1.bias": "model-00001-of-00001.safetensors",
562
+ "model.visual.blocks.6.norm1.weight": "model-00001-of-00001.safetensors",
563
+ "model.visual.blocks.6.norm2.bias": "model-00001-of-00001.safetensors",
564
+ "model.visual.blocks.6.norm2.weight": "model-00001-of-00001.safetensors",
565
+ "model.visual.blocks.7.attn.proj.bias": "model-00001-of-00001.safetensors",
566
+ "model.visual.blocks.7.attn.proj.weight": "model-00001-of-00001.safetensors",
567
+ "model.visual.blocks.7.attn.qkv.bias": "model-00001-of-00001.safetensors",
568
+ "model.visual.blocks.7.attn.qkv.weight": "model-00001-of-00001.safetensors",
569
+ "model.visual.blocks.7.mlp.linear_fc1.bias": "model-00001-of-00001.safetensors",
570
+ "model.visual.blocks.7.mlp.linear_fc1.weight": "model-00001-of-00001.safetensors",
571
+ "model.visual.blocks.7.mlp.linear_fc2.bias": "model-00001-of-00001.safetensors",
572
+ "model.visual.blocks.7.mlp.linear_fc2.weight": "model-00001-of-00001.safetensors",
573
+ "model.visual.blocks.7.norm1.bias": "model-00001-of-00001.safetensors",
574
+ "model.visual.blocks.7.norm1.weight": "model-00001-of-00001.safetensors",
575
+ "model.visual.blocks.7.norm2.bias": "model-00001-of-00001.safetensors",
576
+ "model.visual.blocks.7.norm2.weight": "model-00001-of-00001.safetensors",
577
+ "model.visual.blocks.8.attn.proj.bias": "model-00001-of-00001.safetensors",
578
+ "model.visual.blocks.8.attn.proj.weight": "model-00001-of-00001.safetensors",
579
+ "model.visual.blocks.8.attn.qkv.bias": "model-00001-of-00001.safetensors",
580
+ "model.visual.blocks.8.attn.qkv.weight": "model-00001-of-00001.safetensors",
581
+ "model.visual.blocks.8.mlp.linear_fc1.bias": "model-00001-of-00001.safetensors",
582
+ "model.visual.blocks.8.mlp.linear_fc1.weight": "model-00001-of-00001.safetensors",
583
+ "model.visual.blocks.8.mlp.linear_fc2.bias": "model-00001-of-00001.safetensors",
584
+ "model.visual.blocks.8.mlp.linear_fc2.weight": "model-00001-of-00001.safetensors",
585
+ "model.visual.blocks.8.norm1.bias": "model-00001-of-00001.safetensors",
586
+ "model.visual.blocks.8.norm1.weight": "model-00001-of-00001.safetensors",
587
+ "model.visual.blocks.8.norm2.bias": "model-00001-of-00001.safetensors",
588
+ "model.visual.blocks.8.norm2.weight": "model-00001-of-00001.safetensors",
589
+ "model.visual.blocks.9.attn.proj.bias": "model-00001-of-00001.safetensors",
590
+ "model.visual.blocks.9.attn.proj.weight": "model-00001-of-00001.safetensors",
591
+ "model.visual.blocks.9.attn.qkv.bias": "model-00001-of-00001.safetensors",
592
+ "model.visual.blocks.9.attn.qkv.weight": "model-00001-of-00001.safetensors",
593
+ "model.visual.blocks.9.mlp.linear_fc1.bias": "model-00001-of-00001.safetensors",
594
+ "model.visual.blocks.9.mlp.linear_fc1.weight": "model-00001-of-00001.safetensors",
595
+ "model.visual.blocks.9.mlp.linear_fc2.bias": "model-00001-of-00001.safetensors",
596
+ "model.visual.blocks.9.mlp.linear_fc2.weight": "model-00001-of-00001.safetensors",
597
+ "model.visual.blocks.9.norm1.bias": "model-00001-of-00001.safetensors",
598
+ "model.visual.blocks.9.norm1.weight": "model-00001-of-00001.safetensors",
599
+ "model.visual.blocks.9.norm2.bias": "model-00001-of-00001.safetensors",
600
+ "model.visual.blocks.9.norm2.weight": "model-00001-of-00001.safetensors",
601
+ "model.visual.deepstack_merger_list.0.linear_fc1.bias": "model-00001-of-00001.safetensors",
602
+ "model.visual.deepstack_merger_list.0.linear_fc1.weight": "model-00001-of-00001.safetensors",
603
+ "model.visual.deepstack_merger_list.0.linear_fc2.bias": "model-00001-of-00001.safetensors",
604
+ "model.visual.deepstack_merger_list.0.linear_fc2.weight": "model-00001-of-00001.safetensors",
605
+ "model.visual.deepstack_merger_list.0.norm.bias": "model-00001-of-00001.safetensors",
606
+ "model.visual.deepstack_merger_list.0.norm.weight": "model-00001-of-00001.safetensors",
607
+ "model.visual.deepstack_merger_list.1.linear_fc1.bias": "model-00001-of-00001.safetensors",
608
+ "model.visual.deepstack_merger_list.1.linear_fc1.weight": "model-00001-of-00001.safetensors",
609
+ "model.visual.deepstack_merger_list.1.linear_fc2.bias": "model-00001-of-00001.safetensors",
610
+ "model.visual.deepstack_merger_list.1.linear_fc2.weight": "model-00001-of-00001.safetensors",
611
+ "model.visual.deepstack_merger_list.1.norm.bias": "model-00001-of-00001.safetensors",
612
+ "model.visual.deepstack_merger_list.1.norm.weight": "model-00001-of-00001.safetensors",
613
+ "model.visual.deepstack_merger_list.2.linear_fc1.bias": "model-00001-of-00001.safetensors",
614
+ "model.visual.deepstack_merger_list.2.linear_fc1.weight": "model-00001-of-00001.safetensors",
615
+ "model.visual.deepstack_merger_list.2.linear_fc2.bias": "model-00001-of-00001.safetensors",
616
+ "model.visual.deepstack_merger_list.2.linear_fc2.weight": "model-00001-of-00001.safetensors",
617
+ "model.visual.deepstack_merger_list.2.norm.bias": "model-00001-of-00001.safetensors",
618
+ "model.visual.deepstack_merger_list.2.norm.weight": "model-00001-of-00001.safetensors",
619
+ "model.visual.merger.linear_fc1.bias": "model-00001-of-00001.safetensors",
620
+ "model.visual.merger.linear_fc1.weight": "model-00001-of-00001.safetensors",
621
+ "model.visual.merger.linear_fc2.bias": "model-00001-of-00001.safetensors",
622
+ "model.visual.merger.linear_fc2.weight": "model-00001-of-00001.safetensors",
623
+ "model.visual.merger.norm.bias": "model-00001-of-00001.safetensors",
624
+ "model.visual.merger.norm.weight": "model-00001-of-00001.safetensors",
625
+ "model.visual.patch_embed.proj.bias": "model-00001-of-00001.safetensors",
626
+ "model.visual.patch_embed.proj.weight": "model-00001-of-00001.safetensors",
627
+ "model.visual.pos_embed.weight": "model-00001-of-00001.safetensors",
628
+ "model.language_model.layers.8.input_layernorm.weight": "model-00001-of-00001.safetensors",
629
+ "model.language_model.layers.8.mlp.down_proj.weight": "model-00001-of-00001.safetensors",
630
+ "model.language_model.layers.8.mlp.down_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
631
+ "model.language_model.layers.8.mlp.gate_proj.weight": "model-00001-of-00001.safetensors",
632
+ "model.language_model.layers.8.mlp.gate_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
633
+ "model.language_model.layers.8.mlp.up_proj.weight": "model-00001-of-00001.safetensors",
634
+ "model.language_model.layers.8.mlp.up_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
635
+ "model.language_model.layers.8.post_attention_layernorm.weight": "model-00001-of-00001.safetensors",
636
+ "model.language_model.layers.8.self_attn.k_norm.weight": "model-00001-of-00001.safetensors",
637
+ "model.language_model.layers.8.self_attn.k_proj.weight": "model-00001-of-00001.safetensors",
638
+ "model.language_model.layers.8.self_attn.k_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
639
+ "model.language_model.layers.8.self_attn.o_proj.weight": "model-00001-of-00001.safetensors",
640
+ "model.language_model.layers.8.self_attn.o_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
641
+ "model.language_model.layers.8.self_attn.q_norm.weight": "model-00001-of-00001.safetensors",
642
+ "model.language_model.layers.8.self_attn.q_proj.weight": "model-00001-of-00001.safetensors",
643
+ "model.language_model.layers.8.self_attn.q_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
644
+ "model.language_model.layers.8.self_attn.v_proj.weight": "model-00001-of-00001.safetensors",
645
+ "model.language_model.layers.8.self_attn.v_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
646
+ "model.language_model.layers.15.input_layernorm.weight": "model-00001-of-00001.safetensors",
647
+ "model.language_model.layers.15.mlp.down_proj.weight": "model-00001-of-00001.safetensors",
648
+ "model.language_model.layers.15.mlp.down_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
649
+ "model.language_model.layers.15.mlp.gate_proj.weight": "model-00001-of-00001.safetensors",
650
+ "model.language_model.layers.15.mlp.gate_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
651
+ "model.language_model.layers.15.mlp.up_proj.weight": "model-00001-of-00001.safetensors",
652
+ "model.language_model.layers.15.mlp.up_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
653
+ "model.language_model.layers.15.post_attention_layernorm.weight": "model-00001-of-00001.safetensors",
654
+ "model.language_model.layers.15.self_attn.k_norm.weight": "model-00001-of-00001.safetensors",
655
+ "model.language_model.layers.15.self_attn.k_proj.weight": "model-00001-of-00001.safetensors",
656
+ "model.language_model.layers.15.self_attn.k_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
657
+ "model.language_model.layers.15.self_attn.o_proj.weight": "model-00001-of-00001.safetensors",
658
+ "model.language_model.layers.15.self_attn.o_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
659
+ "model.language_model.layers.15.self_attn.q_norm.weight": "model-00001-of-00001.safetensors",
660
+ "model.language_model.layers.15.self_attn.q_proj.weight": "model-00001-of-00001.safetensors",
661
+ "model.language_model.layers.15.self_attn.q_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
662
+ "model.language_model.layers.15.self_attn.v_proj.weight": "model-00001-of-00001.safetensors",
663
+ "model.language_model.layers.15.self_attn.v_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
664
+ "model.language_model.layers.5.input_layernorm.weight": "model-00001-of-00001.safetensors",
665
+ "model.language_model.layers.5.mlp.down_proj.weight": "model-00001-of-00001.safetensors",
666
+ "model.language_model.layers.5.mlp.down_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
667
+ "model.language_model.layers.5.mlp.gate_proj.weight": "model-00001-of-00001.safetensors",
668
+ "model.language_model.layers.5.mlp.gate_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
669
+ "model.language_model.layers.5.mlp.up_proj.weight": "model-00001-of-00001.safetensors",
670
+ "model.language_model.layers.5.mlp.up_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
671
+ "model.language_model.layers.5.post_attention_layernorm.weight": "model-00001-of-00001.safetensors",
672
+ "model.language_model.layers.5.self_attn.k_norm.weight": "model-00001-of-00001.safetensors",
673
+ "model.language_model.layers.5.self_attn.k_proj.weight": "model-00001-of-00001.safetensors",
674
+ "model.language_model.layers.5.self_attn.k_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
675
+ "model.language_model.layers.5.self_attn.o_proj.weight": "model-00001-of-00001.safetensors",
676
+ "model.language_model.layers.5.self_attn.o_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
677
+ "model.language_model.layers.5.self_attn.q_norm.weight": "model-00001-of-00001.safetensors",
678
+ "model.language_model.layers.5.self_attn.q_proj.weight": "model-00001-of-00001.safetensors",
679
+ "model.language_model.layers.5.self_attn.q_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
680
+ "model.language_model.layers.5.self_attn.v_proj.weight": "model-00001-of-00001.safetensors",
681
+ "model.language_model.layers.5.self_attn.v_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
682
+ "model.language_model.layers.25.input_layernorm.weight": "model-00001-of-00001.safetensors",
683
+ "model.language_model.layers.25.mlp.down_proj.weight": "model-00001-of-00001.safetensors",
684
+ "model.language_model.layers.25.mlp.down_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
685
+ "model.language_model.layers.25.mlp.gate_proj.weight": "model-00001-of-00001.safetensors",
686
+ "model.language_model.layers.25.mlp.gate_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
687
+ "model.language_model.layers.25.mlp.up_proj.weight": "model-00001-of-00001.safetensors",
688
+ "model.language_model.layers.25.mlp.up_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
689
+ "model.language_model.layers.25.post_attention_layernorm.weight": "model-00001-of-00001.safetensors",
690
+ "model.language_model.layers.25.self_attn.k_norm.weight": "model-00001-of-00001.safetensors",
691
+ "model.language_model.layers.25.self_attn.k_proj.weight": "model-00001-of-00001.safetensors",
692
+ "model.language_model.layers.25.self_attn.k_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
693
+ "model.language_model.layers.25.self_attn.o_proj.weight": "model-00001-of-00001.safetensors",
694
+ "model.language_model.layers.25.self_attn.o_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
695
+ "model.language_model.layers.25.self_attn.q_norm.weight": "model-00001-of-00001.safetensors",
696
+ "model.language_model.layers.25.self_attn.q_proj.weight": "model-00001-of-00001.safetensors",
697
+ "model.language_model.layers.25.self_attn.q_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
698
+ "model.language_model.layers.25.self_attn.v_proj.weight": "model-00001-of-00001.safetensors",
699
+ "model.language_model.layers.25.self_attn.v_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
700
+ "model.language_model.layers.18.input_layernorm.weight": "model-00001-of-00001.safetensors",
701
+ "model.language_model.layers.18.mlp.down_proj.weight": "model-00001-of-00001.safetensors",
702
+ "model.language_model.layers.18.mlp.down_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
703
+ "model.language_model.layers.18.mlp.gate_proj.weight": "model-00001-of-00001.safetensors",
704
+ "model.language_model.layers.18.mlp.gate_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
705
+ "model.language_model.layers.18.mlp.up_proj.weight": "model-00001-of-00001.safetensors",
706
+ "model.language_model.layers.18.mlp.up_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
707
+ "model.language_model.layers.18.post_attention_layernorm.weight": "model-00001-of-00001.safetensors",
708
+ "model.language_model.layers.18.self_attn.k_norm.weight": "model-00001-of-00001.safetensors",
709
+ "model.language_model.layers.18.self_attn.k_proj.weight": "model-00001-of-00001.safetensors",
710
+ "model.language_model.layers.18.self_attn.k_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
711
+ "model.language_model.layers.18.self_attn.o_proj.weight": "model-00001-of-00001.safetensors",
712
+ "model.language_model.layers.18.self_attn.o_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
713
+ "model.language_model.layers.18.self_attn.q_norm.weight": "model-00001-of-00001.safetensors",
714
+ "model.language_model.layers.18.self_attn.q_proj.weight": "model-00001-of-00001.safetensors",
715
+ "model.language_model.layers.18.self_attn.q_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
716
+ "model.language_model.layers.18.self_attn.v_proj.weight": "model-00001-of-00001.safetensors",
717
+ "model.language_model.layers.18.self_attn.v_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
718
+ "model.language_model.layers.9.input_layernorm.weight": "model-00001-of-00001.safetensors",
719
+ "model.language_model.layers.9.mlp.down_proj.weight": "model-00001-of-00001.safetensors",
720
+ "model.language_model.layers.9.mlp.down_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
721
+ "model.language_model.layers.9.mlp.gate_proj.weight": "model-00001-of-00001.safetensors",
722
+ "model.language_model.layers.9.mlp.gate_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
723
+ "model.language_model.layers.9.mlp.up_proj.weight": "model-00001-of-00001.safetensors",
724
+ "model.language_model.layers.9.mlp.up_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
725
+ "model.language_model.layers.9.post_attention_layernorm.weight": "model-00001-of-00001.safetensors",
726
+ "model.language_model.layers.9.self_attn.k_norm.weight": "model-00001-of-00001.safetensors",
727
+ "model.language_model.layers.9.self_attn.k_proj.weight": "model-00001-of-00001.safetensors",
728
+ "model.language_model.layers.9.self_attn.k_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
729
+ "model.language_model.layers.9.self_attn.o_proj.weight": "model-00001-of-00001.safetensors",
730
+ "model.language_model.layers.9.self_attn.o_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
731
+ "model.language_model.layers.9.self_attn.q_norm.weight": "model-00001-of-00001.safetensors",
732
+ "model.language_model.layers.9.self_attn.q_proj.weight": "model-00001-of-00001.safetensors",
733
+ "model.language_model.layers.9.self_attn.q_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
734
+ "model.language_model.layers.9.self_attn.v_proj.weight": "model-00001-of-00001.safetensors",
735
+ "model.language_model.layers.9.self_attn.v_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
736
+ "model.language_model.layers.22.input_layernorm.weight": "model-00001-of-00001.safetensors",
737
+ "model.language_model.layers.22.mlp.down_proj.weight": "model-00001-of-00001.safetensors",
738
+ "model.language_model.layers.22.mlp.down_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
739
+ "model.language_model.layers.22.mlp.gate_proj.weight": "model-00001-of-00001.safetensors",
740
+ "model.language_model.layers.22.mlp.gate_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
741
+ "model.language_model.layers.22.mlp.up_proj.weight": "model-00001-of-00001.safetensors",
742
+ "model.language_model.layers.22.mlp.up_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
743
+ "model.language_model.layers.22.post_attention_layernorm.weight": "model-00001-of-00001.safetensors",
744
+ "model.language_model.layers.22.self_attn.k_norm.weight": "model-00001-of-00001.safetensors",
745
+ "model.language_model.layers.22.self_attn.k_proj.weight": "model-00001-of-00001.safetensors",
746
+ "model.language_model.layers.22.self_attn.k_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
747
+ "model.language_model.layers.22.self_attn.o_proj.weight": "model-00001-of-00001.safetensors",
748
+ "model.language_model.layers.22.self_attn.o_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
749
+ "model.language_model.layers.22.self_attn.q_norm.weight": "model-00001-of-00001.safetensors",
750
+ "model.language_model.layers.22.self_attn.q_proj.weight": "model-00001-of-00001.safetensors",
751
+ "model.language_model.layers.22.self_attn.q_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
752
+ "model.language_model.layers.22.self_attn.v_proj.weight": "model-00001-of-00001.safetensors",
753
+ "model.language_model.layers.22.self_attn.v_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
754
+ "model.language_model.layers.3.input_layernorm.weight": "model-00001-of-00001.safetensors",
755
+ "model.language_model.layers.3.mlp.down_proj.weight": "model-00001-of-00001.safetensors",
756
+ "model.language_model.layers.3.mlp.down_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
757
+ "model.language_model.layers.3.mlp.gate_proj.weight": "model-00001-of-00001.safetensors",
758
+ "model.language_model.layers.3.mlp.gate_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
759
+ "model.language_model.layers.3.mlp.up_proj.weight": "model-00001-of-00001.safetensors",
760
+ "model.language_model.layers.3.mlp.up_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
761
+ "model.language_model.layers.3.post_attention_layernorm.weight": "model-00001-of-00001.safetensors",
762
+ "model.language_model.layers.3.self_attn.k_norm.weight": "model-00001-of-00001.safetensors",
763
+ "model.language_model.layers.3.self_attn.k_proj.weight": "model-00001-of-00001.safetensors",
764
+ "model.language_model.layers.3.self_attn.k_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
765
+ "model.language_model.layers.3.self_attn.o_proj.weight": "model-00001-of-00001.safetensors",
766
+ "model.language_model.layers.3.self_attn.o_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
767
+ "model.language_model.layers.3.self_attn.q_norm.weight": "model-00001-of-00001.safetensors",
768
+ "model.language_model.layers.3.self_attn.q_proj.weight": "model-00001-of-00001.safetensors",
769
+ "model.language_model.layers.3.self_attn.q_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
770
+ "model.language_model.layers.3.self_attn.v_proj.weight": "model-00001-of-00001.safetensors",
771
+ "model.language_model.layers.3.self_attn.v_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
772
+ "model.language_model.layers.19.input_layernorm.weight": "model-00001-of-00001.safetensors",
773
+ "model.language_model.layers.19.mlp.down_proj.weight": "model-00001-of-00001.safetensors",
774
+ "model.language_model.layers.19.mlp.down_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
775
+ "model.language_model.layers.19.mlp.gate_proj.weight": "model-00001-of-00001.safetensors",
776
+ "model.language_model.layers.19.mlp.gate_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
777
+ "model.language_model.layers.19.mlp.up_proj.weight": "model-00001-of-00001.safetensors",
778
+ "model.language_model.layers.19.mlp.up_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
779
+ "model.language_model.layers.19.post_attention_layernorm.weight": "model-00001-of-00001.safetensors",
780
+ "model.language_model.layers.19.self_attn.k_norm.weight": "model-00001-of-00001.safetensors",
781
+ "model.language_model.layers.19.self_attn.k_proj.weight": "model-00001-of-00001.safetensors",
782
+ "model.language_model.layers.19.self_attn.k_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
783
+ "model.language_model.layers.19.self_attn.o_proj.weight": "model-00001-of-00001.safetensors",
784
+ "model.language_model.layers.19.self_attn.o_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
785
+ "model.language_model.layers.19.self_attn.q_norm.weight": "model-00001-of-00001.safetensors",
786
+ "model.language_model.layers.19.self_attn.q_proj.weight": "model-00001-of-00001.safetensors",
787
+ "model.language_model.layers.19.self_attn.q_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
788
+ "model.language_model.layers.19.self_attn.v_proj.weight": "model-00001-of-00001.safetensors",
789
+ "model.language_model.layers.19.self_attn.v_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
790
+ "model.language_model.layers.16.input_layernorm.weight": "model-00001-of-00001.safetensors",
791
+ "model.language_model.layers.16.mlp.down_proj.weight": "model-00001-of-00001.safetensors",
792
+ "model.language_model.layers.16.mlp.down_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
793
+ "model.language_model.layers.16.mlp.gate_proj.weight": "model-00001-of-00001.safetensors",
794
+ "model.language_model.layers.16.mlp.gate_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
795
+ "model.language_model.layers.16.mlp.up_proj.weight": "model-00001-of-00001.safetensors",
796
+ "model.language_model.layers.16.mlp.up_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
797
+ "model.language_model.layers.16.post_attention_layernorm.weight": "model-00001-of-00001.safetensors",
798
+ "model.language_model.layers.16.self_attn.k_norm.weight": "model-00001-of-00001.safetensors",
799
+ "model.language_model.layers.16.self_attn.k_proj.weight": "model-00001-of-00001.safetensors",
800
+ "model.language_model.layers.16.self_attn.k_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
801
+ "model.language_model.layers.16.self_attn.o_proj.weight": "model-00001-of-00001.safetensors",
802
+ "model.language_model.layers.16.self_attn.o_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
803
+ "model.language_model.layers.16.self_attn.q_norm.weight": "model-00001-of-00001.safetensors",
804
+ "model.language_model.layers.16.self_attn.q_proj.weight": "model-00001-of-00001.safetensors",
805
+ "model.language_model.layers.16.self_attn.q_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
806
+ "model.language_model.layers.16.self_attn.v_proj.weight": "model-00001-of-00001.safetensors",
807
+ "model.language_model.layers.16.self_attn.v_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
808
+ "model.language_model.layers.26.input_layernorm.weight": "model-00001-of-00001.safetensors",
809
+ "model.language_model.layers.26.mlp.down_proj.weight": "model-00001-of-00001.safetensors",
810
+ "model.language_model.layers.26.mlp.down_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
811
+ "model.language_model.layers.26.mlp.gate_proj.weight": "model-00001-of-00001.safetensors",
812
+ "model.language_model.layers.26.mlp.gate_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
813
+ "model.language_model.layers.26.mlp.up_proj.weight": "model-00001-of-00001.safetensors",
814
+ "model.language_model.layers.26.mlp.up_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
815
+ "model.language_model.layers.26.post_attention_layernorm.weight": "model-00001-of-00001.safetensors",
816
+ "model.language_model.layers.26.self_attn.k_norm.weight": "model-00001-of-00001.safetensors",
817
+ "model.language_model.layers.26.self_attn.k_proj.weight": "model-00001-of-00001.safetensors",
818
+ "model.language_model.layers.26.self_attn.k_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
819
+ "model.language_model.layers.26.self_attn.o_proj.weight": "model-00001-of-00001.safetensors",
820
+ "model.language_model.layers.26.self_attn.o_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
821
+ "model.language_model.layers.26.self_attn.q_norm.weight": "model-00001-of-00001.safetensors",
822
+ "model.language_model.layers.26.self_attn.q_proj.weight": "model-00001-of-00001.safetensors",
823
+ "model.language_model.layers.26.self_attn.q_proj.weight_scale_inv": "model-00001-of-00001.safetensors",
824
+ "model.language_model.layers.26.self_attn.v_proj.weight": "model-00001-of-00001.safetensors",
825
+ "model.language_model.layers.26.self_attn.v_proj.weight_scale_inv": "model-00001-of-00001.safetensors"
826
+ }
827
+ }
LLM/Qwen-VL/Qwen3-VL-2B-Thinking-FP8/preprocessor_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "size": {
3
+ "longest_edge": 16777216,
4
+ "shortest_edge": 65536
5
+ },
6
+ "patch_size": 16,
7
+ "temporal_patch_size": 2,
8
+ "merge_size": 2,
9
+ "image_mean": [
10
+ 0.5,
11
+ 0.5,
12
+ 0.5
13
+ ],
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "processor_class": "Qwen3VLProcessor",
20
+ "image_processor_type": "Qwen2VLImageProcessorFast"
21
+ }
LLM/Qwen-VL/Qwen3-VL-2B-Thinking-FP8/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
LLM/Qwen-VL/Qwen3-VL-2B-Thinking-FP8/tokenizer_config.json ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "chat_template": "{%- set image_count = namespace(value=0) %}\n{%- set video_count = namespace(value=0) %}\n{%- macro render_content(content, do_vision_count) %}\n {%- if content is string %}\n {{- content }}\n {%- else %}\n {%- for item in content %}\n {%- if 'image' in item or 'image_url' in item or item.type == 'image' %}\n {%- if do_vision_count %}\n {%- set image_count.value = image_count.value + 1 %}\n {%- endif %}\n {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}\n <|vision_start|><|image_pad|><|vision_end|>\n {%- elif 'video' in item or item.type == 'video' %}\n {%- if do_vision_count %}\n {%- set video_count.value = video_count.value + 1 %}\n {%- endif %}\n {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}\n <|vision_start|><|video_pad|><|vision_end|>\n {%- elif 'text' in item %}\n {{- item.text }}\n {%- endif %}\n {%- endfor %}\n {%- endif %}\n{%- endmacro %}\n{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- render_content(messages[0].content, false) + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0].role == 'system' %}\n {{- '<|im_start|>system\\n' + render_content(messages[0].content, false) + '<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if 
ns.multi_step_tool and message.role == \"user\" %}\n {%- set content = render_content(message.content, false) %}\n {%- if not(content.startswith('<tool_response>') and content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- set content = render_content(message.content, True) %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {%- set reasoning_content = '' %}\n {%- if message.reasoning_content is string %}\n {%- set reasoning_content = message.reasoning_content %}\n {%- else %}\n {%- if '</think>' in content %}\n {%- set reasoning_content = content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n {%- set content = content.split('</think>')[-1].lstrip('\\n') %}\n {%- endif %}\n {%- endif %}\n {%- if loop.index0 > ns.last_query_index %}\n {%- if loop.last or (not loop.last and reasoning_content) %}\n {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- else %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- endif %}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif 
%}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n<think>\\n' }}\n{%- endif %}\n",
231
+ "clean_up_tokenization_spaces": false,
232
+ "eos_token": "<|im_end|>",
233
+ "errors": "replace",
234
+ "model_max_length": 262144,
235
+ "pad_token": "<|endoftext|>",
236
+ "split_special_tokens": false,
237
+ "tokenizer_class": "Qwen2Tokenizer",
238
+ "unk_token": null
239
+ }
LLM/Qwen-VL/Qwen3-VL-2B-Thinking-FP8/video_preprocessor_config.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "size": {
3
+ "longest_edge": 25165824,
4
+ "shortest_edge": 4096
5
+ },
6
+ "patch_size": 16,
7
+ "temporal_patch_size": 2,
8
+ "merge_size": 2,
9
+ "image_mean": [
10
+ 0.5,
11
+ 0.5,
12
+ 0.5
13
+ ],
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "processor_class": "Qwen3VLProcessor",
20
+ "video_processor_type": "Qwen3VLVideoProcessor"
21
+ }
LLM/Qwen-VL/Qwen3-VL-2B-Thinking-FP8/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
LLM/Qwen3-VL-4B-Instruct-abliterated-FP8/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
LLM/Qwen3-VL-4B-Instruct-abliterated-FP8/README.md ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model:
4
+ - Qwen/Qwen3-VL-4B-Instruct
5
+ pipeline_tag: image-text-to-text
6
+ tags:
7
+ - abliterated
8
+ - uncensored
9
+ library_name: transformers
10
+ ---
11
+
12
+ # huihui-ai/Huihui-Qwen3-VL-4B-Instruct-abliterated-FP8
13
+
14
+
15
+ This is an uncensored version of [Qwen/Qwen3-VL-4B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-4B-Instruct) created with abliteration (see [remove-refusals-with-transformers](https://github.com/Sumandora/remove-refusals-with-transformers) to know more about it).
16
+
17
+ Only the text part of the model was processed; the image part was left untouched.
18
+
19
+ The abliterated model will no longer say "I can’t describe or analyze this image."
20
+
21
+ This FP8 version was converted from [huihui-ai/Huihui-Qwen3-VL-4B-Instruct-abliterated](https://huggingface.co/huihui-ai/Huihui-Qwen3-VL-4B-Instruct-abliterated). For details on the FP8 quantization method used, refer to [finegrained_fp8](https://huggingface.co/docs/transformers/quantization/finegrained_fp8).
22
+
23
+ ## Chat with Image
24
+
25
+ ```python
26
+ from transformers import Qwen3VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
27
+ import os
28
+ import torch
29
+
30
+ cpu_count = os.cpu_count()
31
+ print(f"Number of CPU cores in the system: {cpu_count}")
32
+ half_cpu_count = cpu_count // 2
33
+ os.environ["MKL_NUM_THREADS"] = str(half_cpu_count)
34
+ os.environ["OMP_NUM_THREADS"] = str(half_cpu_count)
35
+ torch.set_num_threads(half_cpu_count)
36
+
37
+ MODEL_ID = "huihui-ai/Huihui-Qwen3-VL-4B-Instruct-abliterated-FP8"
38
+
39
+ # default: Load the model on the available device(s)
40
+ model = Qwen3VLForConditionalGeneration.from_pretrained(
41
+ MODEL_ID,
42
+ device_map="auto",
43
+ trust_remote_code=True,
44
+ dtype=torch.bfloat16,
45
+ low_cpu_mem_usage=True,
46
+ )
47
+ # We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
48
+ # model = Qwen3VLForConditionalGeneration.from_pretrained(
49
+ # "Qwen/Qwen3-VL-235B-A22B-Instruct",
50
+ # dtype=torch.bfloat16,
51
+ # attn_implementation="flash_attention_2",
52
+ # device_map="auto",
53
+ # )
54
+
55
+ processor = AutoProcessor.from_pretrained(MODEL_ID)
56
+
57
+
58
+ image_path = "/png/cars.jpg"
59
+
60
+ messages = [
61
+ {
62
+ "role": "user",
63
+ "content": [
64
+ {
65
+ "type": "image", "image": f"{image_path}",
66
+ },
67
+ {"type": "text", "text": "Describe this image."},
68
+ ],
69
+ }
70
+ ]
71
+
72
+ # Preparation for inference
73
+ inputs = processor.apply_chat_template(
74
+ messages,
75
+ tokenize=True,
76
+ add_generation_prompt=True,
77
+ return_dict=True,
78
+ return_tensors="pt"
79
+ ).to(model.device)
80
+
81
+ # Inference: Generation of the output
82
+ generated_ids = model.generate(**inputs, max_new_tokens=128)
83
+ generated_ids_trimmed = [
84
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
85
+ ]
86
+ output_text = processor.batch_decode(
87
+ generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
88
+ )
89
+ print(output_text)
90
+
91
+ ```
92
+
93
+
94
+ ### Usage Warnings
95
+
96
+
97
+ - **Risk of Sensitive or Controversial Outputs**: This model’s safety filtering has been significantly reduced, potentially generating sensitive, controversial, or inappropriate content. Users should exercise caution and rigorously review generated outputs.
98
+
99
+ - **Not Suitable for All Audiences**: Due to limited content filtering, the model’s outputs may be inappropriate for public settings, underage users, or applications requiring high security.
100
+
101
+ - **Legal and Ethical Responsibilities**: Users must ensure their usage complies with local laws and ethical standards. Generated content may carry legal or ethical risks, and users are solely responsible for any consequences.
102
+
103
+ - **Research and Experimental Use**: This model is recommended for research, testing, or use in controlled environments; avoid direct use in production or public-facing commercial applications.
104
+
105
+ - **Monitoring and Review Recommendations**: Users are strongly advised to monitor model outputs in real time and conduct manual reviews when necessary to prevent the dissemination of inappropriate content.
106
+
107
+ - **No Default Safety Guarantees**: Unlike standard models, this model has not undergone rigorous safety optimization. huihui.ai bears no responsibility for any consequences arising from its use.
108
+
109
+
110
+ ### Donation
111
+ ##### Your donation helps us continue developing and improving our models; even a cup of coffee makes a difference.
112
+ - bitcoin:
113
+ ```
114
+ bc1qqnkhuchxw0zqjh2ku3lu4hq45hc6gy84uk70ge
115
+ ```
116
+ - Support our work on [Ko-fi](https://ko-fi.com/huihuiai)!
117
+
LLM/Qwen3-VL-4B-Instruct-abliterated-FP8/added_tokens.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</think>": 151668,
3
+ "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
+ "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
+ "<|box_end|>": 151649,
9
+ "<|box_start|>": 151648,
10
+ "<|endoftext|>": 151643,
11
+ "<|file_sep|>": 151664,
12
+ "<|fim_middle|>": 151660,
13
+ "<|fim_pad|>": 151662,
14
+ "<|fim_prefix|>": 151659,
15
+ "<|fim_suffix|>": 151661,
16
+ "<|im_end|>": 151645,
17
+ "<|im_start|>": 151644,
18
+ "<|image_pad|>": 151655,
19
+ "<|object_ref_end|>": 151647,
20
+ "<|object_ref_start|>": 151646,
21
+ "<|quad_end|>": 151651,
22
+ "<|quad_start|>": 151650,
23
+ "<|repo_name|>": 151663,
24
+ "<|video_pad|>": 151656,
25
+ "<|vision_end|>": 151653,
26
+ "<|vision_pad|>": 151654,
27
+ "<|vision_start|>": 151652
28
+ }
LLM/Qwen3-VL-4B-Instruct-abliterated-FP8/chat_template.jinja ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {%- if messages[0].content is string %}
5
+ {{- messages[0].content }}
6
+ {%- else %}
7
+ {%- for content in messages[0].content %}
8
+ {%- if 'text' in content %}
9
+ {{- content.text }}
10
+ {%- endif %}
11
+ {%- endfor %}
12
+ {%- endif %}
13
+ {{- '\n\n' }}
14
+ {%- endif %}
15
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
16
+ {%- for tool in tools %}
17
+ {{- "\n" }}
18
+ {{- tool | tojson }}
19
+ {%- endfor %}
20
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
21
+ {%- else %}
22
+ {%- if messages[0].role == 'system' %}
23
+ {{- '<|im_start|>system\n' }}
24
+ {%- if messages[0].content is string %}
25
+ {{- messages[0].content }}
26
+ {%- else %}
27
+ {%- for content in messages[0].content %}
28
+ {%- if 'text' in content %}
29
+ {{- content.text }}
30
+ {%- endif %}
31
+ {%- endfor %}
32
+ {%- endif %}
33
+ {{- '<|im_end|>\n' }}
34
+ {%- endif %}
35
+ {%- endif %}
36
+ {%- set image_count = namespace(value=0) %}
37
+ {%- set video_count = namespace(value=0) %}
38
+ {%- for message in messages %}
39
+ {%- if message.role == "user" %}
40
+ {{- '<|im_start|>' + message.role + '\n' }}
41
+ {%- if message.content is string %}
42
+ {{- message.content }}
43
+ {%- else %}
44
+ {%- for content in message.content %}
45
+ {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
46
+ {%- set image_count.value = image_count.value + 1 %}
47
+ {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
48
+ <|vision_start|><|image_pad|><|vision_end|>
49
+ {%- elif content.type == 'video' or 'video' in content %}
50
+ {%- set video_count.value = video_count.value + 1 %}
51
+ {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
52
+ <|vision_start|><|video_pad|><|vision_end|>
53
+ {%- elif 'text' in content %}
54
+ {{- content.text }}
55
+ {%- endif %}
56
+ {%- endfor %}
57
+ {%- endif %}
58
+ {{- '<|im_end|>\n' }}
59
+ {%- elif message.role == "assistant" %}
60
+ {{- '<|im_start|>' + message.role + '\n' }}
61
+ {%- if message.content is string %}
62
+ {{- message.content }}
63
+ {%- else %}
64
+ {%- for content_item in message.content %}
65
+ {%- if 'text' in content_item %}
66
+ {{- content_item.text }}
67
+ {%- endif %}
68
+ {%- endfor %}
69
+ {%- endif %}
70
+ {%- if message.tool_calls %}
71
+ {%- for tool_call in message.tool_calls %}
72
+ {%- if (loop.first and message.content) or (not loop.first) %}
73
+ {{- '\n' }}
74
+ {%- endif %}
75
+ {%- if tool_call.function %}
76
+ {%- set tool_call = tool_call.function %}
77
+ {%- endif %}
78
+ {{- '<tool_call>\n{"name": "' }}
79
+ {{- tool_call.name }}
80
+ {{- '", "arguments": ' }}
81
+ {%- if tool_call.arguments is string %}
82
+ {{- tool_call.arguments }}
83
+ {%- else %}
84
+ {{- tool_call.arguments | tojson }}
85
+ {%- endif %}
86
+ {{- '}\n</tool_call>' }}
87
+ {%- endfor %}
88
+ {%- endif %}
89
+ {{- '<|im_end|>\n' }}
90
+ {%- elif message.role == "tool" %}
91
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
92
+ {{- '<|im_start|>user' }}
93
+ {%- endif %}
94
+ {{- '\n<tool_response>\n' }}
95
+ {%- if message.content is string %}
96
+ {{- message.content }}
97
+ {%- else %}
98
+ {%- for content in message.content %}
99
+ {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
100
+ {%- set image_count.value = image_count.value + 1 %}
101
+ {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
102
+ <|vision_start|><|image_pad|><|vision_end|>
103
+ {%- elif content.type == 'video' or 'video' in content %}
104
+ {%- set video_count.value = video_count.value + 1 %}
105
+ {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
106
+ <|vision_start|><|video_pad|><|vision_end|>
107
+ {%- elif 'text' in content %}
108
+ {{- content.text }}
109
+ {%- endif %}
110
+ {%- endfor %}
111
+ {%- endif %}
112
+ {{- '\n</tool_response>' }}
113
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
114
+ {{- '<|im_end|>\n' }}
115
+ {%- endif %}
116
+ {%- endif %}
117
+ {%- endfor %}
118
+ {%- if add_generation_prompt %}
119
+ {{- '<|im_start|>assistant\n' }}
120
+ {%- endif %}
LLM/Qwen3-VL-4B-Instruct-abliterated-FP8/config.json ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3VLForConditionalGeneration"
4
+ ],
5
+ "dtype": "bfloat16",
6
+ "image_token_id": 151655,
7
+ "model_type": "qwen3_vl",
8
+ "quantization_config": {
9
+ "activation_scheme": "dynamic",
10
+ "modules_to_not_convert": [
11
+ "lm_head",
12
+ "model.visual.merger.linear_fc1",
13
+ "model.visual.merger.linear_fc2",
14
+ "model.visual.merger.norm",
15
+ "model.visual.patch_embed.proj",
16
+ "model.visual.pos_embed",
17
+ "visual.merger.linear_fc1",
18
+ "visual.merger.linear_fc2",
19
+ "visual.merger.norm",
20
+ "visual.patch_embed.proj",
21
+ "visual.pos_embed",
22
+ "model.visual.blocks.0.attn.proj",
23
+ "model.visual.blocks.0.attn.qkv",
24
+ "model.visual.blocks.0.mlp.linear_fc1",
25
+ "model.visual.blocks.0.mlp.linear_fc2",
26
+ "visual.blocks.0.attn.proj",
27
+ "visual.blocks.0.attn.qkv_proj",
28
+ "visual.blocks.0.mlp.linear_fc1",
29
+ "visual.blocks.0.mlp.linear_fc2",
30
+ "model.visual.blocks.1.attn.proj",
31
+ "model.visual.blocks.1.attn.qkv",
32
+ "model.visual.blocks.1.mlp.linear_fc1",
33
+ "model.visual.blocks.1.mlp.linear_fc2",
34
+ "visual.blocks.1.attn.proj",
35
+ "visual.blocks.1.attn.qkv_proj",
36
+ "visual.blocks.1.mlp.linear_fc1",
37
+ "visual.blocks.1.mlp.linear_fc2",
38
+ "model.visual.blocks.2.attn.proj",
39
+ "model.visual.blocks.2.attn.qkv",
40
+ "model.visual.blocks.2.mlp.linear_fc1",
41
+ "model.visual.blocks.2.mlp.linear_fc2",
42
+ "visual.blocks.2.attn.proj",
43
+ "visual.blocks.2.attn.qkv_proj",
44
+ "visual.blocks.2.mlp.linear_fc1",
45
+ "visual.blocks.2.mlp.linear_fc2",
46
+ "model.visual.blocks.3.attn.proj",
47
+ "model.visual.blocks.3.attn.qkv",
48
+ "model.visual.blocks.3.mlp.linear_fc1",
49
+ "model.visual.blocks.3.mlp.linear_fc2",
50
+ "visual.blocks.3.attn.proj",
51
+ "visual.blocks.3.attn.qkv_proj",
52
+ "visual.blocks.3.mlp.linear_fc1",
53
+ "visual.blocks.3.mlp.linear_fc2",
54
+ "model.visual.blocks.4.attn.proj",
55
+ "model.visual.blocks.4.attn.qkv",
56
+ "model.visual.blocks.4.mlp.linear_fc1",
57
+ "model.visual.blocks.4.mlp.linear_fc2",
58
+ "visual.blocks.4.attn.proj",
59
+ "visual.blocks.4.attn.qkv_proj",
60
+ "visual.blocks.4.mlp.linear_fc1",
61
+ "visual.blocks.4.mlp.linear_fc2",
62
+ "model.visual.blocks.5.attn.proj",
63
+ "model.visual.blocks.5.attn.qkv",
64
+ "model.visual.blocks.5.mlp.linear_fc1",
65
+ "model.visual.blocks.5.mlp.linear_fc2",
66
+ "visual.blocks.5.attn.proj",
67
+ "visual.blocks.5.attn.qkv_proj",
68
+ "visual.blocks.5.mlp.linear_fc1",
69
+ "visual.blocks.5.mlp.linear_fc2",
70
+ "model.visual.blocks.6.attn.proj",
71
+ "model.visual.blocks.6.attn.qkv",
72
+ "model.visual.blocks.6.mlp.linear_fc1",
73
+ "model.visual.blocks.6.mlp.linear_fc2",
74
+ "visual.blocks.6.attn.proj",
75
+ "visual.blocks.6.attn.qkv_proj",
76
+ "visual.blocks.6.mlp.linear_fc1",
77
+ "visual.blocks.6.mlp.linear_fc2",
78
+ "model.visual.blocks.7.attn.proj",
79
+ "model.visual.blocks.7.attn.qkv",
80
+ "model.visual.blocks.7.mlp.linear_fc1",
81
+ "model.visual.blocks.7.mlp.linear_fc2",
82
+ "visual.blocks.7.attn.proj",
83
+ "visual.blocks.7.attn.qkv_proj",
84
+ "visual.blocks.7.mlp.linear_fc1",
85
+ "visual.blocks.7.mlp.linear_fc2",
86
+ "model.visual.blocks.8.attn.proj",
87
+ "model.visual.blocks.8.attn.qkv",
88
+ "model.visual.blocks.8.mlp.linear_fc1",
89
+ "model.visual.blocks.8.mlp.linear_fc2",
90
+ "visual.blocks.8.attn.proj",
91
+ "visual.blocks.8.attn.qkv_proj",
92
+ "visual.blocks.8.mlp.linear_fc1",
93
+ "visual.blocks.8.mlp.linear_fc2",
94
+ "model.visual.blocks.9.attn.proj",
95
+ "model.visual.blocks.9.attn.qkv",
96
+ "model.visual.blocks.9.mlp.linear_fc1",
97
+ "model.visual.blocks.9.mlp.linear_fc2",
98
+ "visual.blocks.9.attn.proj",
99
+ "visual.blocks.9.attn.qkv_proj",
100
+ "visual.blocks.9.mlp.linear_fc1",
101
+ "visual.blocks.9.mlp.linear_fc2",
102
+ "model.visual.blocks.10.attn.proj",
103
+ "model.visual.blocks.10.attn.qkv",
104
+ "model.visual.blocks.10.mlp.linear_fc1",
105
+ "model.visual.blocks.10.mlp.linear_fc2",
106
+ "visual.blocks.10.attn.proj",
107
+ "visual.blocks.10.attn.qkv_proj",
108
+ "visual.blocks.10.mlp.linear_fc1",
109
+ "visual.blocks.10.mlp.linear_fc2",
110
+ "model.visual.blocks.11.attn.proj",
111
+ "model.visual.blocks.11.attn.qkv",
112
+ "model.visual.blocks.11.mlp.linear_fc1",
113
+ "model.visual.blocks.11.mlp.linear_fc2",
114
+ "visual.blocks.11.attn.proj",
115
+ "visual.blocks.11.attn.qkv_proj",
116
+ "visual.blocks.11.mlp.linear_fc1",
117
+ "visual.blocks.11.mlp.linear_fc2",
118
+ "model.visual.blocks.12.attn.proj",
119
+ "model.visual.blocks.12.attn.qkv",
120
+ "model.visual.blocks.12.mlp.linear_fc1",
121
+ "model.visual.blocks.12.mlp.linear_fc2",
122
+ "visual.blocks.12.attn.proj",
123
+ "visual.blocks.12.attn.qkv_proj",
124
+ "visual.blocks.12.mlp.linear_fc1",
125
+ "visual.blocks.12.mlp.linear_fc2",
126
+ "model.visual.blocks.13.attn.proj",
127
+ "model.visual.blocks.13.attn.qkv",
128
+ "model.visual.blocks.13.mlp.linear_fc1",
129
+ "model.visual.blocks.13.mlp.linear_fc2",
130
+ "visual.blocks.13.attn.proj",
131
+ "visual.blocks.13.attn.qkv_proj",
132
+ "visual.blocks.13.mlp.linear_fc1",
133
+ "visual.blocks.13.mlp.linear_fc2",
134
+ "model.visual.blocks.14.attn.proj",
135
+ "model.visual.blocks.14.attn.qkv",
136
+ "model.visual.blocks.14.mlp.linear_fc1",
137
+ "model.visual.blocks.14.mlp.linear_fc2",
138
+ "visual.blocks.14.attn.proj",
139
+ "visual.blocks.14.attn.qkv_proj",
140
+ "visual.blocks.14.mlp.linear_fc1",
141
+ "visual.blocks.14.mlp.linear_fc2",
142
+ "model.visual.blocks.15.attn.proj",
143
+ "model.visual.blocks.15.attn.qkv",
144
+ "model.visual.blocks.15.mlp.linear_fc1",
145
+ "model.visual.blocks.15.mlp.linear_fc2",
146
+ "visual.blocks.15.attn.proj",
147
+ "visual.blocks.15.attn.qkv_proj",
148
+ "visual.blocks.15.mlp.linear_fc1",
149
+ "visual.blocks.15.mlp.linear_fc2",
150
+ "model.visual.blocks.16.attn.proj",
151
+ "model.visual.blocks.16.attn.qkv",
152
+ "model.visual.blocks.16.mlp.linear_fc1",
153
+ "model.visual.blocks.16.mlp.linear_fc2",
154
+ "visual.blocks.16.attn.proj",
155
+ "visual.blocks.16.attn.qkv_proj",
156
+ "visual.blocks.16.mlp.linear_fc1",
157
+ "visual.blocks.16.mlp.linear_fc2",
158
+ "model.visual.blocks.17.attn.proj",
159
+ "model.visual.blocks.17.attn.qkv",
160
+ "model.visual.blocks.17.mlp.linear_fc1",
161
+ "model.visual.blocks.17.mlp.linear_fc2",
162
+ "visual.blocks.17.attn.proj",
163
+ "visual.blocks.17.attn.qkv_proj",
164
+ "visual.blocks.17.mlp.linear_fc1",
165
+ "visual.blocks.17.mlp.linear_fc2",
166
+ "model.visual.blocks.18.attn.proj",
167
+ "model.visual.blocks.18.attn.qkv",
168
+ "model.visual.blocks.18.mlp.linear_fc1",
169
+ "model.visual.blocks.18.mlp.linear_fc2",
170
+ "visual.blocks.18.attn.proj",
171
+ "visual.blocks.18.attn.qkv_proj",
172
+ "visual.blocks.18.mlp.linear_fc1",
173
+ "visual.blocks.18.mlp.linear_fc2",
174
+ "model.visual.blocks.19.attn.proj",
175
+ "model.visual.blocks.19.attn.qkv",
176
+ "model.visual.blocks.19.mlp.linear_fc1",
177
+ "model.visual.blocks.19.mlp.linear_fc2",
178
+ "visual.blocks.19.attn.proj",
179
+ "visual.blocks.19.attn.qkv_proj",
180
+ "visual.blocks.19.mlp.linear_fc1",
181
+ "visual.blocks.19.mlp.linear_fc2",
182
+ "model.visual.blocks.20.attn.proj",
183
+ "model.visual.blocks.20.attn.qkv",
184
+ "model.visual.blocks.20.mlp.linear_fc1",
185
+ "model.visual.blocks.20.mlp.linear_fc2",
186
+ "visual.blocks.20.attn.proj",
187
+ "visual.blocks.20.attn.qkv_proj",
188
+ "visual.blocks.20.mlp.linear_fc1",
189
+ "visual.blocks.20.mlp.linear_fc2",
190
+ "model.visual.blocks.21.attn.proj",
191
+ "model.visual.blocks.21.attn.qkv",
192
+ "model.visual.blocks.21.mlp.linear_fc1",
193
+ "model.visual.blocks.21.mlp.linear_fc2",
194
+ "visual.blocks.21.attn.proj",
195
+ "visual.blocks.21.attn.qkv_proj",
196
+ "visual.blocks.21.mlp.linear_fc1",
197
+ "visual.blocks.21.mlp.linear_fc2",
198
+ "model.visual.blocks.22.attn.proj",
199
+ "model.visual.blocks.22.attn.qkv",
200
+ "model.visual.blocks.22.mlp.linear_fc1",
201
+ "model.visual.blocks.22.mlp.linear_fc2",
202
+ "visual.blocks.22.attn.proj",
203
+ "visual.blocks.22.attn.qkv_proj",
204
+ "visual.blocks.22.mlp.linear_fc1",
205
+ "visual.blocks.22.mlp.linear_fc2",
206
+ "model.visual.blocks.23.attn.proj",
207
+ "model.visual.blocks.23.attn.qkv",
208
+ "model.visual.blocks.23.mlp.linear_fc1",
209
+ "model.visual.blocks.23.mlp.linear_fc2",
210
+ "visual.blocks.23.attn.proj",
211
+ "visual.blocks.23.attn.qkv_proj",
212
+ "visual.blocks.23.mlp.linear_fc1",
213
+ "visual.blocks.23.mlp.linear_fc2",
214
+ "model.visual.blocks.24.attn.proj",
215
+ "model.visual.blocks.24.attn.qkv",
216
+ "model.visual.blocks.24.mlp.linear_fc1",
217
+ "model.visual.blocks.24.mlp.linear_fc2",
218
+ "visual.blocks.24.attn.proj",
219
+ "visual.blocks.24.attn.qkv_proj",
220
+ "visual.blocks.24.mlp.linear_fc1",
221
+ "visual.blocks.24.mlp.linear_fc2",
222
+ "model.visual.blocks.25.attn.proj",
223
+ "model.visual.blocks.25.attn.qkv",
224
+ "model.visual.blocks.25.mlp.linear_fc1",
225
+ "model.visual.blocks.25.mlp.linear_fc2",
226
+ "visual.blocks.25.attn.proj",
227
+ "visual.blocks.25.attn.qkv_proj",
228
+ "visual.blocks.25.mlp.linear_fc1",
229
+ "visual.blocks.25.mlp.linear_fc2",
230
+ "model.visual.blocks.26.attn.proj",
231
+ "model.visual.blocks.26.attn.qkv",
232
+ "model.visual.blocks.26.mlp.linear_fc1",
233
+ "model.visual.blocks.26.mlp.linear_fc2",
234
+ "visual.blocks.26.attn.proj",
235
+ "visual.blocks.26.attn.qkv_proj",
236
+ "visual.blocks.26.mlp.linear_fc1",
237
+ "visual.blocks.26.mlp.linear_fc2",
238
+ "model.visual.deepstack_merger_list.0.linear_fc1",
239
+ "model.visual.deepstack_merger_list.0.linear_fc2",
240
+ "model.visual.deepstack_merger_list.0.norm",
241
+ "visual.deepstack_merger_list.0.linear_fc1",
242
+ "visual.deepstack_merger_list.0.linear_fc2",
243
+ "visual.deepstack_merger_list.0.norm",
244
+ "model.visual.deepstack_merger_list.1.linear_fc1",
245
+ "model.visual.deepstack_merger_list.1.linear_fc2",
246
+ "model.visual.deepstack_merger_list.1.norm",
247
+ "visual.deepstack_merger_list.1.linear_fc1",
248
+ "visual.deepstack_merger_list.1.linear_fc2",
249
+ "visual.deepstack_merger_list.1.norm",
250
+ "model.visual.deepstack_merger_list.2.linear_fc1",
251
+ "model.visual.deepstack_merger_list.2.linear_fc2",
252
+ "model.visual.deepstack_merger_list.2.norm",
253
+ "visual.deepstack_merger_list.2.linear_fc1",
254
+ "visual.deepstack_merger_list.2.linear_fc2",
255
+ "visual.deepstack_merger_list.2.norm"
256
+ ],
257
+ "quant_method": "fp8",
258
+ "weight_block_size": [
259
+ 128,
260
+ 128
261
+ ]
262
+ },
263
+ "text_config": {
264
+ "attention_bias": false,
265
+ "attention_dropout": 0.0,
266
+ "bos_token_id": 151643,
267
+ "dtype": "bfloat16",
268
+ "eos_token_id": 151645,
269
+ "head_dim": 128,
270
+ "hidden_act": "silu",
271
+ "hidden_size": 2560,
272
+ "initializer_range": 0.02,
273
+ "intermediate_size": 9728,
274
+ "max_position_embeddings": 262144,
275
+ "model_type": "qwen3_vl_text",
276
+ "num_attention_heads": 32,
277
+ "num_hidden_layers": 36,
278
+ "num_key_value_heads": 8,
279
+ "rms_norm_eps": 1e-06,
280
+ "rope_scaling": {
281
+ "mrope_interleaved": true,
282
+ "mrope_section": [
283
+ 24,
284
+ 20,
285
+ 20
286
+ ],
287
+ "rope_type": "default"
288
+ },
289
+ "rope_theta": 5000000,
290
+ "tie_word_embeddings": true,
291
+ "use_cache": true,
292
+ "vocab_size": 151936
293
+ },
294
+ "tie_word_embeddings": true,
295
+ "transformers_version": "4.57.0.dev0",
296
+ "video_token_id": 151656,
297
+ "vision_config": {
298
+ "deepstack_visual_indexes": [
299
+ 5,
300
+ 11,
301
+ 17
302
+ ],
303
+ "depth": 24,
304
+ "dtype": "bfloat16",
305
+ "hidden_act": "gelu_pytorch_tanh",
306
+ "hidden_size": 1024,
307
+ "in_channels": 3,
308
+ "initializer_range": 0.02,
309
+ "intermediate_size": 4096,
310
+ "model_type": "qwen3_vl",
311
+ "num_heads": 16,
312
+ "num_position_embeddings": 2304,
313
+ "out_hidden_size": 2560,
314
+ "patch_size": 16,
315
+ "spatial_merge_size": 2,
316
+ "temporal_patch_size": 2
317
+ },
318
+ "vision_end_token_id": 151653,
319
+ "vision_start_token_id": 151652
320
+ }
LLM/Qwen3-VL-4B-Instruct-abliterated-FP8/generation_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "temperature": 0.7,
10
+ "top_k": 20,
11
+ "top_p": 0.8,
12
+ "transformers_version": "4.57.0.dev0"
13
+ }
LLM/Qwen3-VL-4B-Instruct-abliterated-FP8/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
LLM/Qwen3-VL-4B-Instruct-abliterated-FP8/model.safetensors.index.json ADDED
@@ -0,0 +1,973 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 4438037568,
4
+ "total_size": 5242759296
5
+ },
6
+ "weight_map": {
7
+ "model.language_model.embed_tokens.weight": "model-00001-of-00002.safetensors",
8
+ "model.language_model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
9
+ "model.language_model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
10
+ "model.language_model.layers.0.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
11
+ "model.language_model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
12
+ "model.language_model.layers.0.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
13
+ "model.language_model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
14
+ "model.language_model.layers.0.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
15
+ "model.language_model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
16
+ "model.language_model.layers.0.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
17
+ "model.language_model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
18
+ "model.language_model.layers.0.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
19
+ "model.language_model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
20
+ "model.language_model.layers.0.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
21
+ "model.language_model.layers.0.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
22
+ "model.language_model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
23
+ "model.language_model.layers.0.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
24
+ "model.language_model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
25
+ "model.language_model.layers.0.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
26
+ "model.language_model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
27
+ "model.language_model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
28
+ "model.language_model.layers.1.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
29
+ "model.language_model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
30
+ "model.language_model.layers.1.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
31
+ "model.language_model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
32
+ "model.language_model.layers.1.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
33
+ "model.language_model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
34
+ "model.language_model.layers.1.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
35
+ "model.language_model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
36
+ "model.language_model.layers.1.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
37
+ "model.language_model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
38
+ "model.language_model.layers.1.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
39
+ "model.language_model.layers.1.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
40
+ "model.language_model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
41
+ "model.language_model.layers.1.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
42
+ "model.language_model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
43
+ "model.language_model.layers.1.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
44
+ "model.language_model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
45
+ "model.language_model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
46
+ "model.language_model.layers.10.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
47
+ "model.language_model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
48
+ "model.language_model.layers.10.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
49
+ "model.language_model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
50
+ "model.language_model.layers.10.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
51
+ "model.language_model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
52
+ "model.language_model.layers.10.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
53
+ "model.language_model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
54
+ "model.language_model.layers.10.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
55
+ "model.language_model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
56
+ "model.language_model.layers.10.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
57
+ "model.language_model.layers.10.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
58
+ "model.language_model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
59
+ "model.language_model.layers.10.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
60
+ "model.language_model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
61
+ "model.language_model.layers.10.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
62
+ "model.language_model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
63
+ "model.language_model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
64
+ "model.language_model.layers.11.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
65
+ "model.language_model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
66
+ "model.language_model.layers.11.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
67
+ "model.language_model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
68
+ "model.language_model.layers.11.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
69
+ "model.language_model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
70
+ "model.language_model.layers.11.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
71
+ "model.language_model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
72
+ "model.language_model.layers.11.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
73
+ "model.language_model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
74
+ "model.language_model.layers.11.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
75
+ "model.language_model.layers.11.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
76
+ "model.language_model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
77
+ "model.language_model.layers.11.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
78
+ "model.language_model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
79
+ "model.language_model.layers.11.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
80
+ "model.language_model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
81
+ "model.language_model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
82
+ "model.language_model.layers.12.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
83
+ "model.language_model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
84
+ "model.language_model.layers.12.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
85
+ "model.language_model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
86
+ "model.language_model.layers.12.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
87
+ "model.language_model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
88
+ "model.language_model.layers.12.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
89
+ "model.language_model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
90
+ "model.language_model.layers.12.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
91
+ "model.language_model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
92
+ "model.language_model.layers.12.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
93
+ "model.language_model.layers.12.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
94
+ "model.language_model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
95
+ "model.language_model.layers.12.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
96
+ "model.language_model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
97
+ "model.language_model.layers.12.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
98
+ "model.language_model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
99
+ "model.language_model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
100
+ "model.language_model.layers.13.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
101
+ "model.language_model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
102
+ "model.language_model.layers.13.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
103
+ "model.language_model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
104
+ "model.language_model.layers.13.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
105
+ "model.language_model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
106
+ "model.language_model.layers.13.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
107
+ "model.language_model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
108
+ "model.language_model.layers.13.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
109
+ "model.language_model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
110
+ "model.language_model.layers.13.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
111
+ "model.language_model.layers.13.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
112
+ "model.language_model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
113
+ "model.language_model.layers.13.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
114
+ "model.language_model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
115
+ "model.language_model.layers.13.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
116
+ "model.language_model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
117
+ "model.language_model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
118
+ "model.language_model.layers.14.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
119
+ "model.language_model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
120
+ "model.language_model.layers.14.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
121
+ "model.language_model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
122
+ "model.language_model.layers.14.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
123
+ "model.language_model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
124
+ "model.language_model.layers.14.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
125
+ "model.language_model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
126
+ "model.language_model.layers.14.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
127
+ "model.language_model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
128
+ "model.language_model.layers.14.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
129
+ "model.language_model.layers.14.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
130
+ "model.language_model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
131
+ "model.language_model.layers.14.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
132
+ "model.language_model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
133
+ "model.language_model.layers.14.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
134
+ "model.language_model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
135
+ "model.language_model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
136
+ "model.language_model.layers.15.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
137
+ "model.language_model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
138
+ "model.language_model.layers.15.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
139
+ "model.language_model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
140
+ "model.language_model.layers.15.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
141
+ "model.language_model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
142
+ "model.language_model.layers.15.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
143
+ "model.language_model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
144
+ "model.language_model.layers.15.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
145
+ "model.language_model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
146
+ "model.language_model.layers.15.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
147
+ "model.language_model.layers.15.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
148
+ "model.language_model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
149
+ "model.language_model.layers.15.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
150
+ "model.language_model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
151
+ "model.language_model.layers.15.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
152
+ "model.language_model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
153
+ "model.language_model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
154
+ "model.language_model.layers.16.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
155
+ "model.language_model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
156
+ "model.language_model.layers.16.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
157
+ "model.language_model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
158
+ "model.language_model.layers.16.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
159
+ "model.language_model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
160
+ "model.language_model.layers.16.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
161
+ "model.language_model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
162
+ "model.language_model.layers.16.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
163
+ "model.language_model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
164
+ "model.language_model.layers.16.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
165
+ "model.language_model.layers.16.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
166
+ "model.language_model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
167
+ "model.language_model.layers.16.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
168
+ "model.language_model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
169
+ "model.language_model.layers.16.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
170
+ "model.language_model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
171
+ "model.language_model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
172
+ "model.language_model.layers.17.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
173
+ "model.language_model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
174
+ "model.language_model.layers.17.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
175
+ "model.language_model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
176
+ "model.language_model.layers.17.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
177
+ "model.language_model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
178
+ "model.language_model.layers.17.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
179
+ "model.language_model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
180
+ "model.language_model.layers.17.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
181
+ "model.language_model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
182
+ "model.language_model.layers.17.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
183
+ "model.language_model.layers.17.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
184
+ "model.language_model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
185
+ "model.language_model.layers.17.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
186
+ "model.language_model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
187
+ "model.language_model.layers.17.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
188
+ "model.language_model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
189
+ "model.language_model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
190
+ "model.language_model.layers.18.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
191
+ "model.language_model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
192
+ "model.language_model.layers.18.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
193
+ "model.language_model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
194
+ "model.language_model.layers.18.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
195
+ "model.language_model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
196
+ "model.language_model.layers.18.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
197
+ "model.language_model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
198
+ "model.language_model.layers.18.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
199
+ "model.language_model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
200
+ "model.language_model.layers.18.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
201
+ "model.language_model.layers.18.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
202
+ "model.language_model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
203
+ "model.language_model.layers.18.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
204
+ "model.language_model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
205
+ "model.language_model.layers.18.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
206
+ "model.language_model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
207
+ "model.language_model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
208
+ "model.language_model.layers.19.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
209
+ "model.language_model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
210
+ "model.language_model.layers.19.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
211
+ "model.language_model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
212
+ "model.language_model.layers.19.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
213
+ "model.language_model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
214
+ "model.language_model.layers.19.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
215
+ "model.language_model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
216
+ "model.language_model.layers.19.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
217
+ "model.language_model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
218
+ "model.language_model.layers.19.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
219
+ "model.language_model.layers.19.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
220
+ "model.language_model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
221
+ "model.language_model.layers.19.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
222
+ "model.language_model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
223
+ "model.language_model.layers.19.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
224
+ "model.language_model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
225
+ "model.language_model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
226
+ "model.language_model.layers.2.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
227
+ "model.language_model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
228
+ "model.language_model.layers.2.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
229
+ "model.language_model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
230
+ "model.language_model.layers.2.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
231
+ "model.language_model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
232
+ "model.language_model.layers.2.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
233
+ "model.language_model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
234
+ "model.language_model.layers.2.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
235
+ "model.language_model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
236
+ "model.language_model.layers.2.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
237
+ "model.language_model.layers.2.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
238
+ "model.language_model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
239
+ "model.language_model.layers.2.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
240
+ "model.language_model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
241
+ "model.language_model.layers.2.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
242
+ "model.language_model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
243
+ "model.language_model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
244
+ "model.language_model.layers.20.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
245
+ "model.language_model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
246
+ "model.language_model.layers.20.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
247
+ "model.language_model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
248
+ "model.language_model.layers.20.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
249
+ "model.language_model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
250
+ "model.language_model.layers.20.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
251
+ "model.language_model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
252
+ "model.language_model.layers.20.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
253
+ "model.language_model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
254
+ "model.language_model.layers.20.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
255
+ "model.language_model.layers.20.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
256
+ "model.language_model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
257
+ "model.language_model.layers.20.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
258
+ "model.language_model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
259
+ "model.language_model.layers.20.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
260
+ "model.language_model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors",
261
+ "model.language_model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
262
+ "model.language_model.layers.21.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
263
+ "model.language_model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
264
+ "model.language_model.layers.21.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
265
+ "model.language_model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
266
+ "model.language_model.layers.21.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
267
+ "model.language_model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
268
+ "model.language_model.layers.21.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
269
+ "model.language_model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
270
+ "model.language_model.layers.21.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
271
+ "model.language_model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
272
+ "model.language_model.layers.21.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
273
+ "model.language_model.layers.21.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
274
+ "model.language_model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
275
+ "model.language_model.layers.21.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
276
+ "model.language_model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
277
+ "model.language_model.layers.21.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
278
+ "model.language_model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors",
279
+ "model.language_model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
280
+ "model.language_model.layers.22.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
281
+ "model.language_model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
282
+ "model.language_model.layers.22.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
283
+ "model.language_model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
284
+ "model.language_model.layers.22.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
285
+ "model.language_model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
286
+ "model.language_model.layers.22.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
287
+ "model.language_model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
288
+ "model.language_model.layers.22.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
289
+ "model.language_model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
290
+ "model.language_model.layers.22.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
291
+ "model.language_model.layers.22.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
292
+ "model.language_model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
293
+ "model.language_model.layers.22.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
294
+ "model.language_model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
295
+ "model.language_model.layers.22.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
296
+ "model.language_model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors",
297
+ "model.language_model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
298
+ "model.language_model.layers.23.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
299
+ "model.language_model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
300
+ "model.language_model.layers.23.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
301
+ "model.language_model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
302
+ "model.language_model.layers.23.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
303
+ "model.language_model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
304
+ "model.language_model.layers.23.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
305
+ "model.language_model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
306
+ "model.language_model.layers.23.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
307
+ "model.language_model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
308
+ "model.language_model.layers.23.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
309
+ "model.language_model.layers.23.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
310
+ "model.language_model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
311
+ "model.language_model.layers.23.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
312
+ "model.language_model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
313
+ "model.language_model.layers.23.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
314
+ "model.language_model.layers.24.input_layernorm.weight": "model-00001-of-00002.safetensors",
315
+ "model.language_model.layers.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
316
+ "model.language_model.layers.24.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
317
+ "model.language_model.layers.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
318
+ "model.language_model.layers.24.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
319
+ "model.language_model.layers.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
320
+ "model.language_model.layers.24.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
321
+ "model.language_model.layers.24.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
322
+ "model.language_model.layers.24.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
323
+ "model.language_model.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
324
+ "model.language_model.layers.24.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
325
+ "model.language_model.layers.24.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
326
+ "model.language_model.layers.24.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
327
+ "model.language_model.layers.24.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
328
+ "model.language_model.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
329
+ "model.language_model.layers.24.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
330
+ "model.language_model.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
331
+ "model.language_model.layers.24.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
332
+ "model.language_model.layers.25.input_layernorm.weight": "model-00001-of-00002.safetensors",
333
+ "model.language_model.layers.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
334
+ "model.language_model.layers.25.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
335
+ "model.language_model.layers.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
336
+ "model.language_model.layers.25.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
337
+ "model.language_model.layers.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
338
+ "model.language_model.layers.25.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
339
+ "model.language_model.layers.25.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
340
+ "model.language_model.layers.25.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
341
+ "model.language_model.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
342
+ "model.language_model.layers.25.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
343
+ "model.language_model.layers.25.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
344
+ "model.language_model.layers.25.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
345
+ "model.language_model.layers.25.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
346
+ "model.language_model.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
347
+ "model.language_model.layers.25.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
348
+ "model.language_model.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
349
+ "model.language_model.layers.25.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
350
+ "model.language_model.layers.26.input_layernorm.weight": "model-00001-of-00002.safetensors",
351
+ "model.language_model.layers.26.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
352
+ "model.language_model.layers.26.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
353
+ "model.language_model.layers.26.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
354
+ "model.language_model.layers.26.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
355
+ "model.language_model.layers.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
356
+ "model.language_model.layers.26.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
357
+ "model.language_model.layers.26.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
358
+ "model.language_model.layers.26.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
359
+ "model.language_model.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
360
+ "model.language_model.layers.26.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
361
+ "model.language_model.layers.26.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
362
+ "model.language_model.layers.26.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
363
+ "model.language_model.layers.26.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
364
+ "model.language_model.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
365
+ "model.language_model.layers.26.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
366
+ "model.language_model.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
367
+ "model.language_model.layers.26.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
368
+ "model.language_model.layers.27.input_layernorm.weight": "model-00001-of-00002.safetensors",
369
+ "model.language_model.layers.27.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
370
+ "model.language_model.layers.27.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
371
+ "model.language_model.layers.27.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
372
+ "model.language_model.layers.27.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
373
+ "model.language_model.layers.27.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
374
+ "model.language_model.layers.27.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
375
+ "model.language_model.layers.27.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
376
+ "model.language_model.layers.27.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
377
+ "model.language_model.layers.27.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
378
+ "model.language_model.layers.27.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
379
+ "model.language_model.layers.27.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
380
+ "model.language_model.layers.27.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
381
+ "model.language_model.layers.27.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
382
+ "model.language_model.layers.27.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
383
+ "model.language_model.layers.27.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
384
+ "model.language_model.layers.27.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
385
+ "model.language_model.layers.27.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
386
+ "model.language_model.layers.28.input_layernorm.weight": "model-00001-of-00002.safetensors",
387
+ "model.language_model.layers.28.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
388
+ "model.language_model.layers.28.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
389
+ "model.language_model.layers.28.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
390
+ "model.language_model.layers.28.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
391
+ "model.language_model.layers.28.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
392
+ "model.language_model.layers.28.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
393
+ "model.language_model.layers.28.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
394
+ "model.language_model.layers.28.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
395
+ "model.language_model.layers.28.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
396
+ "model.language_model.layers.28.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
397
+ "model.language_model.layers.28.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
398
+ "model.language_model.layers.28.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
399
+ "model.language_model.layers.28.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
400
+ "model.language_model.layers.28.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
401
+ "model.language_model.layers.28.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
402
+ "model.language_model.layers.28.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
403
+ "model.language_model.layers.28.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
404
+ "model.language_model.layers.29.input_layernorm.weight": "model-00001-of-00002.safetensors",
405
+ "model.language_model.layers.29.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
406
+ "model.language_model.layers.29.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
407
+ "model.language_model.layers.29.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
408
+ "model.language_model.layers.29.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
409
+ "model.language_model.layers.29.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
410
+ "model.language_model.layers.29.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
411
+ "model.language_model.layers.29.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
412
+ "model.language_model.layers.29.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
413
+ "model.language_model.layers.29.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
414
+ "model.language_model.layers.29.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
415
+ "model.language_model.layers.29.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
416
+ "model.language_model.layers.29.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
417
+ "model.language_model.layers.29.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
418
+ "model.language_model.layers.29.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
419
+ "model.language_model.layers.29.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
420
+ "model.language_model.layers.29.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
421
+ "model.language_model.layers.29.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
422
+ "model.language_model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
423
+ "model.language_model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
424
+ "model.language_model.layers.3.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
425
+ "model.language_model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
426
+ "model.language_model.layers.3.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
427
+ "model.language_model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
428
+ "model.language_model.layers.3.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
429
+ "model.language_model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
430
+ "model.language_model.layers.3.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
431
+ "model.language_model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
432
+ "model.language_model.layers.3.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
433
+ "model.language_model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
434
+ "model.language_model.layers.3.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
435
+ "model.language_model.layers.3.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
436
+ "model.language_model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
437
+ "model.language_model.layers.3.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
438
+ "model.language_model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
439
+ "model.language_model.layers.3.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
440
+ "model.language_model.layers.30.input_layernorm.weight": "model-00001-of-00002.safetensors",
441
+ "model.language_model.layers.30.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
442
+ "model.language_model.layers.30.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
443
+ "model.language_model.layers.30.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
444
+ "model.language_model.layers.30.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
445
+ "model.language_model.layers.30.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
446
+ "model.language_model.layers.30.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
447
+ "model.language_model.layers.30.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
448
+ "model.language_model.layers.30.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
449
+ "model.language_model.layers.30.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
450
+ "model.language_model.layers.30.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
451
+ "model.language_model.layers.30.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
452
+ "model.language_model.layers.30.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
453
+ "model.language_model.layers.30.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
454
+ "model.language_model.layers.30.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
455
+ "model.language_model.layers.30.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
456
+ "model.language_model.layers.30.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
457
+ "model.language_model.layers.30.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
458
+ "model.language_model.layers.31.input_layernorm.weight": "model-00001-of-00002.safetensors",
459
+ "model.language_model.layers.31.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
460
+ "model.language_model.layers.31.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
461
+ "model.language_model.layers.31.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
462
+ "model.language_model.layers.31.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
463
+ "model.language_model.layers.31.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
464
+ "model.language_model.layers.31.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
465
+ "model.language_model.layers.31.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
466
+ "model.language_model.layers.31.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
467
+ "model.language_model.layers.31.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
468
+ "model.language_model.layers.31.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
469
+ "model.language_model.layers.31.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
470
+ "model.language_model.layers.31.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
471
+ "model.language_model.layers.31.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
472
+ "model.language_model.layers.31.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
473
+ "model.language_model.layers.31.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
474
+ "model.language_model.layers.31.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
475
+ "model.language_model.layers.31.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
476
+ "model.language_model.layers.32.input_layernorm.weight": "model-00001-of-00002.safetensors",
477
+ "model.language_model.layers.32.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
478
+ "model.language_model.layers.32.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
479
+ "model.language_model.layers.32.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
480
+ "model.language_model.layers.32.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
481
+ "model.language_model.layers.32.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
482
+ "model.language_model.layers.32.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
483
+ "model.language_model.layers.32.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
484
+ "model.language_model.layers.32.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
485
+ "model.language_model.layers.32.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
486
+ "model.language_model.layers.32.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
487
+ "model.language_model.layers.32.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
488
+ "model.language_model.layers.32.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
489
+ "model.language_model.layers.32.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
490
+ "model.language_model.layers.32.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
491
+ "model.language_model.layers.32.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
492
+ "model.language_model.layers.32.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
493
+ "model.language_model.layers.32.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
494
+ "model.language_model.layers.33.input_layernorm.weight": "model-00002-of-00002.safetensors",
495
+ "model.language_model.layers.33.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
496
+ "model.language_model.layers.33.mlp.down_proj.weight_scale_inv": "model-00002-of-00002.safetensors",
497
+ "model.language_model.layers.33.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
498
+ "model.language_model.layers.33.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
499
+ "model.language_model.layers.33.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
500
+ "model.language_model.layers.33.mlp.up_proj.weight_scale_inv": "model-00002-of-00002.safetensors",
501
+ "model.language_model.layers.33.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
502
+ "model.language_model.layers.33.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
503
+ "model.language_model.layers.33.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
504
+ "model.language_model.layers.33.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
505
+ "model.language_model.layers.33.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
506
+ "model.language_model.layers.33.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
507
+ "model.language_model.layers.33.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
508
+ "model.language_model.layers.33.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
509
+ "model.language_model.layers.33.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
510
+ "model.language_model.layers.33.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
511
+ "model.language_model.layers.33.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
512
+ "model.language_model.layers.34.input_layernorm.weight": "model-00002-of-00002.safetensors",
513
+ "model.language_model.layers.34.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
514
+ "model.language_model.layers.34.mlp.down_proj.weight_scale_inv": "model-00002-of-00002.safetensors",
515
+ "model.language_model.layers.34.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
516
+ "model.language_model.layers.34.mlp.gate_proj.weight_scale_inv": "model-00002-of-00002.safetensors",
517
+ "model.language_model.layers.34.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
518
+ "model.language_model.layers.34.mlp.up_proj.weight_scale_inv": "model-00002-of-00002.safetensors",
519
+ "model.language_model.layers.34.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
520
+ "model.language_model.layers.34.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
521
+ "model.language_model.layers.34.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
522
+ "model.language_model.layers.34.self_attn.k_proj.weight_scale_inv": "model-00002-of-00002.safetensors",
523
+ "model.language_model.layers.34.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
524
+ "model.language_model.layers.34.self_attn.o_proj.weight_scale_inv": "model-00002-of-00002.safetensors",
525
+ "model.language_model.layers.34.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
526
+ "model.language_model.layers.34.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
527
+ "model.language_model.layers.34.self_attn.q_proj.weight_scale_inv": "model-00002-of-00002.safetensors",
528
+ "model.language_model.layers.34.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
529
+ "model.language_model.layers.34.self_attn.v_proj.weight_scale_inv": "model-00002-of-00002.safetensors",
530
+ "model.language_model.layers.35.input_layernorm.weight": "model-00002-of-00002.safetensors",
531
+ "model.language_model.layers.35.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
532
+ "model.language_model.layers.35.mlp.down_proj.weight_scale_inv": "model-00002-of-00002.safetensors",
533
+ "model.language_model.layers.35.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
534
+ "model.language_model.layers.35.mlp.gate_proj.weight_scale_inv": "model-00002-of-00002.safetensors",
535
+ "model.language_model.layers.35.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
536
+ "model.language_model.layers.35.mlp.up_proj.weight_scale_inv": "model-00002-of-00002.safetensors",
537
+ "model.language_model.layers.35.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
538
+ "model.language_model.layers.35.self_attn.k_norm.weight": "model-00002-of-00002.safetensors",
539
+ "model.language_model.layers.35.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
540
+ "model.language_model.layers.35.self_attn.k_proj.weight_scale_inv": "model-00002-of-00002.safetensors",
541
+ "model.language_model.layers.35.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
542
+ "model.language_model.layers.35.self_attn.o_proj.weight_scale_inv": "model-00002-of-00002.safetensors",
543
+ "model.language_model.layers.35.self_attn.q_norm.weight": "model-00002-of-00002.safetensors",
544
+ "model.language_model.layers.35.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
545
+ "model.language_model.layers.35.self_attn.q_proj.weight_scale_inv": "model-00002-of-00002.safetensors",
546
+ "model.language_model.layers.35.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
547
+ "model.language_model.layers.35.self_attn.v_proj.weight_scale_inv": "model-00002-of-00002.safetensors",
548
+ "model.language_model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
549
+ "model.language_model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
550
+ "model.language_model.layers.4.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
551
+ "model.language_model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
552
+ "model.language_model.layers.4.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
553
+ "model.language_model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
554
+ "model.language_model.layers.4.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
555
+ "model.language_model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
556
+ "model.language_model.layers.4.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
557
+ "model.language_model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
558
+ "model.language_model.layers.4.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
559
+ "model.language_model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
560
+ "model.language_model.layers.4.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
561
+ "model.language_model.layers.4.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
562
+ "model.language_model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
563
+ "model.language_model.layers.4.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
564
+ "model.language_model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
565
+ "model.language_model.layers.4.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
566
+ "model.language_model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
567
+ "model.language_model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
568
+ "model.language_model.layers.5.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
569
+ "model.language_model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
570
+ "model.language_model.layers.5.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
571
+ "model.language_model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
572
+ "model.language_model.layers.5.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
573
+ "model.language_model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
574
+ "model.language_model.layers.5.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
575
+ "model.language_model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
576
+ "model.language_model.layers.5.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
577
+ "model.language_model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
578
+ "model.language_model.layers.5.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
579
+ "model.language_model.layers.5.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
580
+ "model.language_model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
581
+ "model.language_model.layers.5.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
582
+ "model.language_model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
583
+ "model.language_model.layers.5.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
584
+ "model.language_model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
585
+ "model.language_model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
586
+ "model.language_model.layers.6.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
587
+ "model.language_model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
588
+ "model.language_model.layers.6.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
589
+ "model.language_model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
590
+ "model.language_model.layers.6.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
591
+ "model.language_model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
592
+ "model.language_model.layers.6.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
593
+ "model.language_model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
594
+ "model.language_model.layers.6.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
595
+ "model.language_model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
596
+ "model.language_model.layers.6.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
597
+ "model.language_model.layers.6.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
598
+ "model.language_model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
599
+ "model.language_model.layers.6.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
600
+ "model.language_model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
601
+ "model.language_model.layers.6.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
602
+ "model.language_model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
603
+ "model.language_model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
604
+ "model.language_model.layers.7.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
605
+ "model.language_model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
606
+ "model.language_model.layers.7.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
607
+ "model.language_model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
608
+ "model.language_model.layers.7.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
609
+ "model.language_model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
610
+ "model.language_model.layers.7.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
611
+ "model.language_model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
612
+ "model.language_model.layers.7.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
613
+ "model.language_model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
614
+ "model.language_model.layers.7.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
615
+ "model.language_model.layers.7.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
616
+ "model.language_model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
617
+ "model.language_model.layers.7.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
618
+ "model.language_model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
619
+ "model.language_model.layers.7.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
620
+ "model.language_model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
621
+ "model.language_model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
622
+ "model.language_model.layers.8.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
623
+ "model.language_model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
624
+ "model.language_model.layers.8.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
625
+ "model.language_model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
626
+ "model.language_model.layers.8.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
627
+ "model.language_model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
628
+ "model.language_model.layers.8.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
629
+ "model.language_model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
630
+ "model.language_model.layers.8.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
631
+ "model.language_model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
632
+ "model.language_model.layers.8.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
633
+ "model.language_model.layers.8.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
634
+ "model.language_model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
635
+ "model.language_model.layers.8.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
636
+ "model.language_model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
637
+ "model.language_model.layers.8.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
638
+ "model.language_model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
639
+ "model.language_model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
640
+ "model.language_model.layers.9.mlp.down_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
641
+ "model.language_model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
642
+ "model.language_model.layers.9.mlp.gate_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
643
+ "model.language_model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
644
+ "model.language_model.layers.9.mlp.up_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
645
+ "model.language_model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
646
+ "model.language_model.layers.9.self_attn.k_norm.weight": "model-00001-of-00002.safetensors",
647
+ "model.language_model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
648
+ "model.language_model.layers.9.self_attn.k_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
649
+ "model.language_model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
650
+ "model.language_model.layers.9.self_attn.o_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
651
+ "model.language_model.layers.9.self_attn.q_norm.weight": "model-00001-of-00002.safetensors",
652
+ "model.language_model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
653
+ "model.language_model.layers.9.self_attn.q_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
654
+ "model.language_model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
655
+ "model.language_model.layers.9.self_attn.v_proj.weight_scale_inv": "model-00001-of-00002.safetensors",
656
+ "model.language_model.norm.weight": "model-00002-of-00002.safetensors",
657
+ "model.visual.blocks.0.attn.proj.bias": "model-00001-of-00002.safetensors",
658
+ "model.visual.blocks.0.attn.proj.weight": "model-00001-of-00002.safetensors",
659
+ "model.visual.blocks.0.attn.qkv.bias": "model-00001-of-00002.safetensors",
660
+ "model.visual.blocks.0.attn.qkv.weight": "model-00001-of-00002.safetensors",
661
+ "model.visual.blocks.0.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
662
+ "model.visual.blocks.0.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
663
+ "model.visual.blocks.0.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
664
+ "model.visual.blocks.0.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
665
+ "model.visual.blocks.0.norm1.bias": "model-00001-of-00002.safetensors",
666
+ "model.visual.blocks.0.norm1.weight": "model-00001-of-00002.safetensors",
667
+ "model.visual.blocks.0.norm2.bias": "model-00001-of-00002.safetensors",
668
+ "model.visual.blocks.0.norm2.weight": "model-00001-of-00002.safetensors",
669
+ "model.visual.blocks.1.attn.proj.bias": "model-00001-of-00002.safetensors",
670
+ "model.visual.blocks.1.attn.proj.weight": "model-00001-of-00002.safetensors",
671
+ "model.visual.blocks.1.attn.qkv.bias": "model-00001-of-00002.safetensors",
672
+ "model.visual.blocks.1.attn.qkv.weight": "model-00001-of-00002.safetensors",
673
+ "model.visual.blocks.1.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
674
+ "model.visual.blocks.1.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
675
+ "model.visual.blocks.1.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
676
+ "model.visual.blocks.1.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
677
+ "model.visual.blocks.1.norm1.bias": "model-00001-of-00002.safetensors",
678
+ "model.visual.blocks.1.norm1.weight": "model-00001-of-00002.safetensors",
679
+ "model.visual.blocks.1.norm2.bias": "model-00001-of-00002.safetensors",
680
+ "model.visual.blocks.1.norm2.weight": "model-00001-of-00002.safetensors",
681
+ "model.visual.blocks.10.attn.proj.bias": "model-00001-of-00002.safetensors",
682
+ "model.visual.blocks.10.attn.proj.weight": "model-00001-of-00002.safetensors",
683
+ "model.visual.blocks.10.attn.qkv.bias": "model-00001-of-00002.safetensors",
684
+ "model.visual.blocks.10.attn.qkv.weight": "model-00001-of-00002.safetensors",
685
+ "model.visual.blocks.10.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
686
+ "model.visual.blocks.10.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
687
+ "model.visual.blocks.10.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
688
+ "model.visual.blocks.10.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
689
+ "model.visual.blocks.10.norm1.bias": "model-00001-of-00002.safetensors",
690
+ "model.visual.blocks.10.norm1.weight": "model-00001-of-00002.safetensors",
691
+ "model.visual.blocks.10.norm2.bias": "model-00001-of-00002.safetensors",
692
+ "model.visual.blocks.10.norm2.weight": "model-00001-of-00002.safetensors",
693
+ "model.visual.blocks.11.attn.proj.bias": "model-00001-of-00002.safetensors",
694
+ "model.visual.blocks.11.attn.proj.weight": "model-00001-of-00002.safetensors",
695
+ "model.visual.blocks.11.attn.qkv.bias": "model-00001-of-00002.safetensors",
696
+ "model.visual.blocks.11.attn.qkv.weight": "model-00001-of-00002.safetensors",
697
+ "model.visual.blocks.11.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
698
+ "model.visual.blocks.11.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
699
+ "model.visual.blocks.11.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
700
+ "model.visual.blocks.11.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
701
+ "model.visual.blocks.11.norm1.bias": "model-00001-of-00002.safetensors",
702
+ "model.visual.blocks.11.norm1.weight": "model-00001-of-00002.safetensors",
703
+ "model.visual.blocks.11.norm2.bias": "model-00001-of-00002.safetensors",
704
+ "model.visual.blocks.11.norm2.weight": "model-00001-of-00002.safetensors",
705
+ "model.visual.blocks.12.attn.proj.bias": "model-00001-of-00002.safetensors",
706
+ "model.visual.blocks.12.attn.proj.weight": "model-00001-of-00002.safetensors",
707
+ "model.visual.blocks.12.attn.qkv.bias": "model-00001-of-00002.safetensors",
708
+ "model.visual.blocks.12.attn.qkv.weight": "model-00001-of-00002.safetensors",
709
+ "model.visual.blocks.12.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
710
+ "model.visual.blocks.12.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
711
+ "model.visual.blocks.12.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
712
+ "model.visual.blocks.12.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
713
+ "model.visual.blocks.12.norm1.bias": "model-00001-of-00002.safetensors",
714
+ "model.visual.blocks.12.norm1.weight": "model-00001-of-00002.safetensors",
715
+ "model.visual.blocks.12.norm2.bias": "model-00001-of-00002.safetensors",
716
+ "model.visual.blocks.12.norm2.weight": "model-00001-of-00002.safetensors",
717
+ "model.visual.blocks.13.attn.proj.bias": "model-00001-of-00002.safetensors",
718
+ "model.visual.blocks.13.attn.proj.weight": "model-00001-of-00002.safetensors",
719
+ "model.visual.blocks.13.attn.qkv.bias": "model-00001-of-00002.safetensors",
720
+ "model.visual.blocks.13.attn.qkv.weight": "model-00001-of-00002.safetensors",
721
+ "model.visual.blocks.13.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
722
+ "model.visual.blocks.13.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
723
+ "model.visual.blocks.13.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
724
+ "model.visual.blocks.13.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
725
+ "model.visual.blocks.13.norm1.bias": "model-00001-of-00002.safetensors",
726
+ "model.visual.blocks.13.norm1.weight": "model-00001-of-00002.safetensors",
727
+ "model.visual.blocks.13.norm2.bias": "model-00001-of-00002.safetensors",
728
+ "model.visual.blocks.13.norm2.weight": "model-00001-of-00002.safetensors",
729
+ "model.visual.blocks.14.attn.proj.bias": "model-00001-of-00002.safetensors",
730
+ "model.visual.blocks.14.attn.proj.weight": "model-00001-of-00002.safetensors",
731
+ "model.visual.blocks.14.attn.qkv.bias": "model-00001-of-00002.safetensors",
732
+ "model.visual.blocks.14.attn.qkv.weight": "model-00001-of-00002.safetensors",
733
+ "model.visual.blocks.14.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
734
+ "model.visual.blocks.14.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
735
+ "model.visual.blocks.14.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
736
+ "model.visual.blocks.14.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
737
+ "model.visual.blocks.14.norm1.bias": "model-00001-of-00002.safetensors",
738
+ "model.visual.blocks.14.norm1.weight": "model-00001-of-00002.safetensors",
739
+ "model.visual.blocks.14.norm2.bias": "model-00001-of-00002.safetensors",
740
+ "model.visual.blocks.14.norm2.weight": "model-00001-of-00002.safetensors",
741
+ "model.visual.blocks.15.attn.proj.bias": "model-00001-of-00002.safetensors",
742
+ "model.visual.blocks.15.attn.proj.weight": "model-00001-of-00002.safetensors",
743
+ "model.visual.blocks.15.attn.qkv.bias": "model-00001-of-00002.safetensors",
744
+ "model.visual.blocks.15.attn.qkv.weight": "model-00001-of-00002.safetensors",
745
+ "model.visual.blocks.15.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
746
+ "model.visual.blocks.15.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
747
+ "model.visual.blocks.15.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
748
+ "model.visual.blocks.15.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
749
+ "model.visual.blocks.15.norm1.bias": "model-00001-of-00002.safetensors",
750
+ "model.visual.blocks.15.norm1.weight": "model-00001-of-00002.safetensors",
751
+ "model.visual.blocks.15.norm2.bias": "model-00001-of-00002.safetensors",
752
+ "model.visual.blocks.15.norm2.weight": "model-00001-of-00002.safetensors",
753
+ "model.visual.blocks.16.attn.proj.bias": "model-00001-of-00002.safetensors",
754
+ "model.visual.blocks.16.attn.proj.weight": "model-00001-of-00002.safetensors",
755
+ "model.visual.blocks.16.attn.qkv.bias": "model-00001-of-00002.safetensors",
756
+ "model.visual.blocks.16.attn.qkv.weight": "model-00001-of-00002.safetensors",
757
+ "model.visual.blocks.16.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
758
+ "model.visual.blocks.16.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
759
+ "model.visual.blocks.16.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
760
+ "model.visual.blocks.16.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
761
+ "model.visual.blocks.16.norm1.bias": "model-00001-of-00002.safetensors",
762
+ "model.visual.blocks.16.norm1.weight": "model-00001-of-00002.safetensors",
763
+ "model.visual.blocks.16.norm2.bias": "model-00001-of-00002.safetensors",
764
+ "model.visual.blocks.16.norm2.weight": "model-00001-of-00002.safetensors",
765
+ "model.visual.blocks.17.attn.proj.bias": "model-00001-of-00002.safetensors",
766
+ "model.visual.blocks.17.attn.proj.weight": "model-00001-of-00002.safetensors",
767
+ "model.visual.blocks.17.attn.qkv.bias": "model-00001-of-00002.safetensors",
768
+ "model.visual.blocks.17.attn.qkv.weight": "model-00001-of-00002.safetensors",
769
+ "model.visual.blocks.17.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
770
+ "model.visual.blocks.17.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
771
+ "model.visual.blocks.17.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
772
+ "model.visual.blocks.17.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
773
+ "model.visual.blocks.17.norm1.bias": "model-00001-of-00002.safetensors",
774
+ "model.visual.blocks.17.norm1.weight": "model-00001-of-00002.safetensors",
775
+ "model.visual.blocks.17.norm2.bias": "model-00001-of-00002.safetensors",
776
+ "model.visual.blocks.17.norm2.weight": "model-00001-of-00002.safetensors",
777
+ "model.visual.blocks.18.attn.proj.bias": "model-00001-of-00002.safetensors",
778
+ "model.visual.blocks.18.attn.proj.weight": "model-00001-of-00002.safetensors",
779
+ "model.visual.blocks.18.attn.qkv.bias": "model-00001-of-00002.safetensors",
780
+ "model.visual.blocks.18.attn.qkv.weight": "model-00001-of-00002.safetensors",
781
+ "model.visual.blocks.18.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
782
+ "model.visual.blocks.18.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
783
+ "model.visual.blocks.18.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
784
+ "model.visual.blocks.18.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
785
+ "model.visual.blocks.18.norm1.bias": "model-00001-of-00002.safetensors",
786
+ "model.visual.blocks.18.norm1.weight": "model-00001-of-00002.safetensors",
787
+ "model.visual.blocks.18.norm2.bias": "model-00001-of-00002.safetensors",
788
+ "model.visual.blocks.18.norm2.weight": "model-00001-of-00002.safetensors",
789
+ "model.visual.blocks.19.attn.proj.bias": "model-00001-of-00002.safetensors",
790
+ "model.visual.blocks.19.attn.proj.weight": "model-00001-of-00002.safetensors",
791
+ "model.visual.blocks.19.attn.qkv.bias": "model-00001-of-00002.safetensors",
792
+ "model.visual.blocks.19.attn.qkv.weight": "model-00001-of-00002.safetensors",
793
+ "model.visual.blocks.19.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
794
+ "model.visual.blocks.19.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
795
+ "model.visual.blocks.19.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
796
+ "model.visual.blocks.19.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
797
+ "model.visual.blocks.19.norm1.bias": "model-00001-of-00002.safetensors",
798
+ "model.visual.blocks.19.norm1.weight": "model-00001-of-00002.safetensors",
799
+ "model.visual.blocks.19.norm2.bias": "model-00001-of-00002.safetensors",
800
+ "model.visual.blocks.19.norm2.weight": "model-00001-of-00002.safetensors",
801
+ "model.visual.blocks.2.attn.proj.bias": "model-00001-of-00002.safetensors",
802
+ "model.visual.blocks.2.attn.proj.weight": "model-00001-of-00002.safetensors",
803
+ "model.visual.blocks.2.attn.qkv.bias": "model-00001-of-00002.safetensors",
804
+ "model.visual.blocks.2.attn.qkv.weight": "model-00001-of-00002.safetensors",
805
+ "model.visual.blocks.2.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
806
+ "model.visual.blocks.2.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
807
+ "model.visual.blocks.2.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
808
+ "model.visual.blocks.2.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
809
+ "model.visual.blocks.2.norm1.bias": "model-00001-of-00002.safetensors",
810
+ "model.visual.blocks.2.norm1.weight": "model-00001-of-00002.safetensors",
811
+ "model.visual.blocks.2.norm2.bias": "model-00001-of-00002.safetensors",
812
+ "model.visual.blocks.2.norm2.weight": "model-00001-of-00002.safetensors",
813
+ "model.visual.blocks.20.attn.proj.bias": "model-00001-of-00002.safetensors",
814
+ "model.visual.blocks.20.attn.proj.weight": "model-00001-of-00002.safetensors",
815
+ "model.visual.blocks.20.attn.qkv.bias": "model-00001-of-00002.safetensors",
816
+ "model.visual.blocks.20.attn.qkv.weight": "model-00001-of-00002.safetensors",
817
+ "model.visual.blocks.20.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
818
+ "model.visual.blocks.20.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
819
+ "model.visual.blocks.20.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
820
+ "model.visual.blocks.20.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
821
+ "model.visual.blocks.20.norm1.bias": "model-00001-of-00002.safetensors",
822
+ "model.visual.blocks.20.norm1.weight": "model-00001-of-00002.safetensors",
823
+ "model.visual.blocks.20.norm2.bias": "model-00001-of-00002.safetensors",
824
+ "model.visual.blocks.20.norm2.weight": "model-00001-of-00002.safetensors",
825
+ "model.visual.blocks.21.attn.proj.bias": "model-00001-of-00002.safetensors",
826
+ "model.visual.blocks.21.attn.proj.weight": "model-00001-of-00002.safetensors",
827
+ "model.visual.blocks.21.attn.qkv.bias": "model-00001-of-00002.safetensors",
828
+ "model.visual.blocks.21.attn.qkv.weight": "model-00001-of-00002.safetensors",
829
+ "model.visual.blocks.21.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
830
+ "model.visual.blocks.21.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
831
+ "model.visual.blocks.21.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
832
+ "model.visual.blocks.21.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
833
+ "model.visual.blocks.21.norm1.bias": "model-00001-of-00002.safetensors",
834
+ "model.visual.blocks.21.norm1.weight": "model-00001-of-00002.safetensors",
835
+ "model.visual.blocks.21.norm2.bias": "model-00001-of-00002.safetensors",
836
+ "model.visual.blocks.21.norm2.weight": "model-00001-of-00002.safetensors",
837
+ "model.visual.blocks.22.attn.proj.bias": "model-00001-of-00002.safetensors",
838
+ "model.visual.blocks.22.attn.proj.weight": "model-00001-of-00002.safetensors",
839
+ "model.visual.blocks.22.attn.qkv.bias": "model-00001-of-00002.safetensors",
840
+ "model.visual.blocks.22.attn.qkv.weight": "model-00001-of-00002.safetensors",
841
+ "model.visual.blocks.22.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
842
+ "model.visual.blocks.22.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
843
+ "model.visual.blocks.22.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
844
+ "model.visual.blocks.22.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
845
+ "model.visual.blocks.22.norm1.bias": "model-00001-of-00002.safetensors",
846
+ "model.visual.blocks.22.norm1.weight": "model-00001-of-00002.safetensors",
847
+ "model.visual.blocks.22.norm2.bias": "model-00001-of-00002.safetensors",
848
+ "model.visual.blocks.22.norm2.weight": "model-00001-of-00002.safetensors",
849
+ "model.visual.blocks.23.attn.proj.bias": "model-00001-of-00002.safetensors",
850
+ "model.visual.blocks.23.attn.proj.weight": "model-00001-of-00002.safetensors",
851
+ "model.visual.blocks.23.attn.qkv.bias": "model-00001-of-00002.safetensors",
852
+ "model.visual.blocks.23.attn.qkv.weight": "model-00001-of-00002.safetensors",
853
+ "model.visual.blocks.23.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
854
+ "model.visual.blocks.23.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
855
+ "model.visual.blocks.23.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
856
+ "model.visual.blocks.23.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
857
+ "model.visual.blocks.23.norm1.bias": "model-00001-of-00002.safetensors",
858
+ "model.visual.blocks.23.norm1.weight": "model-00001-of-00002.safetensors",
859
+ "model.visual.blocks.23.norm2.bias": "model-00001-of-00002.safetensors",
860
+ "model.visual.blocks.23.norm2.weight": "model-00001-of-00002.safetensors",
861
+ "model.visual.blocks.3.attn.proj.bias": "model-00001-of-00002.safetensors",
862
+ "model.visual.blocks.3.attn.proj.weight": "model-00001-of-00002.safetensors",
863
+ "model.visual.blocks.3.attn.qkv.bias": "model-00001-of-00002.safetensors",
864
+ "model.visual.blocks.3.attn.qkv.weight": "model-00001-of-00002.safetensors",
865
+ "model.visual.blocks.3.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
866
+ "model.visual.blocks.3.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
867
+ "model.visual.blocks.3.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
868
+ "model.visual.blocks.3.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
869
+ "model.visual.blocks.3.norm1.bias": "model-00001-of-00002.safetensors",
870
+ "model.visual.blocks.3.norm1.weight": "model-00001-of-00002.safetensors",
871
+ "model.visual.blocks.3.norm2.bias": "model-00001-of-00002.safetensors",
872
+ "model.visual.blocks.3.norm2.weight": "model-00001-of-00002.safetensors",
873
+ "model.visual.blocks.4.attn.proj.bias": "model-00001-of-00002.safetensors",
874
+ "model.visual.blocks.4.attn.proj.weight": "model-00001-of-00002.safetensors",
875
+ "model.visual.blocks.4.attn.qkv.bias": "model-00001-of-00002.safetensors",
876
+ "model.visual.blocks.4.attn.qkv.weight": "model-00001-of-00002.safetensors",
877
+ "model.visual.blocks.4.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
878
+ "model.visual.blocks.4.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
879
+ "model.visual.blocks.4.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
880
+ "model.visual.blocks.4.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
881
+ "model.visual.blocks.4.norm1.bias": "model-00001-of-00002.safetensors",
882
+ "model.visual.blocks.4.norm1.weight": "model-00001-of-00002.safetensors",
883
+ "model.visual.blocks.4.norm2.bias": "model-00001-of-00002.safetensors",
884
+ "model.visual.blocks.4.norm2.weight": "model-00001-of-00002.safetensors",
885
+ "model.visual.blocks.5.attn.proj.bias": "model-00001-of-00002.safetensors",
886
+ "model.visual.blocks.5.attn.proj.weight": "model-00001-of-00002.safetensors",
887
+ "model.visual.blocks.5.attn.qkv.bias": "model-00001-of-00002.safetensors",
888
+ "model.visual.blocks.5.attn.qkv.weight": "model-00001-of-00002.safetensors",
889
+ "model.visual.blocks.5.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
890
+ "model.visual.blocks.5.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
891
+ "model.visual.blocks.5.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
892
+ "model.visual.blocks.5.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
893
+ "model.visual.blocks.5.norm1.bias": "model-00001-of-00002.safetensors",
894
+ "model.visual.blocks.5.norm1.weight": "model-00001-of-00002.safetensors",
895
+ "model.visual.blocks.5.norm2.bias": "model-00001-of-00002.safetensors",
896
+ "model.visual.blocks.5.norm2.weight": "model-00001-of-00002.safetensors",
897
+ "model.visual.blocks.6.attn.proj.bias": "model-00001-of-00002.safetensors",
898
+ "model.visual.blocks.6.attn.proj.weight": "model-00001-of-00002.safetensors",
899
+ "model.visual.blocks.6.attn.qkv.bias": "model-00001-of-00002.safetensors",
900
+ "model.visual.blocks.6.attn.qkv.weight": "model-00001-of-00002.safetensors",
901
+ "model.visual.blocks.6.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
902
+ "model.visual.blocks.6.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
903
+ "model.visual.blocks.6.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
904
+ "model.visual.blocks.6.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
905
+ "model.visual.blocks.6.norm1.bias": "model-00001-of-00002.safetensors",
906
+ "model.visual.blocks.6.norm1.weight": "model-00001-of-00002.safetensors",
907
+ "model.visual.blocks.6.norm2.bias": "model-00001-of-00002.safetensors",
908
+ "model.visual.blocks.6.norm2.weight": "model-00001-of-00002.safetensors",
909
+ "model.visual.blocks.7.attn.proj.bias": "model-00001-of-00002.safetensors",
910
+ "model.visual.blocks.7.attn.proj.weight": "model-00001-of-00002.safetensors",
911
+ "model.visual.blocks.7.attn.qkv.bias": "model-00001-of-00002.safetensors",
912
+ "model.visual.blocks.7.attn.qkv.weight": "model-00001-of-00002.safetensors",
913
+ "model.visual.blocks.7.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
914
+ "model.visual.blocks.7.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
915
+ "model.visual.blocks.7.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
916
+ "model.visual.blocks.7.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
917
+ "model.visual.blocks.7.norm1.bias": "model-00001-of-00002.safetensors",
918
+ "model.visual.blocks.7.norm1.weight": "model-00001-of-00002.safetensors",
919
+ "model.visual.blocks.7.norm2.bias": "model-00001-of-00002.safetensors",
920
+ "model.visual.blocks.7.norm2.weight": "model-00001-of-00002.safetensors",
921
+ "model.visual.blocks.8.attn.proj.bias": "model-00001-of-00002.safetensors",
922
+ "model.visual.blocks.8.attn.proj.weight": "model-00001-of-00002.safetensors",
923
+ "model.visual.blocks.8.attn.qkv.bias": "model-00001-of-00002.safetensors",
924
+ "model.visual.blocks.8.attn.qkv.weight": "model-00001-of-00002.safetensors",
925
+ "model.visual.blocks.8.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
926
+ "model.visual.blocks.8.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
927
+ "model.visual.blocks.8.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
928
+ "model.visual.blocks.8.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
929
+ "model.visual.blocks.8.norm1.bias": "model-00001-of-00002.safetensors",
930
+ "model.visual.blocks.8.norm1.weight": "model-00001-of-00002.safetensors",
931
+ "model.visual.blocks.8.norm2.bias": "model-00001-of-00002.safetensors",
932
+ "model.visual.blocks.8.norm2.weight": "model-00001-of-00002.safetensors",
933
+ "model.visual.blocks.9.attn.proj.bias": "model-00001-of-00002.safetensors",
934
+ "model.visual.blocks.9.attn.proj.weight": "model-00001-of-00002.safetensors",
935
+ "model.visual.blocks.9.attn.qkv.bias": "model-00001-of-00002.safetensors",
936
+ "model.visual.blocks.9.attn.qkv.weight": "model-00001-of-00002.safetensors",
937
+ "model.visual.blocks.9.mlp.linear_fc1.bias": "model-00001-of-00002.safetensors",
938
+ "model.visual.blocks.9.mlp.linear_fc1.weight": "model-00001-of-00002.safetensors",
939
+ "model.visual.blocks.9.mlp.linear_fc2.bias": "model-00001-of-00002.safetensors",
940
+ "model.visual.blocks.9.mlp.linear_fc2.weight": "model-00001-of-00002.safetensors",
941
+ "model.visual.blocks.9.norm1.bias": "model-00001-of-00002.safetensors",
942
+ "model.visual.blocks.9.norm1.weight": "model-00001-of-00002.safetensors",
943
+ "model.visual.blocks.9.norm2.bias": "model-00001-of-00002.safetensors",
944
+ "model.visual.blocks.9.norm2.weight": "model-00001-of-00002.safetensors",
945
+ "model.visual.deepstack_merger_list.0.linear_fc1.bias": "model-00001-of-00002.safetensors",
946
+ "model.visual.deepstack_merger_list.0.linear_fc1.weight": "model-00001-of-00002.safetensors",
947
+ "model.visual.deepstack_merger_list.0.linear_fc2.bias": "model-00001-of-00002.safetensors",
948
+ "model.visual.deepstack_merger_list.0.linear_fc2.weight": "model-00001-of-00002.safetensors",
949
+ "model.visual.deepstack_merger_list.0.norm.bias": "model-00001-of-00002.safetensors",
950
+ "model.visual.deepstack_merger_list.0.norm.weight": "model-00001-of-00002.safetensors",
951
+ "model.visual.deepstack_merger_list.1.linear_fc1.bias": "model-00001-of-00002.safetensors",
952
+ "model.visual.deepstack_merger_list.1.linear_fc1.weight": "model-00001-of-00002.safetensors",
953
+ "model.visual.deepstack_merger_list.1.linear_fc2.bias": "model-00001-of-00002.safetensors",
954
+ "model.visual.deepstack_merger_list.1.linear_fc2.weight": "model-00001-of-00002.safetensors",
955
+ "model.visual.deepstack_merger_list.1.norm.bias": "model-00001-of-00002.safetensors",
956
+ "model.visual.deepstack_merger_list.1.norm.weight": "model-00001-of-00002.safetensors",
957
+ "model.visual.deepstack_merger_list.2.linear_fc1.bias": "model-00001-of-00002.safetensors",
958
+ "model.visual.deepstack_merger_list.2.linear_fc1.weight": "model-00001-of-00002.safetensors",
959
+ "model.visual.deepstack_merger_list.2.linear_fc2.bias": "model-00001-of-00002.safetensors",
960
+ "model.visual.deepstack_merger_list.2.linear_fc2.weight": "model-00001-of-00002.safetensors",
961
+ "model.visual.deepstack_merger_list.2.norm.bias": "model-00001-of-00002.safetensors",
962
+ "model.visual.deepstack_merger_list.2.norm.weight": "model-00001-of-00002.safetensors",
963
+ "model.visual.merger.linear_fc1.bias": "model-00001-of-00002.safetensors",
964
+ "model.visual.merger.linear_fc1.weight": "model-00001-of-00002.safetensors",
965
+ "model.visual.merger.linear_fc2.bias": "model-00001-of-00002.safetensors",
966
+ "model.visual.merger.linear_fc2.weight": "model-00001-of-00002.safetensors",
967
+ "model.visual.merger.norm.bias": "model-00001-of-00002.safetensors",
968
+ "model.visual.merger.norm.weight": "model-00001-of-00002.safetensors",
969
+ "model.visual.patch_embed.proj.bias": "model-00001-of-00002.safetensors",
970
+ "model.visual.patch_embed.proj.weight": "model-00001-of-00002.safetensors",
971
+ "model.visual.pos_embed.weight": "model-00001-of-00002.safetensors"
972
+ }
973
+ }
LLM/Qwen3-VL-4B-Instruct-abliterated-FP8/preprocessor_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": null,
3
+ "data_format": "channels_first",
4
+ "default_to_square": true,
5
+ "device": null,
6
+ "disable_grouping": null,
7
+ "do_center_crop": null,
8
+ "do_convert_rgb": true,
9
+ "do_normalize": true,
10
+ "do_pad": null,
11
+ "do_rescale": true,
12
+ "do_resize": true,
13
+ "image_mean": [
14
+ 0.5,
15
+ 0.5,
16
+ 0.5
17
+ ],
18
+ "image_processor_type": "Qwen2VLImageProcessorFast",
19
+ "image_std": [
20
+ 0.5,
21
+ 0.5,
22
+ 0.5
23
+ ],
24
+ "input_data_format": null,
25
+ "max_pixels": null,
26
+ "merge_size": 2,
27
+ "min_pixels": null,
28
+ "pad_size": null,
29
+ "patch_size": 16,
30
+ "processor_class": "Qwen3VLProcessor",
31
+ "resample": 3,
32
+ "rescale_factor": 0.00392156862745098,
33
+ "return_tensors": null,
34
+ "size": {
35
+ "longest_edge": 16777216,
36
+ "shortest_edge": 65536
37
+ },
38
+ "temporal_patch_size": 2
39
+ }
LLM/Qwen3-VL-4B-Instruct-abliterated-FP8/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
LLM/Qwen3-VL-4B-Instruct-abliterated-FP8/tokenizer_config.json ADDED
@@ -0,0 +1,240 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
+ }
213
+ },
214
+ "additional_special_tokens": [
215
+ "<|im_start|>",
216
+ "<|im_end|>",
217
+ "<|object_ref_start|>",
218
+ "<|object_ref_end|>",
219
+ "<|box_start|>",
220
+ "<|box_end|>",
221
+ "<|quad_start|>",
222
+ "<|quad_end|>",
223
+ "<|vision_start|>",
224
+ "<|vision_end|>",
225
+ "<|vision_pad|>",
226
+ "<|image_pad|>",
227
+ "<|video_pad|>"
228
+ ],
229
+ "bos_token": null,
230
+ "clean_up_tokenization_spaces": false,
231
+ "eos_token": "<|im_end|>",
232
+ "errors": "replace",
233
+ "extra_special_tokens": {},
234
+ "model_max_length": 262144,
235
+ "pad_token": "<|endoftext|>",
236
+ "processor_class": "Qwen3VLProcessor",
237
+ "split_special_tokens": false,
238
+ "tokenizer_class": "Qwen2Tokenizer",
239
+ "unk_token": null
240
+ }
LLM/Qwen3-VL-4B-Instruct-abliterated-FP8/video_preprocessor_config.json ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": null,
3
+ "data_format": "channels_first",
4
+ "default_to_square": true,
5
+ "device": null,
6
+ "do_center_crop": null,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "do_sample_frames": true,
12
+ "fps": 2,
13
+ "image_mean": [
14
+ 0.5,
15
+ 0.5,
16
+ 0.5
17
+ ],
18
+ "image_std": [
19
+ 0.5,
20
+ 0.5,
21
+ 0.5
22
+ ],
23
+ "input_data_format": null,
24
+ "max_frames": 768,
25
+ "merge_size": 2,
26
+ "min_frames": 4,
27
+ "num_frames": null,
28
+ "pad_size": null,
29
+ "patch_size": 16,
30
+ "processor_class": "Qwen3VLProcessor",
31
+ "resample": 3,
32
+ "rescale_factor": 0.00392156862745098,
33
+ "return_metadata": false,
34
+ "size": {
35
+ "longest_edge": 25165824,
36
+ "shortest_edge": 4096
37
+ },
38
+ "temporal_patch_size": 2,
39
+ "video_metadata": null,
40
+ "video_processor_type": "Qwen3VLVideoProcessor"
41
+ }
LLM/Qwen3-VL-4B-Instruct-abliterated-FP8/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
RMBG/RMBG-2.0/BiRefNet_config.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import PretrainedConfig
2
+
3
class BiRefNetConfig(PretrainedConfig):
    """Hugging Face configuration object for the BiRefNet model.

    Args:
        bb_pretrained: Stored on the instance as ``bb_pretrained``.
            NOTE(review): presumably controls whether backbone weights are
            loaded from a pretrained checkpoint -- confirm against the model code.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "SegformerForSemanticSegmentation"

    def __init__(self, bb_pretrained=False, **kwargs):
        # The attribute is assigned before the base initialiser runs, as in the
        # original implementation.
        self.bb_pretrained = bb_pretrained
        super().__init__(**kwargs)
RMBG/RMBG-2.0/__pycache__/BiRefNet_config.cpython-312.pyc ADDED
Binary file (759 Bytes). View file
 
RMBG/RMBG-2.0/birefnet.py ADDED
@@ -0,0 +1,2244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### config.py
2
+
3
+ import os
4
+ import math
5
+
6
+
7
class Config():
    """Central settings object for BiRefNet.

    Every option is chosen by indexing into a literal list of alternatives, so
    changing a setting means changing the trailing index. Several attributes
    are derived from earlier ones (e.g. ``training_set`` from ``task``,
    ``lateral_channels_in_collection`` from ``bb``), so statement order matters.
    """

    def __init__(self) -> None:
        # PATH settings
        self.sys_home_dir = os.path.expanduser('~')     # Expected layout: SYS_HOME_DIR/codes/dis/BiRefNet, SYS_HOME_DIR/datasets/dis/xx, SYS_HOME_DIR/weights/xx

        # TASK settings
        self.task = ['DIS5K', 'COD', 'HRSOD', 'DIS5K+HRSOD+HRS10K', 'P3M-10k'][0]
        # Training split(s) for the selected task; '+' joins several dataset names.
        self.training_set = {
            'DIS5K': ['DIS-TR', 'DIS-TR+DIS-TE1+DIS-TE2+DIS-TE3+DIS-TE4'][0],
            'COD': 'TR-COD10K+TR-CAMO',
            'HRSOD': ['TR-DUTS', 'TR-HRSOD', 'TR-UHRSD', 'TR-DUTS+TR-HRSOD', 'TR-DUTS+TR-UHRSD', 'TR-HRSOD+TR-UHRSD', 'TR-DUTS+TR-HRSOD+TR-UHRSD'][5],
            'DIS5K+HRSOD+HRS10K': 'DIS-TE1+DIS-TE2+DIS-TE3+DIS-TE4+DIS-TR+TE-HRS10K+TE-HRSOD+TE-UHRSD+TR-HRS10K+TR-HRSOD+TR-UHRSD',    # DIS-VD is left out for evaluation.
            'P3M-10k': 'TR-P3M-10k',
        }[self.task]
        self.prompt4loc = ['dense', 'sparse'][0]

        # Faster-Training settings
        self.load_all = True
        self.compile = True     # 1. Triggers a CPU memory leak to some extent, which is an inherent problem of PyTorch.
                                #    Machines with > 70GB CPU memory can run the whole training on DIS5K with the default settings.
                                # 2. A higher PyTorch version may fix it: https://github.com/pytorch/pytorch/issues/119607.
                                # 3. But compile in PyTorch > 2.0.1 seems to bring no acceleration for training.
        self.precisionHigh = True

        # MODEL settings
        self.ms_supervision = True
        self.out_ref = self.ms_supervision and True     # can only be truthy when ms_supervision is enabled
        self.dec_ipt = True
        self.dec_ipt_split = True
        self.cxt_num = [0, 3][1]    # multi-scale skip connections from encoder
        self.mul_scl_ipt = ['', 'add', 'cat'][2]
        self.dec_att = ['', 'ASPP', 'ASPPDeformable'][2]
        self.squeeze_block = ['', 'BasicDecBlk_x1', 'ResBlk_x4', 'ASPP_x3', 'ASPPDeformable_x3'][1]
        self.dec_blk = ['BasicDecBlk', 'ResBlk', 'HierarAttDecBlk'][0]

        # TRAINING settings
        self.batch_size = 4
        self.IoU_finetune_last_epochs = [
            0,
            {
                'DIS5K': -50,
                'COD': -20,
                'HRSOD': -20,
                'DIS5K+HRSOD+HRS10K': -20,
                'P3M-10k': -20,
            }[self.task]
        ][1]    # choose 0 to skip
        # Base lr is 1e-4 when the task name contains 'DIS5K' (DIS needs a high lr to converge faster), else 1e-5.
        # It is then scaled by sqrt(batch_size / 4), i.e. square-root (not linear) scaling relative to batch size 4.
        self.lr = (1e-4 if 'DIS5K' in self.task else 1e-5) * math.sqrt(self.batch_size / 4)
        self.size = 1024
        self.num_workers = max(4, self.batch_size)          # will be decreased to min(it, batch_size) at the initialization of the data_loader

        # Backbone settings
        self.bb = [
            'vgg16', 'vgg16bn', 'resnet50',         # 0, 1, 2
            'swin_v1_t', 'swin_v1_s',               # 3, 4
            'swin_v1_b', 'swin_v1_l',               # 5-bs9, 6-bs4
            'pvt_v2_b0', 'pvt_v2_b1',               # 7, 8
            'pvt_v2_b2', 'pvt_v2_b5',               # 9-bs10, 10-bs5
        ][6]
        # Channel counts of the selected backbone's four feature maps, listed from the widest to the narrowest.
        self.lateral_channels_in_collection = {
            'vgg16': [512, 256, 128, 64], 'vgg16bn': [512, 256, 128, 64], 'resnet50': [1024, 512, 256, 64],
            'pvt_v2_b2': [512, 320, 128, 64], 'pvt_v2_b5': [512, 320, 128, 64],
            'swin_v1_b': [1024, 512, 256, 128], 'swin_v1_l': [1536, 768, 384, 192],
            'swin_v1_t': [768, 384, 192, 96], 'swin_v1_s': [768, 384, 192, 96],
            'pvt_v2_b0': [256, 160, 64, 32], 'pvt_v2_b1': [512, 320, 128, 64],
        }[self.bb]
        # With 'cat' multi-scale input every lateral channel count is doubled.
        if self.mul_scl_ipt == 'cat':
            self.lateral_channels_in_collection = [channel * 2 for channel in self.lateral_channels_in_collection]
        # Drop the first entry, reverse the rest, and keep the last `cxt_num` of them; empty when cxt_num == 0.
        self.cxt = self.lateral_channels_in_collection[1:][::-1][-self.cxt_num:] if self.cxt_num else []

        # MODEL settings - inactive
        self.lat_blk = ['BasicLatBlk'][0]
        self.dec_channels_inter = ['fixed', 'adap'][0]
        self.refine = ['', 'itself', 'RefUNet', 'Refiner', 'RefinerPVTInChannels4'][0]
        # The next three use `and` chaining, so with refine == '' each of them evaluates to '' (falsy), not False.
        self.progressive_ref = self.refine and True
        self.ender = self.progressive_ref and False
        self.scale = self.progressive_ref and 2
        self.auxiliary_classification = False       # Only for DIS5K, where class labels are saved in `dataset.py`.
        self.refine_iteration = 1
        self.freeze_bb = False
        self.model = [
            'BiRefNet',
        ][0]
        # NOTE(review): this override runs after `lr` and `num_workers` were derived from the earlier batch_size.
        if self.dec_blk == 'HierarAttDecBlk':
            self.batch_size = 2 ** [0, 1, 2, 3, 4][2]

        # TRAINING settings - inactive
        self.preproc_methods = ['flip', 'enhance', 'rotate', 'pepper', 'crop'][:4]
        self.optimizer = ['Adam', 'AdamW'][1]
        self.lr_decay_epochs = [1e5]    # Set to negative N to decay the lr in the last N-th epoch.
        self.lr_decay_rate = 0.5
        # Loss
        self.lambdas_pix_last = {
            # a non-zero weight enables the corresponding loss
            # original rate -- 1 : 30 : 1.5 : 0.2, bce x 30
            'bce': 30 * 1,          # high performance
            'iou': 0.5 * 1,         # 0 / 255
            'iou_patch': 0.5 * 0,   # 0 / 255, win_size = (64, 64)
            'mse': 150 * 0,         # can smooth the saliency map
            'triplet': 3 * 0,
            'reg': 100 * 0,
            'ssim': 10 * 1,          # helps contours
            'cnt': 5 * 0,          # helps contours
            'structure': 5 * 0,    # structure loss from the MVANet code. A little improvement on DIS-TE[1,2,3], a bit more decrease on DIS-TE4.
        }
        self.lambdas_cls = {
            'ce': 5.0
        }
        # Adv
        self.lambda_adv_g = 10. * 0         # set to 0 to disable adversarial training
        self.lambda_adv_d = 3. * (self.lambda_adv_g > 0)   # becomes 0.0 whenever lambda_adv_g is not positive

        # PATH settings - inactive
        self.data_root_dir = os.path.join(self.sys_home_dir, 'datasets/dis')
        self.weights_root_dir = os.path.join(self.sys_home_dir, 'weights')
        # Checkpoint paths keyed by backbone name (no entries for vgg16 / vgg16bn / resnet50).
        self.weights = {
            'pvt_v2_b2': os.path.join(self.weights_root_dir, 'pvt_v2_b2.pth'),
            'pvt_v2_b5': os.path.join(self.weights_root_dir, ['pvt_v2_b5.pth', 'pvt_v2_b5_22k.pth'][0]),
            'swin_v1_b': os.path.join(self.weights_root_dir, ['swin_base_patch4_window12_384_22kto1k.pth', 'swin_base_patch4_window12_384_22k.pth'][0]),
            'swin_v1_l': os.path.join(self.weights_root_dir, ['swin_large_patch4_window12_384_22kto1k.pth', 'swin_large_patch4_window12_384_22k.pth'][0]),
            'swin_v1_t': os.path.join(self.weights_root_dir, ['swin_tiny_patch4_window7_224_22kto1k_finetune.pth'][0]),
            'swin_v1_s': os.path.join(self.weights_root_dir, ['swin_small_patch4_window7_224_22kto1k_finetune.pth'][0]),
            'pvt_v2_b0': os.path.join(self.weights_root_dir, ['pvt_v2_b0.pth'][0]),
            'pvt_v2_b1': os.path.join(self.weights_root_dir, ['pvt_v2_b1.pth'][0]),
        }

        # Callbacks - inactive
        self.verbose_eval = True
        self.only_S_MAE = False
        self.use_fp16 = False   # Buggy: may cause NaN in training.
        self.SDPA_enabled = False    # Buggy: slower, and errors occur on multi-GPU setups.

        # others
        self.device = [0, 'cpu'][0]     # .to(0) == .to('cuda:0')

        self.batch_size_valid = 1
        self.rand_seed = 7
        # Disabled: reading save/validation steps from a `train.sh` script next to (or one level above) the working directory.
        # run_sh_file = [f for f in os.listdir('.') if 'train.sh' == f] + [os.path.join('..', f) for f in os.listdir('..') if 'train.sh' == f]
        # with open(run_sh_file[0], 'r') as f:
        #     lines = f.readlines()
        #     self.save_last = int([l.strip() for l in lines if '"{}")'.format(self.task) in l and 'val_last=' in l][0].split('val_last=')[-1].split()[0])
        #     self.save_step = int([l.strip() for l in lines if '"{}")'.format(self.task) in l and 'step=' in l][0].split('step=')[-1].split()[0])
        #     self.val_step = [0, self.save_step][0]

    def print_task(self) -> None:
        # Print (not return) the task name so shell scripts can capture it from stdout to choose settings.
        print(self.task)
154
+
155
+
156
+
157
+ ### models/backbones/pvt_v2.py
158
+
159
+ import torch
160
+ import torch.nn as nn
161
+ from functools import partial
162
+
163
+ from timm.models.layers import DropPath, to_2tuple, trunc_normal_
164
+ from timm.models.registry import register_model
165
+
166
+ import math
167
+
168
+ # from config import Config
169
+
170
+ # config = Config()
171
+
172
class Mlp(nn.Module):
    """Feed-forward block: linear -> depth-wise conv -> activation -> linear.

    Dropout (the same ``nn.Dropout`` instance) is applied after the activation
    and again after the second linear layer.

    Args:
        in_features: Size of the last dimension of the input tokens.
        hidden_features: Width of the hidden layer; falls back to ``in_features``.
        out_features: Width of the output; falls back to ``in_features``.
        act_layer: Activation class, instantiated without arguments.
        drop: Dropout probability.
    """

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        hidden_width = hidden_features or in_features
        output_width = out_features or in_features

        self.fc1 = nn.Linear(in_features, hidden_width)
        self.dwconv = DWConv(hidden_width)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_width, output_width)
        self.drop = nn.Dropout(drop)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise one sub-module; invoked for every sub-module through ``self.apply``."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
            return
        if isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
            return
        if isinstance(m, nn.Conv2d):
            # Normal init with std derived from the per-group fan-out.
            kernel_h, kernel_w = m.kernel_size[0], m.kernel_size[1]
            fan_out = (kernel_h * kernel_w * m.out_channels) // m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, H, W):
        """Apply the block to tokens ``x``; ``H`` and ``W`` are passed on to the depth-wise conv."""
        hidden = self.fc1(x)
        hidden = self.dwconv(hidden, H, W)
        hidden = self.drop(self.act(hidden))
        return self.drop(self.fc2(hidden))
208
+
209
+
210
class Attention(nn.Module):
    """Multi-head self-attention whose keys/values can be spatially reduced.

    When ``sr_ratio > 1`` the token sequence is reshaped to ``(B, C, H, W)``,
    down-sampled by a strided convolution and layer-normalised before the
    key/value projection; queries always use the full-resolution tokens.

    Args:
        dim: Channel dimension of the tokens; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        qkv_bias: Whether the q and kv projections carry a bias.
        qk_scale: Optional override of the default ``head_dim ** -0.5`` scale
            (only used by the non-SDPA path).
        attn_drop: Dropout probability on the attention weights.
        proj_drop: Dropout probability after the output projection.
        sr_ratio: Spatial reduction ratio for keys/values.
    """

    def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0., sr_ratio=1):
        super().__init__()
        assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."

        self.dim = dim
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        self.q = nn.Linear(dim, dim, bias=qkv_bias)
        self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias)
        self.attn_drop_prob = attn_drop
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        self.sr_ratio = sr_ratio
        if sr_ratio > 1:
            self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio)
            self.norm = nn.LayerNorm(dim)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise one sub-module; invoked for every sub-module through ``self.apply``."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, H, W):
        """Attend over tokens ``x`` of shape ``(B, N, C)``.

        ``H`` and ``W`` are only used when ``sr_ratio > 1``, to reshape the
        tokens to ``(B, C, H, W)``. Returns a tensor of shape ``(B, N, C)``.
        """
        B, N, C = x.shape
        q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)

        if self.sr_ratio > 1:
            # Reduce the spatial resolution of the tokens feeding keys/values.
            x_ = x.permute(0, 2, 1).reshape(B, C, H, W)
            x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1)
            x_ = self.norm(x_)
            kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        else:
            kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        k, v = kv[0], kv[1]

        # NOTE(review): `config` is read as a module-level name; its definition is
        # commented out next to this file section's imports, so it must be provided
        # elsewhere in the module -- confirm.
        if config.SDPA_enabled:
            # The functional SDPA applies dropout whenever dropout_p > 0, regardless of
            # module mode, so it has to be switched off explicitly outside training.
            # This path passes no `scale`, so a custom `qk_scale` is not honoured here.
            x = torch.nn.functional.scaled_dot_product_attention(
                q, k, v,
                attn_mask=None,
                dropout_p=self.attn_drop_prob if self.training else 0.0,
                is_causal=False
            ).transpose(1, 2).reshape(B, N, C)
        else:
            attn = (q @ k.transpose(-2, -1)) * self.scale
            attn = attn.softmax(dim=-1)
            attn = self.attn_drop(attn)

            x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)

        return x
277
+
278
+
279
class Block(nn.Module):
    """Pre-norm transformer block: attention and MLP, each wrapped in a residual connection.

    Args:
        dim: Token channel dimension.
        num_heads: Number of attention heads.
        mlp_ratio: Hidden width of the MLP as a multiple of ``dim``.
        qkv_bias, qk_scale, attn_drop, sr_ratio: Passed to ``Attention``.
        drop: Dropout probability used for the attention projection and the MLP.
        drop_path: Stochastic-depth probability; ``nn.Identity`` is used when it is not > 0.
        act_layer: Activation class for the MLP.
        norm_layer: Normalisation layer constructor, called with ``dim``.
    """

    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
            sr_ratio=sr_ratio,
        )
        # Stochastic depth is only instantiated for a positive rate.
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise one sub-module; invoked for every sub-module through ``self.apply``."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
            return
        if isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
            return
        if isinstance(m, nn.Conv2d):
            kernel_h, kernel_w = m.kernel_size[0], m.kernel_size[1]
            fan_out = (kernel_h * kernel_w * m.out_channels) // m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, H, W):
        """Run attention then MLP on tokens ``x``; ``H`` and ``W`` are forwarded to both."""
        attended = self.attn(self.norm1(x), H, W)
        x = x + self.drop_path(attended)
        transformed = self.mlp(self.norm2(x), H, W)
        x = x + self.drop_path(transformed)
        return x
317
+
318
+
319
class OverlapPatchEmbed(nn.Module):
    """Image-to-token embedding using an overlapping strided convolution.

    Args:
        img_size: Nominal input size (int or pair); only used to fill the
            ``H``, ``W`` and ``num_patches`` attributes.
        patch_size: Convolution kernel size (int or pair); padding is half of it.
        stride: Convolution stride.
        in_channels: Number of input channels.
        embed_dim: Number of output channels / token dimension.
    """

    def __init__(self, img_size=224, patch_size=7, stride=4, in_channels=3, embed_dim=768):
        super().__init__()
        self.img_size = to_2tuple(img_size)
        self.patch_size = to_2tuple(patch_size)

        patch_h, patch_w = self.patch_size[0], self.patch_size[1]
        self.H = self.img_size[0] // patch_h
        self.W = self.img_size[1] // patch_w
        self.num_patches = self.H * self.W
        self.proj = nn.Conv2d(
            in_channels,
            embed_dim,
            kernel_size=self.patch_size,
            stride=stride,
            padding=(patch_h // 2, patch_w // 2),
        )
        self.norm = nn.LayerNorm(embed_dim)

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise one sub-module; invoked for every sub-module through ``self.apply``."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
            return
        if isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
            return
        if isinstance(m, nn.Conv2d):
            kernel_h, kernel_w = m.kernel_size[0], m.kernel_size[1]
            fan_out = (kernel_h * kernel_w * m.out_channels) // m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x):
        """Project ``x`` and return ``(tokens, H, W)``.

        ``H`` and ``W`` are the spatial size of the convolution output; the
        tokens are that output flattened over space, transposed so the channel
        dimension is last, and layer-normalised.
        """
        feature_map = self.proj(x)
        H, W = feature_map.shape[2], feature_map.shape[3]
        tokens = self.norm(feature_map.flatten(2).transpose(1, 2))
        return tokens, H, W
360
+
361
+
362
class PyramidVisionTransformerImpr(nn.Module):
    """Four-stage PVT v2 encoder that returns one feature map per stage.

    Each stage is an ``OverlapPatchEmbed`` followed by a list of ``Block``
    modules and a normalisation layer. ``forward`` returns a list of four
    tensors laid out as ``(B, C_i, H_i, W_i)``.

    Args:
        img_size: Nominal input size handed to the patch embeddings.
        patch_size: Accepted for interface compatibility; the per-stage kernel
            sizes are fixed to 7, 3, 3, 3 in the constructor.
        in_channels: Number of channels of the input image.
        num_classes: Stored on the instance; no classification head is built
            by the constructor.
        embed_dims: Channel width of each of the four stages.
        num_heads: Attention heads per stage.
        mlp_ratios: MLP hidden-width multiplier per stage.
        qkv_bias, qk_scale: Passed to every ``Block``.
        drop_rate, attn_drop_rate: Dropout probabilities passed to every ``Block``.
        drop_path_rate: Maximum stochastic-depth rate; rates grow linearly from
            0 to this value across all blocks.
        norm_layer: Normalisation layer constructor.
        depths: Number of blocks in each stage.
        sr_ratios: Spatial-reduction ratio of the attention in each stage.
    """

    def __init__(self, img_size=224, patch_size=16, in_channels=3, num_classes=1000, embed_dims=[64, 128, 256, 512],
                 num_heads=[1, 2, 4, 8], mlp_ratios=[4, 4, 4, 4], qkv_bias=False, qk_scale=None, drop_rate=0.,
                 attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm,
                 depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1]):
        super().__init__()
        self.num_classes = num_classes
        self.depths = depths
        # Plain attribute (not a parameter/buffer); lets reset_classifier know the final width.
        self.embed_dims = embed_dims

        # patch_embed
        self.patch_embed1 = OverlapPatchEmbed(img_size=img_size, patch_size=7, stride=4, in_channels=in_channels,
                                              embed_dim=embed_dims[0])
        self.patch_embed2 = OverlapPatchEmbed(img_size=img_size // 4, patch_size=3, stride=2, in_channels=embed_dims[0],
                                              embed_dim=embed_dims[1])
        self.patch_embed3 = OverlapPatchEmbed(img_size=img_size // 8, patch_size=3, stride=2, in_channels=embed_dims[1],
                                              embed_dim=embed_dims[2])
        self.patch_embed4 = OverlapPatchEmbed(img_size=img_size // 16, patch_size=3, stride=2, in_channels=embed_dims[2],
                                              embed_dim=embed_dims[3])

        # transformer encoder
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule
        cur = 0
        self.block1 = nn.ModuleList([Block(
            dim=embed_dims[0], num_heads=num_heads[0], mlp_ratio=mlp_ratios[0], qkv_bias=qkv_bias, qk_scale=qk_scale,
            drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer,
            sr_ratio=sr_ratios[0])
            for i in range(depths[0])])
        self.norm1 = norm_layer(embed_dims[0])

        cur += depths[0]
        self.block2 = nn.ModuleList([Block(
            dim=embed_dims[1], num_heads=num_heads[1], mlp_ratio=mlp_ratios[1], qkv_bias=qkv_bias, qk_scale=qk_scale,
            drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer,
            sr_ratio=sr_ratios[1])
            for i in range(depths[1])])
        self.norm2 = norm_layer(embed_dims[1])

        cur += depths[1]
        self.block3 = nn.ModuleList([Block(
            dim=embed_dims[2], num_heads=num_heads[2], mlp_ratio=mlp_ratios[2], qkv_bias=qkv_bias, qk_scale=qk_scale,
            drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer,
            sr_ratio=sr_ratios[2])
            for i in range(depths[2])])
        self.norm3 = norm_layer(embed_dims[2])

        cur += depths[2]
        self.block4 = nn.ModuleList([Block(
            dim=embed_dims[3], num_heads=num_heads[3], mlp_ratio=mlp_ratios[3], qkv_bias=qkv_bias, qk_scale=qk_scale,
            drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer,
            sr_ratio=sr_ratios[3])
            for i in range(depths[3])])
        self.norm4 = norm_layer(embed_dims[3])

        # classification head
        # self.head = nn.Linear(embed_dims[3], num_classes) if num_classes > 0 else nn.Identity()

        self.apply(self._init_weights)

    def _init_weights(self, m):
        """Initialise one sub-module; invoked for every sub-module through ``self.apply``."""
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            nn.init.constant_(m.bias, 0)
            nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def init_weights(self, pretrained=None):
        """Placeholder kept for interface compatibility; performs no loading.

        The checkpoint-loading call below is disabled in this file, so passing
        a path has no effect.
        """
        # load_checkpoint(self, pretrained, map_location='cpu', strict=False, logger=logger)
        return None

    def reset_drop_path(self, drop_path_rate):
        """Re-assign linearly increasing ``drop_prob`` values to every block's ``drop_path``."""
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(self.depths))]
        cur = 0
        for stage, depth in zip((self.block1, self.block2, self.block3, self.block4), self.depths):
            for i in range(depth):
                stage[i].drop_path.drop_prob = dpr[cur + i]
            cur += depth

    def freeze_patch_emb(self):
        """Stop gradient updates for every parameter of the first patch embedding."""
        # Assigning `module.requires_grad = False` only creates a plain attribute on the
        # module; `requires_grad_` is what actually flips the flag on its parameters.
        self.patch_embed1.requires_grad_(False)

    @torch.jit.ignore
    def no_weight_decay(self):
        """Return parameter names to exclude from weight decay."""
        return {'pos_embed1', 'pos_embed2', 'pos_embed3', 'pos_embed4', 'cls_token'}  # has pos_embed may be better

    def get_classifier(self):
        """Return the classification head, or ``None`` if none has been attached.

        The constructor does not create a head; one exists only after
        ``reset_classifier`` has been called.
        """
        return getattr(self, 'head', None)

    def reset_classifier(self, num_classes, global_pool=''):
        """Attach a fresh head sized for ``num_classes`` (``nn.Identity`` when it is not > 0).

        ``global_pool`` is accepted but unused. ``forward`` does not apply the head.
        """
        self.num_classes = num_classes
        # The head consumes features of the last stage, whose width is embed_dims[3].
        self.head = nn.Linear(self.embed_dims[3], num_classes) if num_classes > 0 else nn.Identity()

    def forward_features(self, x):
        """Run the four stages and return their feature maps as a list."""
        B = x.shape[0]
        outs = []

        stages = (
            (self.patch_embed1, self.block1, self.norm1),
            (self.patch_embed2, self.block2, self.norm2),
            (self.patch_embed3, self.block3, self.norm3),
            (self.patch_embed4, self.block4, self.norm4),
        )
        for patch_embed, blocks, norm in stages:
            x, H, W = patch_embed(x)
            for blk in blocks:
                x = blk(x, H, W)
            x = norm(x)
            # Tokens back to a channel-first map; it is both a stage output and the next stage's input.
            x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous()
            outs.append(x)

        return outs

        # return x.mean(dim=1)

    def forward(self, x):
        """Return the list of per-stage feature maps for input ``x``."""
        x = self.forward_features(x)
        # x = self.head(x)

        return x
517
+
518
+
519
class DWConv(nn.Module):
    """Depth-wise 3x3 convolution applied to a token sequence.

    Args:
        dim: Number of channels; also used as the number of groups.
    """

    def __init__(self, dim=768):
        super().__init__()
        self.dwconv = nn.Conv2d(dim, dim, kernel_size=3, stride=1, padding=1, bias=True, groups=dim)

    def forward(self, x, H, W):
        """Convolve tokens ``x`` of shape ``(B, N, C)`` on an ``H x W`` grid and return ``(B, N, C)``."""
        batch, _, channels = x.shape
        # Channels first, then unfold the token axis into the spatial grid.
        grid = x.transpose(1, 2).view(batch, channels, H, W).contiguous()
        grid = self.dwconv(grid)
        return grid.flatten(2).transpose(1, 2)
531
+
532
+
533
+ def _conv_filter(state_dict, patch_size=16):
534
+ """ convert patch embedding weight from manual patchify + linear proj to conv"""
535
+ out_dict = {}
536
+ for k, v in state_dict.items():
537
+ if 'patch_embed.proj.weight' in k:
538
+ v = v.reshape((v.shape[0], 3, patch_size, patch_size))
539
+ out_dict[k] = v
540
+
541
+ return out_dict
542
+
543
+
544
+ ## @register_model
545
class pvt_v2_b0(PyramidVisionTransformerImpr):
    """PVT v2 variant with stage widths [32, 64, 160, 256] and depths [2, 2, 2, 2].

    Args:
        in_channels: Number of input image channels (default 3, the value the
            base class used when this argument could not be set).
        **kwargs: Accepted for call compatibility and ignored.
    """

    def __init__(self, in_channels=3, **kwargs):
        super(pvt_v2_b0, self).__init__(
            patch_size=4, embed_dims=[32, 64, 160, 256], num_heads=[1, 2, 5, 8], mlp_ratios=[8, 8, 4, 4],
            qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[2, 2, 2, 2], sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0, drop_path_rate=0.1, in_channels=in_channels)
551
+
552
+
553
+
554
+ ## @register_model
555
class pvt_v2_b1(PyramidVisionTransformerImpr):
    """PVT v2 variant with stage widths [64, 128, 320, 512] and depths [2, 2, 2, 2].

    Args:
        in_channels: Number of input image channels (default 3, the value the
            base class used when this argument could not be set).
        **kwargs: Accepted for call compatibility and ignored.
    """

    def __init__(self, in_channels=3, **kwargs):
        super(pvt_v2_b1, self).__init__(
            patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[8, 8, 4, 4],
            qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[2, 2, 2, 2], sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0, drop_path_rate=0.1, in_channels=in_channels)
561
+
562
+ ## @register_model
563
class pvt_v2_b2(PyramidVisionTransformerImpr):
    """PVT v2 variant with stage widths [64, 128, 320, 512] and depths [3, 4, 6, 3].

    Args:
        in_channels: Number of input image channels.
        **kwargs: Accepted for call compatibility and ignored.
    """

    def __init__(self, in_channels=3, **kwargs):
        layer_norm = partial(nn.LayerNorm, eps=1e-6)
        super(pvt_v2_b2, self).__init__(
            patch_size=4,
            in_channels=in_channels,
            embed_dims=[64, 128, 320, 512],
            num_heads=[1, 2, 5, 8],
            mlp_ratios=[8, 8, 4, 4],
            qkv_bias=True,
            drop_rate=0.0,
            drop_path_rate=0.1,
            norm_layer=layer_norm,
            depths=[3, 4, 6, 3],
            sr_ratios=[8, 4, 2, 1],
        )
569
+
570
+ ## @register_model
571
class pvt_v2_b3(PyramidVisionTransformerImpr):
    """PVT v2 variant with stage widths [64, 128, 320, 512] and depths [3, 4, 18, 3].

    Args:
        in_channels: Number of input image channels (default 3, the value the
            base class used when this argument could not be set).
        **kwargs: Accepted for call compatibility and ignored.
    """

    def __init__(self, in_channels=3, **kwargs):
        super(pvt_v2_b3, self).__init__(
            patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[8, 8, 4, 4],
            qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 4, 18, 3], sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0, drop_path_rate=0.1, in_channels=in_channels)
577
+
578
+ ## @register_model
579
class pvt_v2_b4(PyramidVisionTransformerImpr):
    """PVT v2 variant with stage widths [64, 128, 320, 512] and depths [3, 8, 27, 3].

    Args:
        in_channels: Number of input image channels (default 3, the value the
            base class used when this argument could not be set).
        **kwargs: Accepted for call compatibility and ignored.
    """

    def __init__(self, in_channels=3, **kwargs):
        super(pvt_v2_b4, self).__init__(
            patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[8, 8, 4, 4],
            qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 8, 27, 3], sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0, drop_path_rate=0.1, in_channels=in_channels)
585
+
586
+
587
+ ## @register_model
588
class pvt_v2_b5(PyramidVisionTransformerImpr):
    """PVT v2 variant with stage widths [64, 128, 320, 512], depths [3, 6, 40, 3] and MLP ratio 4 in every stage.

    Args:
        in_channels: Number of input image channels (default 3, the value the
            base class used when this argument could not be set).
        **kwargs: Accepted for call compatibility and ignored.
    """

    def __init__(self, in_channels=3, **kwargs):
        super(pvt_v2_b5, self).__init__(
            patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[4, 4, 4, 4],
            qkv_bias=True, norm_layer=partial(nn.LayerNorm, eps=1e-6), depths=[3, 6, 40, 3], sr_ratios=[8, 4, 2, 1],
            drop_rate=0.0, drop_path_rate=0.1, in_channels=in_channels)
594
+
595
+
596
+
597
+ ### models/backbones/swin_v1.py
598
+
599
+ # --------------------------------------------------------
600
+ # Swin Transformer
601
+ # Copyright (c) 2021 Microsoft
602
+ # Licensed under The MIT License [see LICENSE for details]
603
+ # Written by Ze Liu, Yutong Lin, Yixuan Wei
604
+ # --------------------------------------------------------
605
+
606
+ import torch
607
+ import torch.nn as nn
608
+ import torch.nn.functional as F
609
+ import torch.utils.checkpoint as checkpoint
610
+ import numpy as np
611
+ from timm.models.layers import DropPath, to_2tuple, trunc_normal_
612
+
613
+ # from config import Config
614
+
615
+
616
+ # config = Config()
617
+
618
class Mlp(nn.Module):
    """Two-layer perceptron: linear -> activation -> dropout -> linear -> dropout.

    Args:
        in_features: Size of the last input dimension.
        hidden_features: Hidden width; falls back to ``in_features``.
        out_features: Output width; falls back to ``in_features``.
        act_layer: Activation class, instantiated without arguments.
        drop: Dropout probability (one ``nn.Dropout`` instance is used twice).
    """

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        hidden_width = hidden_features or in_features
        output_width = out_features or in_features

        self.fc1 = nn.Linear(in_features, hidden_width)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_width, output_width)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """Apply the perceptron to the last dimension of ``x``."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
637
+
638
+
639
def window_partition(x, window_size):
    """Split a feature map into non-overlapping square windows.

    Args:
        x: (B, H, W, C)
        window_size (int): window size

    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    batch, height, width, channels = x.shape
    rows = height // window_size
    cols = width // window_size
    # Expose the window grid as separate axes, then bring the two grid axes together.
    tiled = x.view(batch, rows, window_size, cols, window_size, channels)
    tiled = tiled.permute(0, 1, 3, 2, 4, 5).contiguous()
    return tiled.view(-1, window_size, window_size, channels)
652
+
653
+
654
def window_reverse(windows, window_size, H, W):
    """Reassemble windows into a full feature map.

    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image

    Returns:
        x: (B, H, W, C)
    """
    windows_per_image = H * W / window_size / window_size
    batch = int(windows.shape[0] / windows_per_image)
    rows = H // window_size
    cols = W // window_size
    # Restore the (rows, cols) window grid, then interleave it with the in-window axes.
    grid = windows.view(batch, rows, cols, window_size, window_size, -1)
    grid = grid.permute(0, 1, 3, 2, 4, 5).contiguous()
    return grid.view(batch, H, W, -1)
669
+
670
+
671
class WindowAttention(nn.Module):
    """ Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both of shifted and non-shifted window.

    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    """

    def __init__(self, dim, window_size, num_heads, qkv_bias=True, qk_scale=None, attn_drop=0., proj_drop=0.):

        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        # define a parameter table of relative position bias
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads))  # 2*Wh-1 * 2*Ww-1, nH

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(torch.meshgrid([coords_h, coords_w], indexing='ij'))  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        self.register_buffer("relative_position_index", relative_position_index)

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop_prob = attn_drop
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        trunc_normal_(self.relative_position_bias_table, std=.02)
        self.softmax = nn.Softmax(dim=-1)

    def _relative_position_bias(self):
        """Gather the learned bias table into a (nH, Wh*Ww, Wh*Ww) tensor."""
        tokens = self.window_size[0] * self.window_size[1]
        bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
            tokens, tokens, -1)  # Wh*Ww,Wh*Ww,nH
        return bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww

    def forward(self, x, mask=None):
        """ Forward function.

        Both attention paths compute softmax(scale * q @ k^T + relative_position_bias [+ mask]) @ v.
        The fused path is used when ``config.SDPA_enabled`` is true; otherwise the explicit
        matmul/softmax path runs.

        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: additive mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
        """
        B_, N, C = x.shape
        qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)

        q = q * self.scale
        relative_position_bias = self._relative_position_bias()  # nH, N, N

        if config.SDPA_enabled:
            # Fold the relative position bias and the optional shifted-window mask into one
            # additive attention bias so the fused kernel matches the explicit path below
            # (previously both were silently dropped here).
            if mask is not None:
                nW = mask.shape[0]
                attn_bias = relative_position_bias.unsqueeze(0).unsqueeze(0) \
                    + mask.unsqueeze(1).unsqueeze(0)  # 1, nW, nH, N, N
                attn_bias = attn_bias.expand(B_ // nW, -1, -1, -1, -1).reshape(B_, self.num_heads, N, N)
            else:
                attn_bias = relative_position_bias.unsqueeze(0).expand(B_, -1, -1, -1)
            x = torch.nn.functional.scaled_dot_product_attention(
                # SDPA divides the logits by sqrt(head_dim) itself; q is already multiplied by
                # self.scale, so pre-multiply by sqrt(head_dim) to cancel the built-in factor.
                q * (q.shape[-1] ** 0.5), k, v,
                attn_mask=attn_bias.to(q.dtype),
                # The functional API has no notion of train/eval mode: disable dropout outside training.
                dropout_p=self.attn_drop_prob if self.training else 0.0,
                is_causal=False
            ).transpose(1, 2).reshape(B_, N, C)
        else:
            attn = (q @ k.transpose(-2, -1))
            attn = attn + relative_position_bias.unsqueeze(0)

            if mask is not None:
                nW = mask.shape[0]
                attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0)
                attn = attn.view(-1, self.num_heads, N, N)
                attn = self.softmax(attn)
            else:
                attn = self.softmax(attn)

            attn = self.attn_drop(attn)

            x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
760
+
761
+
762
class SwinTransformerBlock(nn.Module):
    """One Swin block: (shifted-)window attention followed by an MLP, each with a residual connection.

    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA; 0 disables the cyclic shift.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float, optional): Stochastic depth rate. Default: 0.0
        act_layer (nn.Module, optional): Activation layer. Default: nn.GELU
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
    """

    def __init__(self, dim, num_heads, window_size=7, shift_size=0,
                 mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0.,
                 act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.num_heads = num_heads
        self.window_size = window_size
        self.shift_size = shift_size
        self.mlp_ratio = mlp_ratio
        assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"

        # Attention sub-layer.
        self.norm1 = norm_layer(dim)
        self.attn = WindowAttention(
            dim,
            window_size=to_2tuple(self.window_size),
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            attn_drop=attn_drop,
            proj_drop=drop,
        )

        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()

        # Feed-forward sub-layer.
        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)

        # Spatial size of the incoming tokens; must be assigned by the caller before forward().
        self.H = None
        self.W = None

    def forward(self, x, mask_matrix):
        """ Forward function.

        The spatial resolution is read from ``self.H`` / ``self.W``.

        Args:
            x: Input feature, tensor size (B, H*W, C).
            mask_matrix: Attention mask for cyclic shift; only used when ``shift_size > 0``.
        """
        B, L, C = x.shape
        H, W = self.H, self.W
        assert L == H * W, "input feature has wrong size"

        ws = self.window_size
        shift = self.shift_size
        residual = x

        feat = self.norm1(x).view(B, H, W, C)

        # Pad on the right/bottom so both spatial sides become multiples of the window size.
        pad_right = (ws - W % ws) % ws
        pad_bottom = (ws - H % ws) % ws
        feat = F.pad(feat, (0, 0, 0, pad_right, 0, pad_bottom))
        Hp, Wp = feat.shape[1], feat.shape[2]

        # Cyclic shift (SW-MSA) when requested; the mask is only meaningful in that case.
        use_shift = shift > 0
        if use_shift:
            feat = torch.roll(feat, shifts=(-shift, -shift), dims=(1, 2))
        attn_mask = mask_matrix if use_shift else None

        # Windowed attention: partition -> attend -> merge.
        tokens = window_partition(feat, ws).view(-1, ws * ws, C)  # nW*B, ws*ws, C
        tokens = self.attn(tokens, mask=attn_mask)
        feat = window_reverse(tokens.view(-1, ws, ws, C), ws, Hp, Wp)  # B, Hp, Wp, C

        # Undo the cyclic shift and strip the padding.
        if use_shift:
            feat = torch.roll(feat, shifts=(shift, shift), dims=(1, 2))
        if pad_right > 0 or pad_bottom > 0:
            feat = feat[:, :H, :W, :].contiguous()
        feat = feat.view(B, H * W, C)

        # Residual connections around attention and the MLP.
        x = residual + self.drop_path(feat)
        return x + self.drop_path(self.mlp(self.norm2(x)))
862
+
863
+
864
class PatchMerging(nn.Module):
    """Downsample by 2x: concatenate each 2x2 neighbourhood (4*C channels), normalize, project to 2*C.

    Args:
        dim (int): Number of input channels.
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
    """
    def __init__(self, dim, norm_layer=nn.LayerNorm):
        super().__init__()
        self.dim = dim
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
        self.norm = norm_layer(4 * dim)

    def forward(self, x, H, W):
        """ Forward function.

        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.

        Returns:
            Tensor of size (B, ceil(H/2)*ceil(W/2), 2*C).
        """
        B, L, C = x.shape
        assert L == H * W, "input feature has wrong size"

        grid = x.view(B, H, W, C)

        # Odd sides get one extra zero row/column so every 2x2 cell is complete.
        if H % 2 == 1 or W % 2 == 1:
            grid = F.pad(grid, (0, 0, 0, W % 2, 0, H % 2))

        # (row offset, col offset) of the four members of each 2x2 cell, in concatenation order.
        cell_offsets = ((0, 0), (1, 0), (0, 1), (1, 1))
        merged = torch.cat([grid[:, r::2, c::2, :] for r, c in cell_offsets], -1)  # B H/2 W/2 4*C
        merged = merged.view(B, -1, 4 * C)  # B H/2*W/2 4*C

        return self.reduction(self.norm(merged))
905
+
906
+
907
class BasicLayer(nn.Module):
    """ A basic Swin Transformer layer for one stage.

    Args:
        dim (int): Number of feature channels
        depth (int): Depths of this stage.
        num_heads (int): Number of attention head.
        window_size (int): Local window size. Default: 7.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate. Default: 0.0
        drop_path (float | list[float] | tuple[float], optional): Stochastic depth rate, either one
            value for all blocks or one value per block. Default: 0.0
        norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm
        downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
    """

    def __init__(self,
                 dim,
                 depth,
                 num_heads,
                 window_size=7,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop=0.,
                 attn_drop=0.,
                 drop_path=0.,
                 norm_layer=nn.LayerNorm,
                 downsample=None,
                 use_checkpoint=False):
        super().__init__()
        self.window_size = window_size
        self.shift_size = window_size // 2
        self.depth = depth
        self.use_checkpoint = use_checkpoint

        # build blocks: even-indexed blocks use plain windows, odd-indexed blocks use shifted windows
        self.blocks = nn.ModuleList([
            SwinTransformerBlock(
                dim=dim,
                num_heads=num_heads,
                window_size=window_size,
                shift_size=0 if (i % 2 == 0) else window_size // 2,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop,
                attn_drop=attn_drop,
                # Per-block rates may arrive as a list or (as documented) a tuple.
                drop_path=drop_path[i] if isinstance(drop_path, (list, tuple)) else drop_path,
                norm_layer=norm_layer)
            for i in range(depth)])

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(dim=dim, norm_layer=norm_layer)
        else:
            self.downsample = None

    def forward(self, x, H, W):
        """ Forward function.

        Args:
            x: Input feature, tensor size (B, H*W, C).
            H, W: Spatial resolution of the input feature.

        Returns:
            Tuple ``(x, H, W, x_down, Wh, Ww)``: the stage output with its resolution, followed by
            the downsampled output and its resolution (identical to the first three entries when
            the stage has no downsample layer).
        """

        # calculate attention mask for SW-MSA on the padded resolution
        Hp = int(np.ceil(H / self.window_size)) * self.window_size
        Wp = int(np.ceil(W / self.window_size)) * self.window_size
        img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device)  # 1 Hp Wp 1
        h_slices = (slice(0, -self.window_size),
                    slice(-self.window_size, -self.shift_size),
                    slice(-self.shift_size, None))
        w_slices = (slice(0, -self.window_size),
                    slice(-self.window_size, -self.shift_size),
                    slice(-self.shift_size, None))
        # Label each of the 3x3 regions with a distinct id; tokens with different ids must not attend.
        cnt = 0
        for h in h_slices:
            for w in w_slices:
                img_mask[:, h, w, :] = cnt
                cnt += 1

        mask_windows = window_partition(img_mask, self.window_size)  # nW, window_size, window_size, 1
        mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
        attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
        attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
        # Match the activation dtype so adding the mask does not promote fp16/bf16 logits to float32.
        attn_mask = attn_mask.to(x.dtype)

        for blk in self.blocks:
            blk.H, blk.W = H, W
            if self.use_checkpoint:
                x = checkpoint.checkpoint(blk, x, attn_mask)
            else:
                x = blk(x, attn_mask)
        if self.downsample is not None:
            x_down = self.downsample(x, H, W)
            Wh, Ww = (H + 1) // 2, (W + 1) // 2
            return x, H, W, x_down, Wh, Ww
        else:
            return x, H, W, x, H, W
1009
+
1010
+
1011
class PatchEmbed(nn.Module):
    """Project an image into a grid of patch embeddings with a strided convolution.

    Args:
        patch_size (int): Patch token size. Default: 4.
        in_channels (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        norm_layer (nn.Module, optional): Normalization layer. Default: None
    """

    def __init__(self, patch_size=4, in_channels=3, embed_dim=96, norm_layer=None):
        super().__init__()
        self.patch_size = to_2tuple(patch_size)
        self.in_channels = in_channels
        self.embed_dim = embed_dim

        # Kernel size == stride, so the convolution sees each patch exactly once.
        self.proj = nn.Conv2d(in_channels, embed_dim, kernel_size=self.patch_size, stride=self.patch_size)
        self.norm = None if norm_layer is None else norm_layer(embed_dim)

    def forward(self, x):
        """Return the embedded feature map of shape (B, embed_dim, Wh, Ww)."""
        patch_h, patch_w = self.patch_size[0], self.patch_size[1]
        _, _, height, width = x.size()

        # Pad on the right/bottom so both sides are multiples of the patch size.
        if width % patch_w != 0:
            x = F.pad(x, (0, patch_w - width % patch_w))
        if height % patch_h != 0:
            x = F.pad(x, (0, 0, 0, patch_h - height % patch_h))

        x = self.proj(x)  # B C Wh Ww
        if self.norm is None:
            return x

        # The norm layer works on the channel dimension, so move channels last and back again.
        Wh, Ww = x.size(2), x.size(3)
        tokens = self.norm(x.flatten(2).transpose(1, 2))
        return tokens.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww)
1052
+
1053
+
1054
class SwinTransformer(nn.Module):
    """ Swin Transformer backbone.
        A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows`  -
          https://arxiv.org/pdf/2103.14030

    Args:
        pretrain_img_size (int): Input image size for training the pretrained model,
            used in absolute position embedding. Default 224.
        patch_size (int | tuple(int)): Patch size. Default: 4.
        in_channels (int): Number of input image channels. Default: 3.
        embed_dim (int): Number of linear projection output channels. Default: 96.
        depths (tuple[int]): Depths of each Swin Transformer stage.
        num_heads (tuple[int]): Number of attention head of each stage.
        window_size (int): Window size. Default: 7.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4.
        qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float): Override default qk scale of head_dim ** -0.5 if set.
        drop_rate (float): Dropout rate.
        attn_drop_rate (float): Attention dropout rate. Default: 0.
        drop_path_rate (float): Stochastic depth rate. Default: 0.2.
        norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm.
        ape (bool): If True, add absolute position embedding to the patch embedding. Default: False.
        patch_norm (bool): If True, add normalization after patch embedding. Default: True.
        out_indices (Sequence[int]): Output from which stages.
        frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
            -1 means not freezing any parameters.
        use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False.
    """

    def __init__(self,
                 pretrain_img_size=224,
                 patch_size=4,
                 in_channels=3,
                 embed_dim=96,
                 depths=[2, 2, 6, 2],
                 num_heads=[3, 6, 12, 24],
                 window_size=7,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 qk_scale=None,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 drop_path_rate=0.2,
                 norm_layer=nn.LayerNorm,
                 ape=False,
                 patch_norm=True,
                 out_indices=(0, 1, 2, 3),
                 frozen_stages=-1,
                 use_checkpoint=False):
        super().__init__()

        self.pretrain_img_size = pretrain_img_size
        self.num_layers = len(depths)
        self.embed_dim = embed_dim
        self.ape = ape
        self.patch_norm = patch_norm
        self.out_indices = out_indices
        self.frozen_stages = frozen_stages

        # split image into non-overlapping patches
        self.patch_embed = PatchEmbed(
            patch_size=patch_size, in_channels=in_channels, embed_dim=embed_dim,
            norm_layer=norm_layer if self.patch_norm else None)

        # absolute position embedding
        if self.ape:
            pretrain_img_size = to_2tuple(pretrain_img_size)
            patch_size = to_2tuple(patch_size)
            patches_resolution = [pretrain_img_size[0] // patch_size[0], pretrain_img_size[1] // patch_size[1]]

            self.absolute_pos_embed = nn.Parameter(torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]))
            trunc_normal_(self.absolute_pos_embed, std=.02)

        self.pos_drop = nn.Dropout(p=drop_rate)

        # stochastic depth
        dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]  # stochastic depth decay rule

        # build layers; the channel count doubles at every stage
        self.layers = nn.ModuleList()
        for i_layer in range(self.num_layers):
            layer = BasicLayer(
                dim=int(embed_dim * 2 ** i_layer),
                depth=depths[i_layer],
                num_heads=num_heads[i_layer],
                window_size=window_size,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                qk_scale=qk_scale,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])],
                norm_layer=norm_layer,
                downsample=PatchMerging if (i_layer < self.num_layers - 1) else None,
                use_checkpoint=use_checkpoint)
            self.layers.append(layer)

        num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)]
        self.num_features = num_features

        # add a norm layer for each output
        for i_layer in out_indices:
            layer = norm_layer(num_features[i_layer])
            layer_name = f'norm{i_layer}'
            self.add_module(layer_name, layer)

        self._freeze_stages()

    def _freeze_stages(self):
        """Put the first ``frozen_stages`` parts of the network in eval mode and stop their gradients."""
        if self.frozen_stages >= 0:
            self.patch_embed.eval()
            for param in self.patch_embed.parameters():
                param.requires_grad = False

        if self.frozen_stages >= 1 and self.ape:
            self.absolute_pos_embed.requires_grad = False

        if self.frozen_stages >= 2:
            self.pos_drop.eval()
            for i in range(0, self.frozen_stages - 1):
                m = self.layers[i]
                m.eval()
                for param in m.parameters():
                    param.requires_grad = False

    def forward(self, x):
        """Forward function.

        Returns:
            tuple of feature maps, one per index in ``out_indices``, each of shape (B, C_i, H_i, W_i).
        """
        x = self.patch_embed(x)

        Wh, Ww = x.size(2), x.size(3)
        if self.ape:
            # interpolate the position embedding to the corresponding size
            absolute_pos_embed = F.interpolate(self.absolute_pos_embed, size=(Wh, Ww), mode='bicubic')
            x = (x + absolute_pos_embed)  # B C Wh Ww

        outs = []
        x = x.flatten(2).transpose(1, 2)  # B Wh*Ww C
        x = self.pos_drop(x)
        for i in range(self.num_layers):
            layer = self.layers[i]
            # x_out is this stage's output; x is the (possibly downsampled) input to the next stage.
            x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww)

            if i in self.out_indices:
                norm_layer = getattr(self, f'norm{i}')
                x_out = norm_layer(x_out)

                out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous()
                outs.append(out)

        return tuple(outs)

    def train(self, mode=True):
        """Convert the model into training mode while keeping the frozen stages frozen.

        Returns:
            self, as ``nn.Module.train`` does; ``nn.Module.eval()`` returns the result of
            ``train(False)``, so without this return value ``model.eval()`` would yield ``None``.
        """
        super(SwinTransformer, self).train(mode)
        self._freeze_stages()
        return self
1210
+
1211
def swin_v1_t():
    """Build the tiny variant: embed_dim 96, depths (2, 2, 6, 2), window size 7."""
    return SwinTransformer(embed_dim=96, depths=[2, 2, 6, 2], num_heads=[3, 6, 12, 24], window_size=7)
1214
+
1215
def swin_v1_s():
    """Build the small variant: embed_dim 96, depths (2, 2, 18, 2), window size 7."""
    return SwinTransformer(embed_dim=96, depths=[2, 2, 18, 2], num_heads=[3, 6, 12, 24], window_size=7)
1218
+
1219
def swin_v1_b():
    """Build the base variant: embed_dim 128, depths (2, 2, 18, 2), window size 12."""
    return SwinTransformer(embed_dim=128, depths=[2, 2, 18, 2], num_heads=[4, 8, 16, 32], window_size=12)
1222
+
1223
def swin_v1_l():
    """Build the large variant: embed_dim 192, depths (2, 2, 18, 2), window size 12."""
    return SwinTransformer(embed_dim=192, depths=[2, 2, 18, 2], num_heads=[6, 12, 24, 48], window_size=12)
1226
+
1227
+
1228
+
1229
+ ### models/modules/deform_conv.py
1230
+
1231
+ import torch
1232
+ import torch.nn as nn
1233
+ from torchvision.ops import deform_conv2d
1234
+
1235
+
1236
class DeformableConv2d(nn.Module):
    """Modulated deformable convolution built on ``torchvision.ops.deform_conv2d``.

    Two auxiliary convolutions predict, from the input itself, the sampling offsets and a
    per-tap modulation mask; the weights of ``regular_conv`` are then applied at the offset
    positions.

    Args:
        in_channels (int): Number of input channels.
        out_channels (int): Number of output channels.
        kernel_size (int | tuple[int, int]): Kernel size. Default: 3.
        stride (int | tuple[int, int]): Stride. Default: 1.
        padding (int | tuple[int, int]): Padding passed to every convolution. Default: 1.
        bias (bool): Whether the main convolution has a bias. Default: False.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size=3,
                 stride=1,
                 padding=1,
                 bias=False):

        super(DeformableConv2d, self).__init__()

        # isinstance (rather than an exact type comparison) also admits tuple
        # subclasses such as torch.Size or named tuples.
        assert isinstance(kernel_size, (tuple, int))

        kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
        self.stride = stride if isinstance(stride, tuple) else (stride, stride)
        self.padding = padding

        # Two offset values (one per spatial axis) for every kernel tap.
        self.offset_conv = nn.Conv2d(in_channels,
                                     2 * kernel_size[0] * kernel_size[1],
                                     kernel_size=kernel_size,
                                     stride=stride,
                                     padding=self.padding,
                                     bias=True)

        # Zero init: the layer starts with zero offsets.
        nn.init.constant_(self.offset_conv.weight, 0.)
        nn.init.constant_(self.offset_conv.bias, 0.)

        # One modulation scalar per kernel tap.
        self.modulator_conv = nn.Conv2d(in_channels,
                                        1 * kernel_size[0] * kernel_size[1],
                                        kernel_size=kernel_size,
                                        stride=stride,
                                        padding=self.padding,
                                        bias=True)

        # Zero init: sigmoid(0) * 2 == 1, so the initial modulation is the identity.
        nn.init.constant_(self.modulator_conv.weight, 0.)
        nn.init.constant_(self.modulator_conv.bias, 0.)

        # Only the weight and bias of this module are used; see forward().
        self.regular_conv = nn.Conv2d(in_channels,
                                      out_channels=out_channels,
                                      kernel_size=kernel_size,
                                      stride=stride,
                                      padding=self.padding,
                                      bias=bias)

    def forward(self, x):
        """Apply the deformable convolution to ``x``."""
        offset = self.offset_conv(x)
        # Modulation values lie in (0, 2).
        modulator = 2. * torch.sigmoid(self.modulator_conv(x))

        x = deform_conv2d(
            input=x,
            offset=offset,
            weight=self.regular_conv.weight,
            bias=self.regular_conv.bias,
            padding=self.padding,
            mask=modulator,
            stride=self.stride,
        )
        return x
1297
+
1298
+
1299
+
1300
+
1301
+ ### utils.py
1302
+
1303
+ import torch.nn as nn
1304
+
1305
+
1306
def build_act_layer(act_layer):
    """Instantiate an activation module from its name.

    Args:
        act_layer (str): One of 'ReLU', 'SiLU' or 'GELU'.

    Returns:
        nn.Module: a fresh activation module (ReLU and SiLU are created with ``inplace=True``).

    Raises:
        NotImplementedError: if the name is not one of the supported activations.
    """
    supported = (
        ('ReLU', lambda: nn.ReLU(inplace=True)),
        ('SiLU', lambda: nn.SiLU(inplace=True)),
        ('GELU', lambda: nn.GELU()),
    )
    for name, make in supported:
        if act_layer == name:
            return make()

    raise NotImplementedError(f'build_act_layer does not support {act_layer}')
1315
+
1316
+
1317
def build_norm_layer(dim,
                     norm_layer,
                     in_format='channels_last',
                     out_format='channels_last',
                     eps=1e-6):
    """Build a normalization layer wrapped with the layout permutations it needs.

    BatchNorm2d operates on channels-first tensors and LayerNorm on channels-last tensors, so a
    permutation module is inserted before and/or after the norm whenever the requested
    input/output layout differs from the norm's own layout.

    Args:
        dim (int): Number of channels to normalize.
        norm_layer (str): 'BN' or 'LN'.
        in_format (str): Layout of the incoming tensor. Default: 'channels_last'.
        out_format (str): Layout of the returned tensor. Default: 'channels_last'.
        eps (float): Epsilon for LayerNorm (not used for 'BN'). Default: 1e-6.

    Returns:
        nn.Sequential containing the optional permutations and the norm.

    Raises:
        NotImplementedError: if ``norm_layer`` is neither 'BN' nor 'LN'.
    """
    if norm_layer == 'BN':
        head = [to_channels_first()] if in_format == 'channels_last' else []
        core = [nn.BatchNorm2d(dim)]
        tail = [to_channels_last()] if out_format == 'channels_last' else []
    elif norm_layer == 'LN':
        head = [to_channels_last()] if in_format == 'channels_first' else []
        core = [nn.LayerNorm(dim, eps=eps)]
        tail = [to_channels_first()] if out_format == 'channels_first' else []
    else:
        raise NotImplementedError(
            f'build_norm_layer does not support {norm_layer}')
    return nn.Sequential(*head, *core, *tail)
1339
+
1340
+
1341
class to_channels_first(nn.Module):
    """Permute a 4-D tensor from (B, H, W, C) to (B, C, H, W)."""

    def forward(self, x):
        batch_axis, height_axis, width_axis, channel_axis = 0, 1, 2, 3
        return x.permute(batch_axis, channel_axis, height_axis, width_axis)
1348
+
1349
+
1350
class to_channels_last(nn.Module):
    """Permute a 4-D tensor from (B, C, H, W) to (B, H, W, C)."""

    def forward(self, x):
        batch_axis, channel_axis, height_axis, width_axis = 0, 1, 2, 3
        return x.permute(batch_axis, height_axis, width_axis, channel_axis)
1357
+
1358
+
1359
+
1360
+ ### dataset.py
1361
+
1362
# Class names stored as one ', '-separated string (adjacent literals are
# concatenated by the parser) and split into the list ``class_labels_TR_sorted`` below.
# NOTE(review): the ``_TR_sorted`` suffix suggests these are the alphabetically
# sorted training-set labels — confirm against the dataset definition.
_class_labels_TR_sorted = (
    'Airplane, Ant, Antenna, Archery, Axe, BabyCarriage, Bag, BalanceBeam, Balcony, Balloon, Basket, BasketballHoop, Beatle, Bed, Bee, Bench, Bicycle, '
    'BicycleFrame, BicycleStand, Boat, Bonsai, BoomLift, Bridge, BunkBed, Butterfly, Button, Cable, CableLift, Cage, Camcorder, Cannon, Canoe, Car, '
    'CarParkDropArm, Carriage, Cart, Caterpillar, CeilingLamp, Centipede, Chair, Clip, Clock, Clothes, CoatHanger, Comb, ConcretePumpTruck, Crack, Crane, '
    'Cup, DentalChair, Desk, DeskChair, Diagram, DishRack, DoorHandle, Dragonfish, Dragonfly, Drum, Earphone, Easel, ElectricIron, Excavator, Eyeglasses, '
    'Fan, Fence, Fencing, FerrisWheel, FireExtinguisher, Fishing, Flag, FloorLamp, Forklift, GasStation, Gate, Gear, Goal, Golf, GymEquipment, Hammock, '
    'Handcart, Handcraft, Handrail, HangGlider, Harp, Harvester, Headset, Helicopter, Helmet, Hook, HorizontalBar, Hydrovalve, IroningTable, Jewelry, Key, '
    'KidsPlayground, Kitchenware, Kite, Knife, Ladder, LaundryRack, Lightning, Lobster, Locust, Machine, MachineGun, MagazineRack, Mantis, Medal, MemorialArchway, '
    'Microphone, Missile, MobileHolder, Monitor, Mosquito, Motorcycle, MovingTrolley, Mower, MusicPlayer, MusicStand, ObservationTower, Octopus, OilWell, '
    'OlympicLogo, OperatingTable, OutdoorFitnessEquipment, Parachute, Pavilion, Piano, Pipe, PlowHarrow, PoleVault, Punchbag, Rack, Racket, Rifle, Ring, Robot, '
    'RockClimbing, Rope, Sailboat, Satellite, Scaffold, Scale, Scissor, Scooter, Sculpture, Seadragon, Seahorse, Seal, SewingMachine, Ship, Shoe, ShoppingCart, '
    'ShoppingTrolley, Shower, Shrimp, Signboard, Skateboarding, Skeleton, Skiing, Spade, SpeedBoat, Spider, Spoon, Stair, Stand, Stationary, SteeringWheel, '
    'Stethoscope, Stool, Stove, StreetLamp, SweetStand, Swing, Sword, TV, Table, TableChair, TableLamp, TableTennis, Tank, Tapeline, Teapot, Telescope, Tent, '
    'TobaccoPipe, Toy, Tractor, TrafficLight, TrafficSign, Trampoline, TransmissionTower, Tree, Tricycle, TrimmerCover, Tripod, Trombone, Truck, Trumpet, Tuba, '
    'UAV, Umbrella, UnevenBars, UtilityPole, VacuumCleaner, Violin, Wakesurfing, Watch, WaterTower, WateringPot, Well, WellLid, Wheel, Wheelchair, WindTurbine, Windmill, WineGlass, WireWhisk, Yacht'
)
# List form of the labels, one entry per class name.
class_labels_TR_sorted = _class_labels_TR_sorted.split(', ')
1379
+
1380
+
1381
+ ### models/backbones/build_backbones.py
1382
+
1383
+ import torch
1384
+ import torch.nn as nn
1385
+ from collections import OrderedDict
1386
+ from torchvision.models import vgg16, vgg16_bn, VGG16_Weights, VGG16_BN_Weights, resnet50, ResNet50_Weights
1387
+ # from models.pvt_v2 import pvt_v2_b0, pvt_v2_b1, pvt_v2_b2, pvt_v2_b5
1388
+ # from models.swin_v1 import swin_v1_t, swin_v1_s, swin_v1_b, swin_v1_l
1389
+ # from config import Config
1390
+
1391
+
1392
# Module-level configuration instance. Code in this file reads attributes from it, e.g.
# ``config.weights`` (load_weights), ``config.SDPA_enabled`` (WindowAttention) and
# ``config.dec_channels_inter`` / ``config.dec_att`` / ``config.batch_size`` (decoder blocks).
# NOTE(review): ``Config`` is not imported here (the import above is commented out);
# presumably it is defined earlier in this combined file — confirm.
config = Config()
1393
+
1394
def build_backbone(bb_name, pretrained=True, params_settings=''):
    """Create a backbone network by name.

    Args:
        bb_name (str): 'vgg16', 'vgg16bn', 'resnet50', or the name of a backbone factory
            function available in this module (e.g. 'swin_v1_t').
        pretrained (bool): Load pretrained weights. torchvision backbones use their default
            weights; other backbones are loaded through ``load_weights``. Default: True.
        params_settings (str): Source text of the arguments passed to the factory function
            for non-torchvision backbones. Default: ''.

    Returns:
        The backbone module. For non-torchvision backbones with ``pretrained=True`` this is the
        return value of ``load_weights``, which is ``None`` when no matching weights are found.
    """
    if bb_name == 'vgg16':
        # `weights=` is the supported keyword for the torchvision Weights enums;
        # `pretrained=` is the deprecated legacy interface.
        bb_net = list(vgg16(weights=VGG16_Weights.DEFAULT if pretrained else None).children())[0]
        bb = nn.Sequential(OrderedDict({'conv1': bb_net[:4], 'conv2': bb_net[4:9], 'conv3': bb_net[9:16], 'conv4': bb_net[16:23]}))
    elif bb_name == 'vgg16bn':
        bb_net = list(vgg16_bn(weights=VGG16_BN_Weights.DEFAULT if pretrained else None).children())[0]
        bb = nn.Sequential(OrderedDict({'conv1': bb_net[:6], 'conv2': bb_net[6:13], 'conv3': bb_net[13:23], 'conv4': bb_net[23:33]}))
    elif bb_name == 'resnet50':
        bb_net = list(resnet50(weights=ResNet50_Weights.DEFAULT if pretrained else None).children())
        # NOTE(review): child index 3 is skipped (conv1 takes children 0-2, conv2 starts at 4) —
        # confirm that this omission is intentional.
        bb = nn.Sequential(OrderedDict({'conv1': nn.Sequential(*bb_net[0:3]), 'conv2': bb_net[4], 'conv3': bb_net[5], 'conv4': bb_net[6]}))
    else:
        # SECURITY NOTE: eval() executes arbitrary code. `bb_name` and `params_settings` must
        # come from trusted configuration only, never from user-supplied input.
        bb = eval('{}({})'.format(bb_name, params_settings))
        if pretrained:
            bb = load_weights(bb, bb_name)
    return bb
1409
+
1410
def load_weights(model, model_name):
    """Load the checkpoint registered for ``model_name`` into ``model``.

    The checkpoint path is read from ``config.weights[model_name]``. Entries whose key exists
    in the model are used; an entry whose tensor size does not match keeps the model's current
    value, so backbones with modified layers still load. If no top-level key matches and the
    checkpoint has exactly one top-level item (e.g. ``{'state_dict': {...}}``), that item is
    searched instead.

    Args:
        model (nn.Module): Model that receives the weights.
        model_name (str): Key into ``config.weights``.

    Returns:
        The model with weights loaded, or ``None`` (after printing a message) when no usable
        weights are found.
    """
    save_model = torch.load(config.weights[model_name], map_location='cpu')
    model_dict = model.state_dict()

    def _matching_entries(entries):
        # to ignore the weights with mismatched size when I modify the backbone itself.
        return {k: v if v.size() == model_dict[k].size() else model_dict[k]
                for k, v in entries.items() if k in model_dict.keys()}

    state_dict = _matching_entries(save_model)
    if not state_dict:
        save_model_keys = list(save_model.keys())
        sub_item = save_model_keys[0] if len(save_model_keys) == 1 else None
        # Only descend when there is exactly one wrapper item and it is a mapping; indexing
        # with `None` used to raise KeyError before the failure message below could be printed.
        if sub_item and isinstance(save_model[sub_item], dict):
            state_dict = _matching_entries(save_model[sub_item])
        if not state_dict or not sub_item:
            print('Weights are not successully loaded. Check the state dict of weights file.')
            return None
        else:
            print('Found correct weights in the "{}" item of loaded state_dict.'.format(sub_item))
    model_dict.update(state_dict)
    model.load_state_dict(model_dict)
    return model
1427
+
1428
+
1429
+
1430
+ ### models/modules/decoder_blocks.py
1431
+
1432
+ import torch
1433
+ import torch.nn as nn
1434
+ # from models.aspp import ASPP, ASPPDeformable
1435
+ # from config import Config
1436
+
1437
+
1438
+ # config = Config()
1439
+
1440
+
1441
class BasicDecBlk(nn.Module):
    """Decoder block: 3x3 conv -> BN -> ReLU -> optional ASPP attention -> 3x3 conv -> BN.

    The intermediate width is chosen from ``config.dec_channels_inter`` ('adap' gives
    ``in_channels // 4``, anything else gives 64); the ``inter_channels`` argument is
    overridden by that choice. BatchNorm layers are replaced by identity when
    ``config.batch_size`` is not greater than 1.
    """

    def __init__(self, in_channels=64, out_channels=64, inter_channels=64):
        super(BasicDecBlk, self).__init__()
        if config.dec_channels_inter == 'adap':
            inter_channels = in_channels // 4
        else:
            inter_channels = 64

        self.conv_in = nn.Conv2d(in_channels, inter_channels, 3, 1, padding=1)
        self.relu_in = nn.ReLU(inplace=True)

        # Optional attention module; the attribute is simply absent for any other setting.
        attention = config.dec_att
        if attention == 'ASPP':
            self.dec_att = ASPP(in_channels=inter_channels)
        elif attention == 'ASPPDeformable':
            self.dec_att = ASPPDeformable(in_channels=inter_channels)

        self.conv_out = nn.Conv2d(inter_channels, out_channels, 3, 1, padding=1)

        use_batch_norm = config.batch_size > 1
        self.bn_in = nn.BatchNorm2d(inter_channels) if use_batch_norm else nn.Identity()
        self.bn_out = nn.BatchNorm2d(out_channels) if use_batch_norm else nn.Identity()

    def forward(self, x):
        """Run the block on a (B, in_channels, H, W) tensor."""
        x = self.relu_in(self.bn_in(self.conv_in(x)))
        if hasattr(self, 'dec_att'):
            x = self.dec_att(x)
        return self.bn_out(self.conv_out(x))
1464
+
1465
+
1466
class ResBlk(nn.Module):
    """Residual decoder block: conv -> BN -> ReLU -> optional ASPP attention -> conv -> BN,
    added to a 1x1-convolution projection of the input.

    ``out_channels`` defaults to ``in_channels``. The intermediate width is chosen from
    ``config.dec_channels_inter`` ('adap' gives ``in_channels // 4``, anything else gives 64);
    the ``inter_channels`` argument is overridden by that choice. BatchNorm layers are replaced
    by identity when ``config.batch_size`` is not greater than 1.
    """

    def __init__(self, in_channels=64, out_channels=None, inter_channels=64):
        super(ResBlk, self).__init__()
        out_channels = in_channels if out_channels is None else out_channels
        if config.dec_channels_inter == 'adap':
            inter_channels = in_channels // 4
        else:
            inter_channels = 64
        use_batch_norm = config.batch_size > 1

        # Main branch, first half.
        self.conv_in = nn.Conv2d(in_channels, inter_channels, 3, 1, padding=1)
        self.bn_in = nn.BatchNorm2d(inter_channels) if use_batch_norm else nn.Identity()
        self.relu_in = nn.ReLU(inplace=True)

        # Optional attention module; the attribute is simply absent for any other setting.
        attention = config.dec_att
        if attention == 'ASPP':
            self.dec_att = ASPP(in_channels=inter_channels)
        elif attention == 'ASPPDeformable':
            self.dec_att = ASPPDeformable(in_channels=inter_channels)

        # Main branch, second half.
        self.conv_out = nn.Conv2d(inter_channels, out_channels, 3, 1, padding=1)
        self.bn_out = nn.BatchNorm2d(out_channels) if use_batch_norm else nn.Identity()

        # Shortcut branch: 1x1 projection to the output width.
        self.conv_resi = nn.Conv2d(in_channels, out_channels, 1, 1, 0)

    def forward(self, x):
        """Run the block on a (B, in_channels, H, W) tensor."""
        shortcut = self.conv_resi(x)
        y = self.relu_in(self.bn_in(self.conv_in(x)))
        if hasattr(self, 'dec_att'):
            y = self.dec_att(y)
        y = self.bn_out(self.conv_out(y))
        return y + shortcut
1497
+
1498
+
1499
+
1500
+ ### models/modules/lateral_blocks.py
1501
+
1502
+ import numpy as np
1503
+ import torch
1504
+ import torch.nn as nn
1505
+ import torch.nn.functional as F
1506
+ from functools import partial
1507
+
1508
+ # from config import Config
1509
+
1510
+
1511
+ # config = Config()
1512
+
1513
+
1514
class BasicLatBlk(nn.Module):
    """Lateral (skip-connection) block: a single 1x1 convolution.

    Args:
        in_channels: number of channels of the incoming encoder feature map.
        out_channels: number of channels produced for the decoder.
        inter_channels: unused; kept so the signature matches the other block types.
    """

    def __init__(self, in_channels=64, out_channels=64, inter_channels=64):
        super(BasicLatBlk, self).__init__()
        # The block has no intermediate layer, so no intermediate width is computed
        # (the previous code derived one from the global config and then ignored it).
        self.conv = nn.Conv2d(in_channels, out_channels, 1, 1, 0)

    def forward(self, x):
        """Project ``x`` to ``out_channels`` without changing its spatial size."""
        return self.conv(x)
1523
+
1524
+
1525
+
1526
+ ### models/modules/aspp.py
1527
+
1528
+ import torch
1529
+ import torch.nn as nn
1530
+ import torch.nn.functional as F
1531
+ # from models.deform_conv import DeformableConv2d
1532
+ # from config import Config
1533
+
1534
+
1535
+ # config = Config()
1536
+
1537
+
1538
class _ASPPModule(nn.Module):
    """One ASPP branch: bias-free dilated convolution -> norm -> ReLU.

    Args:
        in_channels: number of input channels.
        planes: number of output channels of the branch.
        kernel_size: convolution kernel size.
        padding: convolution padding.
        dilation: convolution dilation rate.
    """

    def __init__(self, in_channels, planes, kernel_size, padding, dilation):
        super().__init__()
        self.atrous_conv = nn.Conv2d(
            in_channels,
            planes,
            kernel_size=kernel_size,
            stride=1,
            padding=padding,
            dilation=dilation,
            bias=False,
        )
        # BatchNorm only when the configured batch size is larger than one.
        if config.batch_size > 1:
            self.bn = nn.BatchNorm2d(planes)
        else:
            self.bn = nn.Identity()
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Apply conv, norm and ReLU in sequence."""
        return self.relu(self.bn(self.atrous_conv(x)))
1551
+
1552
+
1553
class ASPP(nn.Module):
    """Atrous spatial pyramid pooling head.

    Four parallel convolution branches (one 1x1 and three dilated 3x3) plus a
    globally pooled branch are concatenated, fused by a 1x1 convolution, normalised,
    passed through ReLU and finally through dropout (p=0.5).

    Args:
        in_channels: number of input channels.
        out_channels: number of output channels; defaults to ``in_channels``.
        output_stride: selects the dilation set; only 16 and 8 are supported.

    Raises:
        NotImplementedError: if ``output_stride`` is neither 16 nor 8.
    """

    def __init__(self, in_channels=64, out_channels=None, output_stride=16):
        super().__init__()
        self.down_scale = 1
        if out_channels is None:
            out_channels = in_channels
        # Width of every parallel branch.
        self.in_channelster = 256 // self.down_scale
        if output_stride == 16:
            rates = [1, 6, 12, 18]
        elif output_stride == 8:
            rates = [1, 12, 24, 36]
        else:
            raise NotImplementedError
        width = self.in_channelster

        self.aspp1 = _ASPPModule(in_channels, width, 1, padding=0, dilation=rates[0])
        # For the 3x3 branches padding equals dilation.
        self.aspp2 = _ASPPModule(in_channels, width, 3, padding=rates[1], dilation=rates[1])
        self.aspp3 = _ASPPModule(in_channels, width, 3, padding=rates[2], dilation=rates[2])
        self.aspp4 = _ASPPModule(in_channels, width, 3, padding=rates[3], dilation=rates[3])

        pooled_norm = nn.BatchNorm2d(width) if config.batch_size > 1 else nn.Identity()
        self.global_avg_pool = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Conv2d(in_channels, width, 1, stride=1, bias=False),
            pooled_norm,
            nn.ReLU(inplace=True),
        )
        # Five branches are concatenated before the fusing convolution.
        self.conv1 = nn.Conv2d(width * 5, out_channels, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels) if config.batch_size > 1 else nn.Identity()
        self.relu = nn.ReLU(inplace=True)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Fuse the five branches computed from ``x`` and return the result."""
        branches = [self.aspp1(x), self.aspp2(x), self.aspp3(x), self.aspp4(x)]
        pooled = self.global_avg_pool(x)
        # Broadcast the 1x1 pooled descriptor back to the branch resolution.
        pooled = F.interpolate(pooled, size=branches[0].size()[2:], mode='bilinear', align_corners=True)
        fused = torch.cat((*branches, pooled), dim=1)

        fused = self.relu(self.bn1(self.conv1(fused)))

        return self.dropout(fused)
1595
+
1596
+
1597
+ ##################### Deformable
1598
class _ASPPModuleDeformable(nn.Module):
    """One deformable ASPP branch: bias-free ``DeformableConv2d`` -> norm -> ReLU.

    Args:
        in_channels: number of input channels.
        planes: number of output channels of the branch.
        kernel_size: kernel size passed to ``DeformableConv2d``.
        padding: padding passed to ``DeformableConv2d``.
    """

    def __init__(self, in_channels, planes, kernel_size, padding):
        super().__init__()
        self.atrous_conv = DeformableConv2d(
            in_channels,
            planes,
            kernel_size=kernel_size,
            stride=1,
            padding=padding,
            bias=False,
        )
        # BatchNorm only when the configured batch size is larger than one.
        if config.batch_size > 1:
            self.bn = nn.BatchNorm2d(planes)
        else:
            self.bn = nn.Identity()
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Apply conv, norm and ReLU in sequence."""
        return self.relu(self.bn(self.atrous_conv(x)))
1611
+
1612
+
1613
class ASPPDeformable(nn.Module):
    """ASPP head built from deformable-convolution branches.

    A 1x1 branch, one branch per entry of ``parallel_block_sizes`` and a globally
    pooled branch are concatenated, fused by a 1x1 convolution, normalised, passed
    through ReLU and finally through dropout (p=0.5).

    Args:
        in_channels: number of input channels.
        out_channels: number of output channels; defaults to ``in_channels``.
        parallel_block_sizes: kernel sizes of the additional parallel branches.
    """

    def __init__(self, in_channels, out_channels=None, parallel_block_sizes=[1, 3, 7]):
        super().__init__()
        self.down_scale = 1
        if out_channels is None:
            out_channels = in_channels
        # Width of every parallel branch.
        self.in_channelster = 256 // self.down_scale
        width = self.in_channelster

        self.aspp1 = _ASPPModuleDeformable(in_channels, width, 1, padding=0)
        deform_branches = []
        for conv_size in parallel_block_sizes:
            # Padding of half the kernel size.
            deform_branches.append(
                _ASPPModuleDeformable(in_channels, width, conv_size, padding=int(conv_size//2))
            )
        self.aspp_deforms = nn.ModuleList(deform_branches)

        pooled_norm = nn.BatchNorm2d(width) if config.batch_size > 1 else nn.Identity()
        self.global_avg_pool = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Conv2d(in_channels, width, 1, stride=1, bias=False),
            pooled_norm,
            nn.ReLU(inplace=True),
        )
        # Concatenated branches: the 1x1 branch, the pooled branch and every deformable branch.
        branch_count = 2 + len(self.aspp_deforms)
        self.conv1 = nn.Conv2d(width * branch_count, out_channels, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels) if config.batch_size > 1 else nn.Identity()
        self.relu = nn.ReLU(inplace=True)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Fuse all branches computed from ``x`` and return the result."""
        pointwise = self.aspp1(x)
        deformed = []
        for branch in self.aspp_deforms:
            deformed.append(branch(x))
        pooled = self.global_avg_pool(x)
        # Broadcast the 1x1 pooled descriptor back to the branch resolution.
        pooled = F.interpolate(pooled, size=pointwise.size()[2:], mode='bilinear', align_corners=True)
        fused = torch.cat((pointwise, *deformed, pooled), dim=1)

        fused = self.relu(self.bn1(self.conv1(fused)))

        return self.dropout(fused)
1647
+
1648
+
1649
+
1650
+ ### models/refinement/refiner.py
1651
+
1652
+ import torch
1653
+ import torch.nn as nn
1654
+ from collections import OrderedDict
1655
+ import torch
1656
+ import torch.nn as nn
1657
+ import torch.nn.functional as F
1658
+ from torchvision.models import vgg16, vgg16_bn
1659
+ from torchvision.models import resnet50
1660
+
1661
+ # from config import Config
1662
+ # from dataset import class_labels_TR_sorted
1663
+ # from models.build_backbone import build_backbone
1664
+ # from models.decoder_blocks import BasicDecBlk
1665
+ # from models.lateral_blocks import BasicLatBlk
1666
+ # from models.ing import *
1667
+ # from models.stem_layer import StemLayer
1668
+
1669
+
1670
class RefinerPVTInChannels4(nn.Module):
    """Refinement network whose backbone is built with ``params_settings='in_channels=4'``.

    The input (a tensor, or a list of tensors that is concatenated along the channel
    axis) goes through the backbone, the deepest feature is squeezed by a
    ``BasicDecBlk`` and all features are handed to ``Decoder``.

    Args:
        in_channels: accepted for signature compatibility; not read by this class.
    """

    def __init__(self, in_channels=3+1):
        super().__init__()
        self.config = Config()
        self.epoch = 1
        self.bb = build_backbone(self.config.bb, params_settings='in_channels=4')

        # Per-backbone channel counts, listed from the deepest stage to the shallowest.
        lateral_channels_in_collection = {
            'vgg16': [512, 256, 128, 64],
            'vgg16bn': [512, 256, 128, 64],
            'resnet50': [1024, 512, 256, 64],
            'pvt_v2_b2': [512, 320, 128, 64],
            'pvt_v2_b5': [512, 320, 128, 64],
            'swin_v1_b': [1024, 512, 256, 128],
            'swin_v1_l': [1536, 768, 384, 192],
        }
        channels = lateral_channels_in_collection[self.config.bb]
        deepest = channels[0]
        self.squeeze_module = BasicDecBlk(deepest, deepest)

        self.decoder = Decoder(channels)

        # Backbone freezing is hard-disabled; flip the flag to freeze every `bb.*` parameter.
        freeze_backbone = False
        if freeze_backbone:
            for name, param in self.named_parameters():
                if 'bb.' in name:
                    param.requires_grad = False

    def forward(self, x):
        """Return the decoder's list of predictions for ``x``."""
        if isinstance(x, list):
            x = torch.cat(x, dim=1)
        ########## Encoder ##########
        if self.config.bb in ['vgg16', 'vgg16bn', 'resnet50']:
            # CNN backbones expose their stages as conv1..conv4 and are chained by hand.
            stage_outputs = []
            feat = x
            for stage_name in ('conv1', 'conv2', 'conv3', 'conv4'):
                feat = getattr(self.bb, stage_name)(feat)
                stage_outputs.append(feat)
            x1, x2, x3, x4 = stage_outputs
        else:
            x1, x2, x3, x4 = self.bb(x)

        x4 = self.squeeze_module(x4)

        ########## Decoder ##########
        return self.decoder([x, x1, x2, x3, x4])
1712
+
1713
+
1714
class Refiner(nn.Module):
    """Refinement network: stem layer -> backbone -> squeeze block -> decoder.

    The stem layer maps the ``in_channels`` input to 3 channels before it is fed to
    the backbone built from ``Config().bb``.

    Args:
        in_channels: number of channels of the (concatenated) input.
    """

    def __init__(self, in_channels=3+1):
        super().__init__()
        self.config = Config()
        self.epoch = 1
        # 'BN' for multi-sample batches, 'LN' otherwise.
        stem_norm = 'BN' if self.config.batch_size > 1 else 'LN'
        self.stem_layer = StemLayer(
            in_channels=in_channels, inter_channels=48, out_channels=3, norm_layer=stem_norm
        )
        self.bb = build_backbone(self.config.bb)

        # Per-backbone channel counts, listed from the deepest stage to the shallowest.
        lateral_channels_in_collection = {
            'vgg16': [512, 256, 128, 64],
            'vgg16bn': [512, 256, 128, 64],
            'resnet50': [1024, 512, 256, 64],
            'pvt_v2_b2': [512, 320, 128, 64],
            'pvt_v2_b5': [512, 320, 128, 64],
            'swin_v1_b': [1024, 512, 256, 128],
            'swin_v1_l': [1536, 768, 384, 192],
        }
        channels = lateral_channels_in_collection[self.config.bb]
        deepest = channels[0]
        self.squeeze_module = BasicDecBlk(deepest, deepest)

        self.decoder = Decoder(channels)

        # Backbone freezing is hard-disabled; flip the flag to freeze every `bb.*` parameter.
        freeze_backbone = False
        if freeze_backbone:
            for name, param in self.named_parameters():
                if 'bb.' in name:
                    param.requires_grad = False

    def forward(self, x):
        """Return the decoder's list of predictions for ``x``."""
        if isinstance(x, list):
            x = torch.cat(x, dim=1)
        x = self.stem_layer(x)
        ########## Encoder ##########
        if self.config.bb in ['vgg16', 'vgg16bn', 'resnet50']:
            # CNN backbones expose their stages as conv1..conv4 and are chained by hand.
            stage_outputs = []
            feat = x
            for stage_name in ('conv1', 'conv2', 'conv3', 'conv4'):
                feat = getattr(self.bb, stage_name)(feat)
                stage_outputs.append(feat)
            x1, x2, x3, x4 = stage_outputs
        else:
            x1, x2, x3, x4 = self.bb(x)

        x4 = self.squeeze_module(x4)

        ########## Decoder ##########
        # Note: the first feature is the stem output, not the raw input.
        return self.decoder([x, x1, x2, x3, x4])
1758
+
1759
+
1760
class Decoder(nn.Module):
    """Top-down decoder used by the refiners.

    Each stage decodes the current feature, upsamples it to the next shallower
    encoder feature and adds that feature's lateral projection.  The last stage is
    upsampled to the input resolution and mapped to one channel.

    Args:
        channels: four channel counts, from the deepest stage to the shallowest.
    """

    def __init__(self, channels):
        super().__init__()
        self.config = Config()
        DecoderBlock = BasicDecBlk
        LateralBlock = BasicLatBlk
        c4, c3, c2, c1 = channels[0], channels[1], channels[2], channels[3]

        self.decoder_block4 = DecoderBlock(c4, c3)
        self.decoder_block3 = DecoderBlock(c3, c2)
        self.decoder_block2 = DecoderBlock(c2, c1)
        self.decoder_block1 = DecoderBlock(c1, c1//2)

        self.lateral_block4 = LateralBlock(c3, c3)
        self.lateral_block3 = LateralBlock(c2, c2)
        self.lateral_block2 = LateralBlock(c1, c1)

        if self.config.ms_supervision:
            # 1x1 heads producing one-channel side outputs at the intermediate scales.
            self.conv_ms_spvn_4 = nn.Conv2d(c3, 1, 1, 1, 0)
            self.conv_ms_spvn_3 = nn.Conv2d(c2, 1, 1, 1, 0)
            self.conv_ms_spvn_2 = nn.Conv2d(c1, 1, 1, 1, 0)
        self.conv_out1 = nn.Sequential(nn.Conv2d(c1//2, 1, 1, 1, 0))

    def forward(self, features):
        """Decode ``[x, x1, x2, x3, x4]`` into a list of one-channel predictions.

        With ``config.ms_supervision`` the list holds the three side outputs followed
        by the final prediction; otherwise it holds only the final prediction.
        """

        def resize_like(tensor, reference):
            # Bilinear resize of `tensor` to the spatial size of `reference`.
            return F.interpolate(tensor, size=reference.shape[2:], mode='bilinear', align_corners=True)

        image, feat1, feat2, feat3, feat4 = features

        p4 = self.decoder_block4(feat4)
        p3 = self.decoder_block3(resize_like(p4, feat3) + self.lateral_block4(feat3))
        p2 = self.decoder_block2(resize_like(p3, feat2) + self.lateral_block3(feat2))
        p1 = self.decoder_block1(resize_like(p2, feat1) + self.lateral_block2(feat1))
        final_pred = self.conv_out1(resize_like(p1, image))

        predictions = []
        if self.config.ms_supervision:
            predictions.append(self.conv_ms_spvn_4(p4))
            predictions.append(self.conv_ms_spvn_3(p3))
            predictions.append(self.conv_ms_spvn_2(p2))
        predictions.append(final_pred)
        return predictions
1807
+
1808
+
1809
class RefUNet(nn.Module):
    """Small U-Net used for refinement.

    Four encoder stages (the last three preceded by a 2x max-pool with
    ``ceil_mode=True``), a bottleneck, and four decoder stages with skip
    concatenations, followed by a 3x3 convolution that outputs one channel.

    Args:
        in_channels: number of channels of the (concatenated) input.
    """

    @staticmethod
    def _conv_bn_relu(in_ch, out_ch):
        """Return the layers of a 3x3 conv -> BatchNorm -> ReLU unit as a list."""
        return [nn.Conv2d(in_ch, out_ch, 3, 1, 1), nn.BatchNorm2d(out_ch), nn.ReLU(inplace=True)]

    @staticmethod
    def _upsample_like(src, ref):
        """Bilinearly resize ``src`` to the spatial size of ``ref``."""
        return F.interpolate(src, size=ref.shape[2:], mode='bilinear', align_corners=True)

    def __init__(self, in_channels=3+1):
        super(RefUNet, self).__init__()
        # Layer order and Sequential indices are unchanged, so existing state_dicts still load.
        self.encoder_1 = nn.Sequential(
            nn.Conv2d(in_channels, 64, 3, 1, 1),
            *self._conv_bn_relu(64, 64)
        )

        self.encoder_2 = nn.Sequential(
            nn.MaxPool2d(2, 2, ceil_mode=True),
            *self._conv_bn_relu(64, 64)
        )

        self.encoder_3 = nn.Sequential(
            nn.MaxPool2d(2, 2, ceil_mode=True),
            *self._conv_bn_relu(64, 64)
        )

        self.encoder_4 = nn.Sequential(
            nn.MaxPool2d(2, 2, ceil_mode=True),
            *self._conv_bn_relu(64, 64)
        )

        self.pool4 = nn.MaxPool2d(2, 2, ceil_mode=True)
        #####
        self.decoder_5 = nn.Sequential(*self._conv_bn_relu(64, 64))
        #####
        # Each decoder stage consumes 64 upsampled channels + 64 skip channels.
        self.decoder_4 = nn.Sequential(*self._conv_bn_relu(128, 64))

        self.decoder_3 = nn.Sequential(*self._conv_bn_relu(128, 64))

        self.decoder_2 = nn.Sequential(*self._conv_bn_relu(128, 64))

        self.decoder_1 = nn.Sequential(*self._conv_bn_relu(128, 64))

        self.conv_d0 = nn.Conv2d(64, 1, 3, 1, 1)

        # Kept for backward compatibility with code that accesses the attribute; the
        # forward pass now resizes to the skip tensor's size instead (see forward()).
        self.upscore2 = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)

    def forward(self, x):
        """Run the U-Net.

        Args:
            x: a 4-D tensor, or a list of tensors that is concatenated along dim 1.

        Returns:
            A one-element list holding the single-channel prediction at the input's
            spatial size.
        """
        outs = []
        if isinstance(x, list):
            x = torch.cat(x, dim=1)
        hx = x

        hx1 = self.encoder_1(hx)
        hx2 = self.encoder_2(hx1)
        hx3 = self.encoder_3(hx2)
        hx4 = self.encoder_4(hx3)

        hx = self.decoder_5(self.pool4(hx4))
        # With ceil_mode pooling, a fixed 2x upsample overshoots the skip tensor whenever
        # a stage had an odd size, making torch.cat fail.  Resizing to the skip tensor's
        # own size handles arbitrary input sizes and is identical to the 2x upsample
        # when every stage size is even.
        hx = torch.cat((self._upsample_like(hx, hx4), hx4), 1)

        d4 = self.decoder_4(hx)
        hx = torch.cat((self._upsample_like(d4, hx3), hx3), 1)

        d3 = self.decoder_3(hx)
        hx = torch.cat((self._upsample_like(d3, hx2), hx2), 1)

        d2 = self.decoder_2(hx)
        hx = torch.cat((self._upsample_like(d2, hx1), hx1), 1)

        d1 = self.decoder_1(hx)

        x = self.conv_d0(d1)
        outs.append(x)
        return outs
1905
+
1906
+
1907
+
1908
+ ### models/stem_layer.py
1909
+
1910
+ import torch.nn as nn
1911
+ # from utils import build_act_layer, build_norm_layer
1912
+
1913
+
1914
class StemLayer(nn.Module):
    r""" Stem layer of InternImage
    Args:
        in_channels (int): number of input channels
        inter_channels (int): number of channels produced by the first convolution
        out_channels (int): number of output channels
        act_layer (str): activation layer
        norm_layer (str): normalization layer
    """

    def __init__(self,
                 in_channels=3+1,
                 inter_channels=48,
                 out_channels=96,
                 act_layer='GELU',
                 norm_layer='BN'):
        super().__init__()
        # Both convolutions are 3x3, stride 1, padding 1 (spatial size is preserved).
        conv_kwargs = dict(kernel_size=3, stride=1, padding=1)
        # Both norm layers are built with 'channels_first' for the two format arguments.
        data_format = 'channels_first'

        self.conv1 = nn.Conv2d(in_channels, inter_channels, **conv_kwargs)
        self.norm1 = build_norm_layer(inter_channels, norm_layer, data_format, data_format)
        self.act = build_act_layer(act_layer)
        self.conv2 = nn.Conv2d(inter_channels, out_channels, **conv_kwargs)
        self.norm2 = build_norm_layer(out_channels, norm_layer, data_format, data_format)

    def forward(self, x):
        # conv -> norm -> activation -> conv -> norm (no activation after the second norm).
        hidden = self.act(self.norm1(self.conv1(x)))
        return self.norm2(self.conv2(hidden))
1955
+
1956
+
1957
+ ### models/birefnet.py
1958
+
1959
+ import torch
1960
+ import torch.nn as nn
1961
+ import torch.nn.functional as F
1962
+ from kornia.filters import laplacian
1963
+ from transformers import PreTrainedModel
1964
+
1965
+ # from config import Config
1966
+ # from dataset import class_labels_TR_sorted
1967
+ # from models.build_backbone import build_backbone
1968
+ # from models.decoder_blocks import BasicDecBlk, ResBlk, HierarAttDecBlk
1969
+ # from models.lateral_blocks import BasicLatBlk
1970
+ # from models.aspp import ASPP, ASPPDeformable
1971
+ # from models.ing import *
1972
+ # from models.refiner import Refiner, RefinerPVTInChannels4, RefUNet
1973
+ # from models.stem_layer import StemLayer
1974
+ from .BiRefNet_config import BiRefNetConfig
1975
+
1976
+
1977
class BiRefNet(
    PreTrainedModel
):
    """BiRefNet segmentation model packaged as a Hugging Face ``PreTrainedModel``.

    The backbone produces four feature maps, which are optionally enriched with a
    half-resolution pass and with context from shallower stages, optionally squeezed,
    and then decoded by ``Decoder`` into a list of predictions.
    """
    config_class = BiRefNetConfig
    def __init__(self, bb_pretrained=True, config=BiRefNetConfig()):
        """Build backbone, optional heads and decoder.

        Args:
            bb_pretrained: overwritten immediately by ``config.bb_pretrained``.
            config: Hugging Face config object passed to ``PreTrainedModel.__init__``.
                NOTE(review): the default instance is created once, when the class body
                is executed, and is therefore shared by every call that omits it.
        """
        super(BiRefNet, self).__init__(config)
        bb_pretrained = config.bb_pretrained
        # From here on `self.config` is the project-level Config, not the HF config above.
        self.config = Config()
        self.epoch = 1
        self.bb = build_backbone(self.config.bb, pretrained=bb_pretrained)

        channels = self.config.lateral_channels_in_collection

        if self.config.auxiliary_classification:
            # Linear classifier over the globally pooled deepest feature.
            self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
            self.cls_head = nn.Sequential(
                nn.Linear(channels[0], len(class_labels_TR_sorted))
            )

        if self.config.squeeze_block:
            # `squeeze_block` is a string of the form '<BlockClassName>_x<count>'; both
            # halves are passed to eval() to obtain the class and the repeat count.
            # NOTE(review): eval() on config strings - only safe while the config is trusted.
            self.squeeze_module = nn.Sequential(*[
                eval(self.config.squeeze_block.split('_x')[0])(channels[0]+sum(self.config.cxt), channels[0])
                for _ in range(eval(self.config.squeeze_block.split('_x')[1]))
            ])

        self.decoder = Decoder(channels)

        if self.config.ender:
            self.dec_end = nn.Sequential(
                nn.Conv2d(1, 16, 3, 1, 1),
                nn.Conv2d(16, 1, 3, 1, 1),
                nn.ReLU(inplace=True),
            )

        # refine patch-level segmentation
        if self.config.refine:
            if self.config.refine == 'itself':
                self.stem_layer = StemLayer(in_channels=3+1, inter_channels=48, out_channels=3, norm_layer='BN' if self.config.batch_size > 1 else 'LN')
            else:
                # `refine` names a refiner class that is instantiated through eval().
                self.refiner = eval('{}({})'.format(self.config.refine, 'in_channels=3+1'))

        if self.config.freeze_bb:
            # Freeze the backbone...
            # NOTE(review): named_parameters() returns an iterator, so this prints the
            # iterator object rather than the parameter names.
            print(self.named_parameters())
            for key, value in self.named_parameters():
                if 'bb.' in key and 'refiner.' not in key:
                    value.requires_grad = False

    def forward_enc(self, x):
        """Run the encoder.

        Returns:
            ``((x1, x2, x3, x4), class_preds)`` where ``class_preds`` is ``None`` unless
            the model is training and ``auxiliary_classification`` is enabled.
        """
        if self.config.bb in ['vgg16', 'vgg16bn', 'resnet50']:
            # CNN backbones expose their stages as conv1..conv4 and are chained by hand.
            x1 = self.bb.conv1(x); x2 = self.bb.conv2(x1); x3 = self.bb.conv3(x2); x4 = self.bb.conv4(x3)
        else:
            x1, x2, x3, x4 = self.bb(x)
            if self.config.mul_scl_ipt == 'cat':
                # Second backbone pass on a half-resolution copy; its features are resized
                # to the full-resolution ones and concatenated along the channel axis.
                B, C, H, W = x.shape
                x1_, x2_, x3_, x4_ = self.bb(F.interpolate(x, size=(H//2, W//2), mode='bilinear', align_corners=True))
                x1 = torch.cat([x1, F.interpolate(x1_, size=x1.shape[2:], mode='bilinear', align_corners=True)], dim=1)
                x2 = torch.cat([x2, F.interpolate(x2_, size=x2.shape[2:], mode='bilinear', align_corners=True)], dim=1)
                x3 = torch.cat([x3, F.interpolate(x3_, size=x3.shape[2:], mode='bilinear', align_corners=True)], dim=1)
                x4 = torch.cat([x4, F.interpolate(x4_, size=x4.shape[2:], mode='bilinear', align_corners=True)], dim=1)
            elif self.config.mul_scl_ipt == 'add':
                # Same half-resolution pass, but the resized features are added instead.
                B, C, H, W = x.shape
                x1_, x2_, x3_, x4_ = self.bb(F.interpolate(x, size=(H//2, W//2), mode='bilinear', align_corners=True))
                x1 = x1 + F.interpolate(x1_, size=x1.shape[2:], mode='bilinear', align_corners=True)
                x2 = x2 + F.interpolate(x2_, size=x2.shape[2:], mode='bilinear', align_corners=True)
                x3 = x3 + F.interpolate(x3_, size=x3.shape[2:], mode='bilinear', align_corners=True)
                x4 = x4 + F.interpolate(x4_, size=x4.shape[2:], mode='bilinear', align_corners=True)
        class_preds = self.cls_head(self.avgpool(x4).view(x4.shape[0], -1)) if self.training and self.config.auxiliary_classification else None
        if self.config.cxt:
            # Prepend the last len(cxt) of [x1, x2, x3], resized to x4's resolution,
            # to x4 along the channel axis.
            x4 = torch.cat(
                (
                    *[
                        F.interpolate(x1, size=x4.shape[2:], mode='bilinear', align_corners=True),
                        F.interpolate(x2, size=x4.shape[2:], mode='bilinear', align_corners=True),
                        F.interpolate(x3, size=x4.shape[2:], mode='bilinear', align_corners=True),
                    ][-len(self.config.cxt):],
                    x4
                ),
                dim=1
            )
        return (x1, x2, x3, x4), class_preds

    def forward_ori(self, x):
        """Encode, optionally squeeze, and decode ``x``; returns ``(scaled_preds, class_preds)``."""
        ########## Encoder ##########
        (x1, x2, x3, x4), class_preds = self.forward_enc(x)
        if self.config.squeeze_block:
            x4 = self.squeeze_module(x4)
        ########## Decoder ##########
        features = [x, x1, x2, x3, x4]
        if self.training and self.config.out_ref:
            # Extra decoder input: 5x5 Laplacian of the channel-averaged input image.
            features.append(laplacian(torch.mean(x, dim=1).unsqueeze(1), kernel_size=5))
        scaled_preds = self.decoder(features)
        return scaled_preds, class_preds

    def forward(self, x):
        """Return ``[scaled_preds, [class_preds]]`` in training mode, else ``scaled_preds``."""
        scaled_preds, class_preds = self.forward_ori(x)
        class_preds_lst = [class_preds]
        return [scaled_preds, class_preds_lst] if self.training else scaled_preds
2075
+
2076
+
2077
class Decoder(nn.Module):
    """BiRefNet decoder.

    Top-down decoder with lateral connections.  Depending on ``Config`` it can also
    inject (patches of) the input image at every stage (``dec_ipt``), emit side
    outputs (``ms_supervision``) and apply gradient-guided attention (``out_ref``).

    Args:
        channels: four channel counts, from the deepest stage to the shallowest.
    """
    def __init__(self, channels):
        super(Decoder, self).__init__()
        self.config = Config()
        # Block classes are looked up by the names stored in the config.
        # NOTE(review): eval() on config strings - only safe while the config is trusted.
        DecoderBlock = eval(self.config.dec_blk)
        LateralBlock = eval(self.config.lat_blk)

        if self.config.dec_ipt:
            self.split = self.config.dec_ipt_split
            N_dec_ipt = 64
            DBlock = SimpleConvs
            ic = 64
            # Index into [N_dec_ipt, channels[k]//8]; 1 selects the channel-dependent width.
            ipt_cha_opt = 1
            # With splitting, each block's input width is (number of patches) * 3;
            # without it the block receives the 3-channel image.
            self.ipt_blk5 = DBlock(2**10*3 if self.split else 3, [N_dec_ipt, channels[0]//8][ipt_cha_opt], inter_channels=ic)
            self.ipt_blk4 = DBlock(2**8*3 if self.split else 3, [N_dec_ipt, channels[0]//8][ipt_cha_opt], inter_channels=ic)
            self.ipt_blk3 = DBlock(2**6*3 if self.split else 3, [N_dec_ipt, channels[1]//8][ipt_cha_opt], inter_channels=ic)
            self.ipt_blk2 = DBlock(2**4*3 if self.split else 3, [N_dec_ipt, channels[2]//8][ipt_cha_opt], inter_channels=ic)
            self.ipt_blk1 = DBlock(2**0*3 if self.split else 3, [N_dec_ipt, channels[3]//8][ipt_cha_opt], inter_channels=ic)
        else:
            self.split = None

        # The extra input width of each decoder block matches the output width of the
        # ipt block whose result is concatenated in front of it.  When dec_ipt is off
        # the conditional expression short-circuits, so the undefined N_dec_ipt /
        # ipt_cha_opt names are never evaluated.
        self.decoder_block4 = DecoderBlock(channels[0]+([N_dec_ipt, channels[0]//8][ipt_cha_opt] if self.config.dec_ipt else 0), channels[1])
        self.decoder_block3 = DecoderBlock(channels[1]+([N_dec_ipt, channels[0]//8][ipt_cha_opt] if self.config.dec_ipt else 0), channels[2])
        self.decoder_block2 = DecoderBlock(channels[2]+([N_dec_ipt, channels[1]//8][ipt_cha_opt] if self.config.dec_ipt else 0), channels[3])
        self.decoder_block1 = DecoderBlock(channels[3]+([N_dec_ipt, channels[2]//8][ipt_cha_opt] if self.config.dec_ipt else 0), channels[3]//2)
        self.conv_out1 = nn.Sequential(nn.Conv2d(channels[3]//2+([N_dec_ipt, channels[3]//8][ipt_cha_opt] if self.config.dec_ipt else 0), 1, 1, 1, 0))

        self.lateral_block4 = LateralBlock(channels[1], channels[1])
        self.lateral_block3 = LateralBlock(channels[2], channels[2])
        self.lateral_block2 = LateralBlock(channels[3], channels[3])

        if self.config.ms_supervision:
            # 1x1 heads producing one-channel side outputs at the intermediate scales.
            self.conv_ms_spvn_4 = nn.Conv2d(channels[1], 1, 1, 1, 0)
            self.conv_ms_spvn_3 = nn.Conv2d(channels[2], 1, 1, 1, 0)
            self.conv_ms_spvn_2 = nn.Conv2d(channels[3], 1, 1, 1, 0)

            if self.config.out_ref:
                # Gradient branch: a shared _N-channel feature per scale, from which a
                # prediction head and an attention head are derived.
                _N = 16
                self.gdt_convs_4 = nn.Sequential(nn.Conv2d(channels[1], _N, 3, 1, 1), nn.BatchNorm2d(_N) if self.config.batch_size > 1 else nn.Identity(), nn.ReLU(inplace=True))
                self.gdt_convs_3 = nn.Sequential(nn.Conv2d(channels[2], _N, 3, 1, 1), nn.BatchNorm2d(_N) if self.config.batch_size > 1 else nn.Identity(), nn.ReLU(inplace=True))
                self.gdt_convs_2 = nn.Sequential(nn.Conv2d(channels[3], _N, 3, 1, 1), nn.BatchNorm2d(_N) if self.config.batch_size > 1 else nn.Identity(), nn.ReLU(inplace=True))

                self.gdt_convs_pred_4 = nn.Sequential(nn.Conv2d(_N, 1, 1, 1, 0))
                self.gdt_convs_pred_3 = nn.Sequential(nn.Conv2d(_N, 1, 1, 1, 0))
                self.gdt_convs_pred_2 = nn.Sequential(nn.Conv2d(_N, 1, 1, 1, 0))

                self.gdt_convs_attn_4 = nn.Sequential(nn.Conv2d(_N, 1, 1, 1, 0))
                self.gdt_convs_attn_3 = nn.Sequential(nn.Conv2d(_N, 1, 1, 1, 0))
                self.gdt_convs_attn_2 = nn.Sequential(nn.Conv2d(_N, 1, 1, 1, 0))

    def get_patches_batch(self, x, p):
        """Cut every sample of ``x`` into patches of ``p``'s spatial size.

        Each sample is split into columns of width ``p.shape[3]``, each column into
        tiles of height ``p.shape[2]``, and the tiles are concatenated along the
        channel axis.  The per-sample results are concatenated along the batch axis.
        """
        _size_h, _size_w = p.shape[2:]
        patches_batch = []
        for idx in range(x.shape[0]):
            columns_x = torch.split(x[idx], split_size_or_sections=_size_w, dim=-1)
            patches_x = []
            for column_x in columns_x:
                # Note: the comprehension variable `p` is local to the comprehension and
                # does not affect the `p` argument.
                patches_x += [p.unsqueeze(0) for p in torch.split(column_x, split_size_or_sections=_size_h, dim=-2)]
            patch_sample = torch.cat(patches_x, dim=1)
            patches_batch.append(patch_sample)
        return torch.cat(patches_batch, dim=0)

    def forward(self, features):
        """Decode the encoder features.

        Args:
            features: ``[x, x1, x2, x3, x4]``, plus a sixth gradient-target element when
                training with ``out_ref`` enabled.

        Returns:
            ``outs`` (side outputs, if enabled, followed by the final prediction), or
            ``([outs_gdt_pred, outs_gdt_label], outs)`` when training with ``out_ref``.
        """
        if self.training and self.config.out_ref:
            outs_gdt_pred = []
            outs_gdt_label = []
            x, x1, x2, x3, x4, gdt_gt = features
        else:
            x, x1, x2, x3, x4 = features
        outs = []

        # ---- Stage 4 (deepest) ----
        if self.config.dec_ipt:
            patches_batch = self.get_patches_batch(x, x4) if self.split else x
            x4 = torch.cat((x4, self.ipt_blk5(F.interpolate(patches_batch, size=x4.shape[2:], mode='bilinear', align_corners=True))), 1)
        p4 = self.decoder_block4(x4)
        m4 = self.conv_ms_spvn_4(p4) if self.config.ms_supervision else None
        if self.config.out_ref:
            p4_gdt = self.gdt_convs_4(p4)
            if self.training:
                # >> GT:
                # NOTE(review): m4 is None when ms_supervision is off, which the
                # interpolate call below cannot handle - confirm out_ref implies ms_supervision.
                m4_dia = m4
                gdt_label_main_4 = gdt_gt * F.interpolate(m4_dia, size=gdt_gt.shape[2:], mode='bilinear', align_corners=True)
                outs_gdt_label.append(gdt_label_main_4)
                # >> Pred:
                gdt_pred_4 = self.gdt_convs_pred_4(p4_gdt)
                outs_gdt_pred.append(gdt_pred_4)
            gdt_attn_4 = self.gdt_convs_attn_4(p4_gdt).sigmoid()
            # >> Finally:
            p4 = p4 * gdt_attn_4
        _p4 = F.interpolate(p4, size=x3.shape[2:], mode='bilinear', align_corners=True)
        _p3 = _p4 + self.lateral_block4(x3)

        # ---- Stage 3 ----
        if self.config.dec_ipt:
            patches_batch = self.get_patches_batch(x, _p3) if self.split else x
            _p3 = torch.cat((_p3, self.ipt_blk4(F.interpolate(patches_batch, size=x3.shape[2:], mode='bilinear', align_corners=True))), 1)
        p3 = self.decoder_block3(_p3)
        m3 = self.conv_ms_spvn_3(p3) if self.config.ms_supervision else None
        if self.config.out_ref:
            p3_gdt = self.gdt_convs_3(p3)
            if self.training:
                # >> GT:
                # m3 --dilation--> m3_dia
                # G_3^gt * m3_dia --> G_3^m, which is the label of gradient
                m3_dia = m3
                gdt_label_main_3 = gdt_gt * F.interpolate(m3_dia, size=gdt_gt.shape[2:], mode='bilinear', align_corners=True)
                outs_gdt_label.append(gdt_label_main_3)
                # >> Pred:
                # p3 --conv--BN--> F_3^G, where F_3^G predicts the \hat{G_3} with xx
                # F_3^G --sigmoid--> A_3^G
                gdt_pred_3 = self.gdt_convs_pred_3(p3_gdt)
                outs_gdt_pred.append(gdt_pred_3)
            gdt_attn_3 = self.gdt_convs_attn_3(p3_gdt).sigmoid()
            # >> Finally:
            # p3 = p3 * A_3^G
            p3 = p3 * gdt_attn_3
        _p3 = F.interpolate(p3, size=x2.shape[2:], mode='bilinear', align_corners=True)
        _p2 = _p3 + self.lateral_block3(x2)

        # ---- Stage 2 ----
        if self.config.dec_ipt:
            patches_batch = self.get_patches_batch(x, _p2) if self.split else x
            _p2 = torch.cat((_p2, self.ipt_blk3(F.interpolate(patches_batch, size=x2.shape[2:], mode='bilinear', align_corners=True))), 1)
        p2 = self.decoder_block2(_p2)
        m2 = self.conv_ms_spvn_2(p2) if self.config.ms_supervision else None
        if self.config.out_ref:
            p2_gdt = self.gdt_convs_2(p2)
            if self.training:
                # >> GT:
                m2_dia = m2
                gdt_label_main_2 = gdt_gt * F.interpolate(m2_dia, size=gdt_gt.shape[2:], mode='bilinear', align_corners=True)
                outs_gdt_label.append(gdt_label_main_2)
                # >> Pred:
                gdt_pred_2 = self.gdt_convs_pred_2(p2_gdt)
                outs_gdt_pred.append(gdt_pred_2)
            gdt_attn_2 = self.gdt_convs_attn_2(p2_gdt).sigmoid()
            # >> Finally:
            p2 = p2 * gdt_attn_2
        _p2 = F.interpolate(p2, size=x1.shape[2:], mode='bilinear', align_corners=True)
        _p1 = _p2 + self.lateral_block2(x1)

        # ---- Stage 1 (shallowest), then upsample to the input resolution ----
        if self.config.dec_ipt:
            patches_batch = self.get_patches_batch(x, _p1) if self.split else x
            _p1 = torch.cat((_p1, self.ipt_blk2(F.interpolate(patches_batch, size=x1.shape[2:], mode='bilinear', align_corners=True))), 1)
        _p1 = self.decoder_block1(_p1)
        _p1 = F.interpolate(_p1, size=x.shape[2:], mode='bilinear', align_corners=True)

        if self.config.dec_ipt:
            patches_batch = self.get_patches_batch(x, _p1) if self.split else x
            _p1 = torch.cat((_p1, self.ipt_blk1(F.interpolate(patches_batch, size=x.shape[2:], mode='bilinear', align_corners=True))), 1)
        p1_out = self.conv_out1(_p1)

        # Side outputs come first (deepest to shallowest); the final prediction is last.
        if self.config.ms_supervision:
            outs.append(m4)
            outs.append(m3)
            outs.append(m2)
        outs.append(p1_out)
        return outs if not (self.config.out_ref and self.training) else ([outs_gdt_pred, outs_gdt_label], outs)
2233
+
2234
+
2235
class SimpleConvs(nn.Module):
    """Two stacked 3x3 convolutions (stride 1, padding 1) with no activation in between.

    Args:
        in_channels: number of input channels.
        out_channels: number of output channels.
        inter_channels: number of channels between the two convolutions.
    """

    def __init__(
        self, in_channels: int, out_channels: int, inter_channels=64
    ) -> None:
        super().__init__()
        # Both layers preserve the spatial size.
        self.conv1 = nn.Conv2d(in_channels, inter_channels, kernel_size=3, stride=1, padding=1)
        self.conv_out = nn.Conv2d(inter_channels, out_channels, kernel_size=3, stride=1, padding=1)

    def forward(self, x):
        """Apply ``conv1`` and then ``conv_out`` to ``x``."""
        hidden = self.conv1(x)
        return self.conv_out(hidden)
RMBG/RMBG-2.0/config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "ZhengPeng7/BiRefNet",
3
+ "architectures": [
4
+ "BiRefNet"
5
+ ],
6
+ "auto_map": {
7
+ "AutoConfig": "BiRefNet_config.BiRefNetConfig",
8
+ "AutoModelForImageSegmentation": "birefnet.BiRefNet"
9
+ },
10
+ "custom_pipelines": {
11
+ "image-segmentation": {
12
+ "pt": [
13
+ "AutoModelForImageSegmentation"
14
+ ],
15
+ "tf": [],
16
+ "type": "image"
17
+ }
18
+ },
19
+ "bb_pretrained": false
20
+ }
audio_encoders/put_audio_encoder_models_here ADDED
File without changes
checkpoints/ComfyUI-Frame-Interpolation ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/Qwen-Rapid-AIO-NSFW-v18.metadata.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "file_name": "Qwen-Rapid-AIO-NSFW-v18",
3
+ "model_name": "Qwen-Rapid-AIO-NSFW-v18",
4
+ "file_path": "/workspace/runpod-slim/ComfyUI/models/checkpoints/Qwen-Rapid-AIO-NSFW-v18.safetensors",
5
+ "size": 28431844103,
6
+ "modified": 1772965292.050338,
7
+ "sha256": "",
8
+ "base_model": "Unknown",
9
+ "preview_url": "",
10
+ "preview_nsfw_level": 0,
11
+ "notes": "",
12
+ "from_civitai": false,
13
+ "civitai": {},
14
+ "tags": [],
15
+ "modelDescription": "",
16
+ "civitai_deleted": false,
17
+ "favorite": false,
18
+ "exclude": false,
19
+ "db_checked": false,
20
+ "skip_metadata_refresh": false,
21
+ "metadata_source": null,
22
+ "last_checked_at": 0,
23
+ "hash_status": "pending",
24
+ "sub_type": "checkpoint"
25
+ }
checkpoints/SUPIR-v0F.metadata.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "file_name": "SUPIR-v0F",
3
+ "model_name": "SUPIR-v0F",
4
+ "file_path": "/workspace/runpod-slim/ComfyUI/models/checkpoints/SUPIR-v0F.ckpt",
5
+ "size": 5329719950,
6
+ "modified": 1773182934.338468,
7
+ "sha256": "",
8
+ "base_model": "Unknown",
9
+ "preview_url": "",
10
+ "preview_nsfw_level": 0,
11
+ "notes": "",
12
+ "from_civitai": false,
13
+ "civitai": {},
14
+ "tags": [],
15
+ "modelDescription": "",
16
+ "civitai_deleted": false,
17
+ "favorite": false,
18
+ "exclude": false,
19
+ "db_checked": false,
20
+ "skip_metadata_refresh": false,
21
+ "metadata_source": null,
22
+ "last_checked_at": 0,
23
+ "hash_status": "pending",
24
+ "sub_type": "checkpoint"
25
+ }
checkpoints/analogMadnessSDXL_xl5.metadata.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "file_name": "analogMadnessSDXL_xl5",
3
+ "model_name": "analogMadnessSDXL_xl5",
4
+ "file_path": "/workspace/runpod-slim/ComfyUI/models/checkpoints/analogMadnessSDXL_xl5.safetensors",
5
+ "size": 6938040682,
6
+ "modified": 1773170478.150429,
7
+ "sha256": "",
8
+ "base_model": "Unknown",
9
+ "preview_url": "",
10
+ "preview_nsfw_level": 0,
11
+ "notes": "",
12
+ "from_civitai": false,
13
+ "civitai": {},
14
+ "tags": [],
15
+ "modelDescription": "",
16
+ "civitai_deleted": false,
17
+ "favorite": false,
18
+ "exclude": false,
19
+ "db_checked": false,
20
+ "skip_metadata_refresh": false,
21
+ "metadata_source": null,
22
+ "last_checked_at": 0,
23
+ "hash_status": "pending",
24
+ "sub_type": "checkpoint"
25
+ }
checkpoints/epicrealismXL_pureFix.metadata.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "file_name": "epicrealismXL_pureFix",
3
+ "model_name": "epicrealismXL_pureFix",
4
+ "file_path": "/workspace/runpod-slim/ComfyUI/models/checkpoints/epicrealismXL_pureFix.safetensors",
5
+ "size": 6938041144,
6
+ "modified": 1777644086.893859,
7
+ "sha256": "",
8
+ "base_model": "Unknown",
9
+ "preview_url": "",
10
+ "preview_nsfw_level": 0,
11
+ "notes": "",
12
+ "from_civitai": false,
13
+ "civitai": {},
14
+ "tags": [],
15
+ "modelDescription": "",
16
+ "civitai_deleted": false,
17
+ "favorite": false,
18
+ "exclude": false,
19
+ "db_checked": false,
20
+ "skip_metadata_refresh": false,
21
+ "metadata_source": null,
22
+ "last_checked_at": 0,
23
+ "hash_status": "pending",
24
+ "sub_type": "checkpoint"
25
+ }
checkpoints/gonzalomoXLFluxPony_v30FluxDAIO.metadata.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "file_name": "gonzalomoXLFluxPony_v30FluxDAIO",
3
+ "model_name": "gonzalomoXLFluxPony_v30FluxDAIO",
4
+ "file_path": "/workspace/runpod-slim/ComfyUI/models/checkpoints/gonzalomoXLFluxPony_v30FluxDAIO.safetensors",
5
+ "size": 17068674198,
6
+ "modified": 1777201626.156541,
7
+ "sha256": "",
8
+ "base_model": "Unknown",
9
+ "preview_url": "",
10
+ "preview_nsfw_level": 0,
11
+ "notes": "",
12
+ "from_civitai": false,
13
+ "civitai": {},
14
+ "tags": [],
15
+ "modelDescription": "",
16
+ "civitai_deleted": false,
17
+ "favorite": false,
18
+ "exclude": false,
19
+ "db_checked": false,
20
+ "skip_metadata_refresh": false,
21
+ "metadata_source": null,
22
+ "last_checked_at": 0,
23
+ "hash_status": "pending",
24
+ "sub_type": "checkpoint"
25
+ }
checkpoints/illustriousRealismBy_v10.metadata.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "file_name": "illustriousRealismBy_v10",
3
+ "model_name": "illustriousRealismBy_v10",
4
+ "file_path": "/workspace/runpod-slim/ComfyUI/models/checkpoints/illustriousRealismBy_v10.safetensors",
5
+ "size": 6938062210,
6
+ "modified": 1772965292.082859,
7
+ "sha256": "",
8
+ "base_model": "Unknown",
9
+ "preview_url": "",
10
+ "preview_nsfw_level": 0,
11
+ "notes": "",
12
+ "from_civitai": false,
13
+ "civitai": {},
14
+ "tags": [],
15
+ "modelDescription": "",
16
+ "civitai_deleted": false,
17
+ "favorite": false,
18
+ "exclude": false,
19
+ "db_checked": false,
20
+ "skip_metadata_refresh": false,
21
+ "metadata_source": null,
22
+ "last_checked_at": 0,
23
+ "hash_status": "pending",
24
+ "sub_type": "checkpoint"
25
+ }
checkpoints/illustriousRealismBy_v10VAE.metadata.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "file_name": "illustriousRealismBy_v10VAE",
3
+ "model_name": "illustriousRealismBy_v10VAE",
4
+ "file_path": "/workspace/runpod-slim/ComfyUI/models/checkpoints/illustriousRealismBy_v10VAE.safetensors",
5
+ "size": 0,
6
+ "modified": 1772965292.112515,
7
+ "sha256": "",
8
+ "base_model": "Unknown",
9
+ "preview_url": "",
10
+ "preview_nsfw_level": 0,
11
+ "notes": "",
12
+ "from_civitai": false,
13
+ "civitai": {},
14
+ "tags": [],
15
+ "modelDescription": "",
16
+ "civitai_deleted": false,
17
+ "favorite": false,
18
+ "exclude": false,
19
+ "db_checked": false,
20
+ "skip_metadata_refresh": false,
21
+ "metadata_source": null,
22
+ "last_checked_at": 0,
23
+ "hash_status": "pending",
24
+ "sub_type": "checkpoint"
25
+ }
checkpoints/illustriousRealismBy_v10VAE.safetensors ADDED
File without changes
checkpoints/intorealismUltra_v10.metadata.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "file_name": "intorealismUltra_v10",
3
+ "model_name": "intorealismUltra_v10",
4
+ "file_path": "/workspace/runpod-slim/ComfyUI/models/checkpoints/intorealismUltra_v10.safetensors",
5
+ "size": 6938040706,
6
+ "modified": 1777300584.216715,
7
+ "sha256": "",
8
+ "base_model": "Unknown",
9
+ "preview_url": "",
10
+ "preview_nsfw_level": 0,
11
+ "notes": "",
12
+ "from_civitai": false,
13
+ "civitai": {},
14
+ "tags": [],
15
+ "modelDescription": "",
16
+ "civitai_deleted": false,
17
+ "favorite": false,
18
+ "exclude": false,
19
+ "db_checked": false,
20
+ "skip_metadata_refresh": false,
21
+ "metadata_source": null,
22
+ "last_checked_at": 0,
23
+ "hash_status": "pending",
24
+ "sub_type": "checkpoint"
25
+ }
checkpoints/juggernautXL_ragnarokBy.metadata.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "file_name": "juggernautXL_ragnarokBy",
3
+ "model_name": "juggernautXL_ragnarokBy",
4
+ "file_path": "/workspace/runpod-slim/ComfyUI/models/checkpoints/juggernautXL_ragnarokBy.safetensors",
5
+ "size": 7105350162,
6
+ "modified": 1777660870.143452,
7
+ "sha256": "",
8
+ "base_model": "Unknown",
9
+ "preview_url": "",
10
+ "preview_nsfw_level": 0,
11
+ "notes": "",
12
+ "from_civitai": false,
13
+ "civitai": {},
14
+ "tags": [],
15
+ "modelDescription": "",
16
+ "civitai_deleted": false,
17
+ "favorite": false,
18
+ "exclude": false,
19
+ "db_checked": false,
20
+ "skip_metadata_refresh": false,
21
+ "metadata_source": null,
22
+ "last_checked_at": 0,
23
+ "hash_status": "pending",
24
+ "sub_type": "checkpoint"
25
+ }