Flare77 commited on
Commit
b047851
·
verified ·
1 Parent(s): b79a0ed

Upload model HuLuLLM via Colab

Browse files
.gitattributes CHANGED
@@ -1,35 +1,9 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ model-00001-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
2
+ model-00002-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
3
+ model-00003-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
4
+ model-00004-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
5
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
6
+ model-00001-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text
7
+ model-00002-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text
8
+ model-00003-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text
9
+ model-00004-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text
 
README.md ADDED
@@ -0,0 +1,570 @@
1
+ ---
2
+ license: apache-2.0
3
+ tags:
4
+ - medical
5
+ - multimodal
6
+ - vision-language-model
7
+ - image-to-text
8
+ - video-understanding
9
+ - 3d-understanding
10
+ - qwen
11
+ - pytorch
12
+ frameworks:
13
+ - pytorch
14
+ pipeline_tag: image-text-to-text
15
+ library_name: transformers
16
+ ---
17
+
18
+ <div style="display: flex; align-items: center; justify-content: center;">
19
+ <h1 style="margin: 0; text-align: left;">
20
+ Hulu-Med: A Transparent Generalist Model towards Holistic Medical Vision-Language Understanding
21
+ </h1>
22
+ </div>
23
+ <div align="center">
24
+
25
+ [![Paper](https://img.shields.io/badge/Paper-arXiv-red)](https://arxiv.org/abs/2510.08668)
26
+ [![HuggingFace](https://img.shields.io/badge/🤗%20Hugging%20Face-Models-yellow)](https://huggingface.co/ZJU-AI4H/Hulu-Med)
27
+ [![ModelScope](https://img.shields.io/badge/ModelScope-Models-blue)](https://modelscope.cn/models/Med-Team/Hulu-Med)
28
+ [![License](https://img.shields.io/badge/License-Apache%202.0-green.svg)](LICENSE)
29
+ [![GitHub](https://img.shields.io/badge/GitHub-Code-blue?logo=github)](https://github.com/ZJUI-AI4H/Hulu-Med)
30
+ ![Total Downloads](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fhuggingface.co%2Fapi%2Fmodels%2FZJU-AI4H%2FHulu-Med-7B%3Fexpand%255B%255D%3DdownloadsAllTime&query=%24.downloadsAllTime&label=Total%20Downloads&color=blue)
31
+
32
+ [📄 Paper](https://arxiv.org/abs/2510.08668) | [🤗 Hulu-Med-4B](https://huggingface.co/ZJU-AI4H/Hulu-Med-4B) | [🤗 Hulu-Med-7B](https://huggingface.co/ZJU-AI4H/Hulu-Med-7B) | [🤗 Hulu-Med-14B](https://huggingface.co/ZJU-AI4H/Hulu-Med-14B) | [🤗 Hulu-Med-32B](https://huggingface.co/ZJU-AI4H/Hulu-Med-32B) | [🔮 ModelScope Models](https://modelscope.cn/models/Med-Team/Hulu-Med) | [📊 Demo](#demo)
33
+
34
+ </div>
35
+
36
+ ## 🔥 News
37
+ - **[2025-11-27]** ⚡ **Hulu-Med** is now compatible with the latest **vLLM**, offering **faster inference** and **tensor-parallel** support! Thank you all for your patience and feedback 💪 **[See here for installation](#vllm-install)**
38
+
39
+ - **[2025-11-18]** 🎊 We released **Hulu-Med-4B**, a lightweight model with strong multimodal and text reasoning abilities that surpasses **MedGemma-4B** and **Lingshu-7B**!
40
+
41
+ - **[2025-11-01]** 📊 We are releasing our new evaluation code, **MedUniEval**! Built on MedEvalKit, MedUniEval is designed for comprehensive evaluation of medical vision-language models across modalities, including text, 2D, 3D, and video. More benchmarks are coming soon.
42
+
43
+ - **[2025-10-15]** 🎉 Hulu-Med now supports Transformers integration! HuggingFace-compatible models have been released with simplified loading and inference; vLLM integration is ongoing. *The HF models are now available in the **main branch** on Hugging Face*.
44
+ - The model has been updated in the main branch of our Hugging Face repository. You can now load it directly using `AutoModelForCausalLM.from_pretrained` - the weights will be automatically downloaded.
45
+
46
+ - **[2025-10-08]** Hulu-Med models and inference code released!
47
+
48
+ ## 📖 Overview
49
+
50
+ **Hulu-Med** is a transparent medical vision-language model that unifies understanding across diverse modalities including **medical text, 2D/3D images, and videos**. Built with a focus on transparency and accessibility, Hulu-Med achieves state-of-the-art performance on 30 medical benchmarks while being trained entirely on public data.
51
+
52
+ <div align="center">
53
+ <img src="https://cdn-uploads.huggingface.co/production/uploads/68e4dbf1beab849e9baa6e26/ckBITEJ6W_VszDKujCaMW.jpeg" width="100%">
54
+ </div>
55
+
56
+ ### Key Features
57
+
58
+ - 🌟 **Holistic Multimodal Understanding**: Seamlessly processes medical text, 2D images, 3D volumes, and surgical videos
59
+ - 🔓 **Fully Transparent**: Complete open-source pipeline including data curation, training code, and model weights
60
+ - 📊 **State-of-the-Art Performance**: Outperforms leading open-source models and competes with proprietary systems
61
+ - ⚡ **Efficient Training**: Only 4,000-40,000 GPU hours required for 7B-32B variants
62
+ - 🗂️ **Comprehensive Coverage**: Trained on 16.7M samples spanning 12 anatomical systems and 14 imaging modalities
63
+ - 🤗 **Transformers Native**: Now with native HuggingFace Transformers support for easier integration
64
+
65
+ ### Comprehensive Data Coverage
66
+
67
+ Our training corpus encompasses:
68
+
69
+ - **12 Major Anatomical Systems**: Multi-System, Skin/Integumentary, Respiratory, Cellular/Tissue Level, Digestive, Nervous, Cardiovascular, Musculoskeletal, Reproductive, Urinary, Whole Body, Endocrine, Immune/Lymphatic, and Hematologic systems
70
+ - **14 Medical Imaging Modalities**: CT, MRI, X-Ray, Ultrasound, PET, OCT, Endoscopy, Microscopy, Histopathology, Fundus, Dermoscopy, Angiography, Digital Photograph, and Medical Chart
71
+ - **Diverse Downstream Tasks**: Medical Dialogue, Anomaly Detection, Prognosis Prediction, Treatment Planning, Surgical Skill Assessment, Education, Medical Report Generation, Surgical Phase Recognition, Medical Computation, and more
72
+
73
+ ## 🏆 Performance Highlights
74
+
75
+ ### Medical Multimodal Benchmarks
76
+
77
+ Performance comparison on medical multimodal benchmarks (For the 'Medical VLM < 10B' subgroup, **bold** indicates the best method):
78
+
79
+ | Models | OM.VQA | PMC-VQA | VQA-RAD | SLAKE | PathVQA | MedXQA | MMMU-Med |
80
+ |--------|--------|---------|---------|-------|---------|--------|----------|
81
+ | **Proprietary Models** |
82
+ | GPT-4.1 | 75.5 | 55.2 | 65.0 | 72.2 | 55.5 | 45.2 | 75.2 |
83
+ | GPT-4o | 67.5 | 49.7 | 61.0 | 71.2 | 55.5 | 44.3 | 62.8 |
84
+ | Claude Sonnet 4 | 65.5 | 54.4 | 67.6 | 70.6 | 54.2 | 43.3 | 74.6 |
85
+ | Gemini-2.5-Flash | 71.0 | 55.4 | 68.5 | 75.8 | 55.4 | 52.8 | 76.9 |
86
+ | **General VLMs < 10B** |
87
+ | Qwen2.5VL-7B | 63.6 | 51.9 | 63.2 | 66.8 | 44.1 | 20.1 | 50.6 |
88
+ | InternVL2.5-8B | 81.3 | 51.3 | 59.4 | 69.0 | 42.1 | 21.7 | 53.5 |
89
+ | InternVL3-8B | 79.1 | 53.8 | 65.4 | 72.8 | 48.6 | 22.4 | 59.2 |
90
+ | **General VLMs > 10B** |
91
+ | InternVL3-14B | 78.9 | 54.1 | 66.3 | 72.8 | 48.0 | 23.1 | 63.1 |
92
+ | Qwen2.5VL-32B | 68.2 | 54.5 | 71.8 | 71.2 | 41.9 | 25.2 | 59.6 |
93
+ | InternVL3-38B | 79.8 | 56.6 | 65.4 | 72.7 | 51.0 | 25.2 | 65.2 |
94
+ | **Medical VLMs < 10B** |
95
+ | LLaVA-Med-7B | 34.8 | 22.7 | 46.6 | 51.9 | 35.2 | 20.8 | 28.1 |
96
+ | MedGemma-4B | 70.7 | 49.2 | 72.3 | 78.2 | 48.1 | 25.4 | 43.2 |
97
+ | HuatuoGPT-V-7B | 74.3 | 53.1 | 67.6 | 68.1 | 44.8 | 23.2 | 49.8 |
98
+ | Lingshu-7B | 82.9 | 56.3 | 67.9 | 83.1 | 61.9 | 26.7 | - |
99
+ | **Hulu-Med-4B** | **81.6** | **64.6** | **71.6** | **85.0** | **60.1** | **26.4** | **50.5** |
100
+ | **Hulu-Med-7B** | **84.2** | **66.8** | **78.0** | **86.8** | **65.6** | **29.0** | **51.4** |
101
+ | **Medical VLMs > 10B** |
102
+ | HealthGPT-14B | 75.2 | 56.4 | 65.0 | 66.1 | 56.7 | 24.7 | 49.6 |
103
+ | HuatuoGPT-V-34B | 74.0 | 56.6 | 61.4 | 69.5 | 44.4 | 22.1 | 51.8 |
104
+ | Lingshu-32B | 83.4 | 57.9 | 76.7 | 86.7 | 65.5 | 30.9 | - |
105
+ | **Hulu-Med-14B** | **85.1** | **68.9** | **76.1** | **86.5** | **64.4** | **30.0** | **54.8** |
106
+ | **Hulu-Med-32B** | **84.6** | **69.4** | **81.4** | **85.7** | **67.3** | **34.0** | **60.4** |
107
+
108
+ ### Medical Text Benchmarks
109
+
110
+ Performance comparison on medical text benchmarks (**bold** indicates the best method in each subgroup):
111
+
112
+ | Models | MMLU-Pro | MedXQA | Medbullets | SGPQA | PubMedQA | MedMCQA | MedQA | MMLU-Med |
113
+ |--------|----------|--------|------------|-------|----------|---------|-------|----------|
114
+ | **Proprietary Models** |
115
+ | GPT-4.1 | 78.0 | 30.9 | 77.0 | 49.9 | 75.6 | 77.7 | 89.1 | 89.6 |
116
+ | o3-mini | 78.1 | 35.4 | 83.7 | 50.1 | 73.6 | 60.6 | 74.5 | 87.0 |
117
+ | Claude Sonnet 4 | 79.5 | 33.6 | 80.2 | 56.3 | 78.6 | 79.3 | 92.1 | 91.3 |
118
+ | Gemini-2.5-Flash | 70.0 | 35.6 | 77.6 | 53.3 | 73.8 | 73.6 | 91.2 | 84.2 |
119
+ | **General VLMs < 10B** |
120
+ | Qwen2.5VL-7B | 50.5 | 12.8 | 42.1 | 26.3 | 76.4 | 52.6 | 57.3 | 73.4 |
121
+ | InternVL2.5-8B | 50.6 | 11.6 | 42.4 | 26.1 | 76.4 | 52.4 | 53.7 | 74.2 |
122
+ | InternVL3-8B | 57.9 | 13.1 | 48.5 | 31.2 | 75.4 | 57.7 | 62.1 | 77.5 |
123
+ | **General VLMs > 10B** |
124
+ | Qwen2.5VL-32B | 66.5 | 15.6 | 54.2 | 37.6 | 68.4 | 63.0 | 71.6 | 83.2 |
125
+ | InternVL3-14B | 65.4 | 14.1 | 49.5 | 37.9 | 77.2 | 62.0 | 70.1 | 81.7 |
126
+ | InternVL3-38B | 72.1 | 16.0 | 54.6 | 42.5 | 73.2 | 64.9 | 73.5 | 83.8 |
127
+ | **Medical VLMs < 10B** |
128
+ | LLaVA-Med-7B | 16.6 | 9.9 | 34.4 | 16.1 | 26.4 | 39.4 | 42.0 | 50.6 |
129
+ | MedGemma-4B | 38.6 | 12.8 | 45.6 | 21.6 | 72.2 | 52.2 | 56.2 | 66.7 |
130
+ | HuatuoGPT-V-7B | 44.6 | 10.1 | 40.9 | 21.9 | 72.8 | 51.2 | 52.9 | 69.3 |
131
+ | Lingshu-7B | 50.4 | 16.5 | 56.2 | 26.3 | 76.6 | 55.9 | 63.3 | 74.5 |
132
+ | **Hulu-Med-4B** | **58.6** | **16.8** | **59.4** | **29.5** | **77.6** | **64.8** | **71.9** | **78.6** |
133
+ | **Hulu-Med-7B** | **60.6** | **19.6** | **61.5** | **31.1** | **77.4** | **67.6** | **73.5** | **79.5** |
134
+ | **Medical VLMs > 10B** |
135
+ | HealthGPT-14B | 63.4 | 11.3 | 39.8 | 25.7 | 68.0 | 63.4 | 66.2 | 80.2 |
136
+ | Lingshu-32B | 70.2 | 22.7 | 65.4 | 41.1 | 77.8 | 66.1 | 74.7 | 84.7 |
137
+ | HuatuoGPT-V-34B | 51.8 | 11.4 | 42.7 | 26.5 | 72.2 | 54.7 | 58.8 | 74.7 |
138
+ | **Hulu-Med-14B** | **68.0** | **23.2** | **68.5** | **37.7** | **79.8** | **70.4** | **78.1** | **83.3** |
139
+ | **Hulu-Med-32B** | **72.9** | **24.2** | **68.8** | **41.8** | **80.8** | **72.8** | **80.4** | **85.6** |
140
+
141
+ ## 🚀 Model Zoo
142
+
143
+ We provide three model variants with different parameter scales:
144
+
145
+ | Model | Parameters | LLM Base | Training Cost | HuggingFace | ModelScope |
146
+ |-------|-----------|----------|---------------|-------------|------------|
147
+ | **Hulu-Med-7B** | 7B | Qwen2.5-7B | ~4,000 GPU hours | [🤗 Link](https://huggingface.co/ZJU-AI4H/Hulu-Med-7B) | [🔮 Link](https://modelscope.cn/models/Med-Team/Hulu-Med-7B) |
148
+ | **Hulu-Med-14B** | 14B | Qwen3-14B | ~8,000 GPU hours | [🤗 Link](https://huggingface.co/ZJU-AI4H/Hulu-Med-14B) | [🔮 Link](https://modelscope.cn/models/Med-Team/Hulu-Med-14B) |
149
+ | **Hulu-Med-32B** | 32B | Qwen2.5-32B | ~40,000 GPU hours | [🤗 Link](https://huggingface.co/ZJU-AI4H/Hulu-Med-32B) | [🔮 Link](https://modelscope.cn/models/Med-Team/Hulu-Med-32B) |
150
+
151
+ **Note**: HuggingFace-compatible versions (Hulu-Med-HF) are also available for easier integration with the Transformers library.
152
+
153
+ ## 🛠️ Installation
154
+
155
+ ```bash
156
+ # Clone the repository
157
+ git clone https://github.com/ZJUI-AI4H/Hulu-Med.git
158
+ cd Hulu-Med
159
+
160
+ # Create conda environment
161
+ conda create -n hulumed python=3.10
162
+ conda activate hulumed
163
+
164
+ # PyTorch and torchvision for CUDA 11.8
165
+ pip install torch==2.4.0 torchvision==0.19.0 --extra-index-url https://download.pytorch.org/whl/cu118
166
+
167
+ # Flash-attn pinned to a compatible version
168
+ pip install flash-attn==2.7.3 --no-build-isolation --upgrade
169
+
170
+ # Transformers and accelerate
171
+ pip install transformers==4.51.2 accelerate==1.7.0
172
+
173
+ # Video processing dependencies
174
+ pip install decord ffmpeg-python imageio opencv-python
175
+
176
+ # For 3D medical image processing (NIfTI files)
177
+ pip install nibabel
178
+
179
+ # Install other dependencies
180
+ pip install -r requirements.txt
181
+ ```
182
+
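+ After installing, a quick sanity check helps confirm that the pinned versions and CUDA are visible. This optional snippet is not part of the official setup:
+
+ ```python
+ # Optional sanity check for the environment installed above.
+ import torch
+ import transformers
+
+ print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
+ print("transformers:", transformers.__version__)  # expected 4.51.2 per the pinned install
+
+ try:
+     import flash_attn
+     print("flash-attn:", flash_attn.__version__)  # expected 2.7.3
+ except ImportError:
+     print("flash-attn missing; attn_implementation='flash_attention_2' will not be usable")
+ ```
+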
183
+ <a id="vllm-install"></a>
184
+ ### 🧩 vLLM Installation
185
+
186
+ ```bash
187
+ pip install git+https://github.com/jiangsongtao/vllm.git
188
+
189
+ # or try this way
190
+ git clone https://github.com/jiangsongtao/vllm.git
191
+ cd vllm
192
+ export VLLM_USE_PRECOMPILED=1
193
+ rm -rf build/ .deps/
194
+ pip install -e .
195
+ pip uninstall flash-attn -y
196
+ pip install flash-attn --no-build-isolation
197
+ ```
198
+
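+ Once the fork is installed, text-only inference can go through vLLM's standard offline API. The sketch below is a minimal, unofficial example that assumes the fork registers the Hulu-Med architecture when `trust_remote_code=True`; for image, video, or 3D inputs, follow the fork's own multimodal input format:
+
+ ```python
+ from vllm import LLM, SamplingParams
+
+ # Minimal text-only sketch with the patched vLLM fork above.
+ # tensor_parallel_size shards the model across GPUs; adjust to your hardware.
+ llm = LLM(
+     model="ZJU-AI4H/Hulu-Med-7B",
+     trust_remote_code=True,
+     tensor_parallel_size=2,
+     dtype="bfloat16",
+ )
+
+ sampling_params = SamplingParams(temperature=0.6, top_p=0.8, max_tokens=512)
+ outputs = llm.generate(["What are common causes of chest pain?"], sampling_params)
+ print(outputs[0].outputs[0].text)
+ ```
+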
199
+
200
+ ## 💻 Quick Start
201
+
202
+ We provide two ways to use Hulu-Med:
203
+
204
+ ### Option 1: Using HuggingFace Transformers (Recommended for Hulu-Med-HF models)
205
+
206
+ For easier integration, use the HuggingFace-compatible models with native Transformers support:
207
+
208
+ ```python
209
+ from transformers import AutoModelForCausalLM, AutoProcessor
210
+ import torch
211
+
212
+ model_path = "ZJU-AI4H/Hulu-Med-32B"
213
+
214
+ # Load model and processor
215
+ model = AutoModelForCausalLM.from_pretrained(
216
+ model_path,
217
+ trust_remote_code=True,
218
+ torch_dtype="bfloat16",
219
+ device_map="auto",
220
+ attn_implementation="flash_attention_2",
221
+ )
222
+
223
+ processor = AutoProcessor.from_pretrained(
224
+ model_path,
225
+ trust_remote_code=True
226
+ )
227
+
228
+ tokenizer = processor.tokenizer
229
+ ```
230
+
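+ In bfloat16 the 32B checkpoint needs roughly 64 GB of GPU memory, so `device_map="auto"` will shard it across the available GPUs. For a single smaller GPU, 4-bit loading through `bitsandbytes` is one possible fallback; the snippet below is an untested sketch, not an officially supported path:
+
+ ```python
+ from transformers import AutoModelForCausalLM, AutoProcessor, BitsAndBytesConfig
+ import torch
+
+ # Untested sketch: 4-bit quantized loading for memory-constrained GPUs.
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_compute_dtype=torch.bfloat16,
+ )
+
+ model = AutoModelForCausalLM.from_pretrained(
+     "ZJU-AI4H/Hulu-Med-7B",
+     trust_remote_code=True,
+     quantization_config=bnb_config,
+     device_map="auto",
+ )
+ processor = AutoProcessor.from_pretrained("ZJU-AI4H/Hulu-Med-7B", trust_remote_code=True)
+ tokenizer = processor.tokenizer
+ ```
+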
231
+ #### Text-Only Example
232
+
233
+ ```python
234
+ conversation = [
235
+ {
236
+ "role": "user",
237
+ "content": [
238
+ {"type": "text", "text": "Hello, I have a headache, what should I eat?"},
239
+ ]
240
+ }
241
+ ]
242
+
243
+ modal = 'text'
244
+ inputs = processor(
245
+ conversation=conversation,
246
+ return_tensors="pt",
247
+ add_generation_prompt=True
248
+ )
249
+
250
+ inputs = {k: v.to(model.device) if isinstance(v, torch.Tensor) else v
251
+ for k, v in inputs.items()}
252
+
253
+ with torch.inference_mode():
254
+ output_ids = model.generate(
255
+ **inputs,
256
+ do_sample=True,
257
+ modals=[modal],
258
+ temperature=0.6,
259
+ max_new_tokens=4096,
260
+ use_cache=True,
261
+ pad_token_id=tokenizer.eos_token_id,
262
+ )
263
+
264
+ # Decode output
265
+ # Enable thinking mode by adding: "Please reason step by step, and put your final answer within \boxed{}."
266
+ # use_think=False: Only return the final answer without thinking process
267
+ # use_think=True: Include the model's reasoning/thinking process in the output
268
+ outputs = processor.batch_decode(
269
+ output_ids,
270
+ skip_special_tokens=True,
271
+ use_think=False # Set to True to see the thinking process
272
+ )[0].strip()
273
+ print(outputs)
274
+ ```
275
+
276
+ #### 2D Image Example
277
+
278
+ ```python
279
+ conversation = [
280
+ {
281
+ "role": "user",
282
+ "content": [
283
+ {
284
+ "type": "image",
285
+ "image": {
286
+ "image_path": "./demo/demo.jpg",
287
+ }
288
+ },
289
+ {
290
+ "type": "text",
291
+ "text": "Generate a medical report for this image."
292
+ },
293
+ ]
294
+ }
295
+ ]
296
+
297
+ inputs = processor(
298
+ conversation=conversation,
299
+ add_system_prompt=True,
300
+ add_generation_prompt=True,
301
+ return_tensors="pt"
302
+ )
303
+
304
+ inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v
305
+ for k, v in inputs.items()}
306
+
307
+ if "pixel_values" in inputs:
308
+ inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
309
+
310
+ output_ids = model.generate(**inputs, max_new_tokens=1024)
311
+ outputs = processor.batch_decode(
312
+ output_ids,
313
+ skip_special_tokens=True,
314
+ use_think=False
315
+ )[0].strip()
316
+ print(outputs)
317
+ ```
318
+ #### Multi-Image Example
319
+ ```python
320
+ conversation = [
321
+ {
322
+ "role": "user",
323
+ "content": [
324
+ {
325
+ "type": "image",
326
+ "image": {
327
+ "image_path": "./demo/demo1.jpg",
328
+ }
329
+ },
330
+ {
331
+ "type": "image",
332
+ "image": {
333
+ "image_path": "./demo/demo2.jpg",
334
+ }
335
+ },
336
+ {
337
+ "type": "text",
338
+ "text": "Are these two images the same?"
339
+ },
340
+ ]
341
+ }
342
+ ]
343
+
344
+ inputs = processor(
345
+ conversation=conversation,
346
+ add_system_prompt=True,
347
+ add_generation_prompt=True,
348
+ return_tensors="pt"
349
+ )
350
+
351
+ inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v
352
+ for k, v in inputs.items()}
353
+ if "pixel_values" in inputs:
354
+ inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
355
+
356
+ output_ids = model.generate(**inputs, max_new_tokens=1024)
357
+ outputs_no_think = processor.batch_decode(
358
+ output_ids,
359
+ skip_special_tokens=True,
360
+ use_think=False
361
+ )[0].strip()
362
+ print(outputs_no_think)
363
+ ```
364
+ #### Interleaved Example
365
+ ```python
366
+ conversation = [
367
+ {
368
+ "role": "user",
369
+ "content": [
370
+ {
371
+ "type": "text",
372
+ "text": "Image A:"
373
+ },
374
+ {
375
+ "type": "image",
376
+ "image": {
377
+ "image_path": "./demo/XRay.jpg",
378
+ }
379
+ },
380
+ {
381
+ "type": "text",
382
+ "text": "Image B:"
383
+ },
384
+ {
385
+ "type": "image",
386
+ "image": {
387
+ "image_path": "./demo/pathology.png",
388
+ }
389
+ },
390
+ {
391
+ "type": "text",
392
+ "text": "Which image is the pathology slide?"
393
+ },
394
+ ]
395
+ }
396
+ ]
397
+
398
+ inputs = processor(
399
+ conversation=conversation,
400
+ add_system_prompt=True,
401
+ add_generation_prompt=True,
402
+ return_tensors="pt"
403
+ )
404
+
405
+ inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v
406
+ for k, v in inputs.items()}
407
+ if "pixel_values" in inputs:
408
+ inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
409
+
410
+ output_ids = model.generate(**inputs, max_new_tokens=1024)
411
+ outputs_no_think = processor.batch_decode(
412
+ output_ids,
413
+ skip_special_tokens=True,
414
+ use_think=False
415
+ )[0].strip()
416
+ print(outputs_no_think)
417
+ # The pathology slide is Image B. It shows a microscopic view of tissue with various cellular structures and components, such as cells in different stages of maturation and areas of fibrous tissue. This type of image is typically used to examine the cellular architecture and identify any pathological changes within the tissue.
418
+ ```
419
+
420
+ #### 3D Medical Image Example
421
+
422
+ ```python
423
+ # Requires: pip install nibabel
424
+
425
+ conversation = [
426
+ {
427
+ "role": "user",
428
+ "content": [
429
+ {
430
+ "type": "3d",
431
+ "3d": {
432
+ "image_path": "./demo/amos_0013.nii",
433
+ "nii_num_slices": 180,
434
+ "nii_axis": 2, # 0=sagittal, 1=coronal, 2=axial
435
+ }
436
+ },
437
+ {
438
+ "type": "text",
439
+ "text": "Generate a medical report for this 3D CT scan."
440
+ },
441
+ ]
442
+ }
443
+ ]
444
+
445
+ inputs = processor(
446
+ conversation=conversation,
447
+ add_system_prompt=True,
448
+ add_generation_prompt=True,
449
+ return_tensors="pt"
450
+ )
451
+
452
+ inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v
453
+ for k, v in inputs.items()}
454
+
455
+ if "pixel_values" in inputs:
456
+ inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
457
+
458
+ output_ids = model.generate(**inputs, max_new_tokens=1024)
459
+ outputs = processor.batch_decode(
460
+ output_ids,
461
+ skip_special_tokens=True,
462
+ use_think=False
463
+ )[0].strip()
464
+ print(outputs)
465
+ ```
466
+
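+ Choosing `nii_num_slices` and `nii_axis` is easier after inspecting the volume. Below is a small check with `nibabel` (already required above); axis conventions vary between datasets, so treat the comment as a rule of thumb:
+
+ ```python
+ import nibabel as nib
+
+ # Inspect the NIfTI volume before choosing nii_axis / nii_num_slices.
+ img = nib.load("./demo/amos_0013.nii")
+ print("volume shape:", img.shape)               # e.g. (X, Y, Z); axis 2 is often axial
+ print("voxel spacing:", img.header.get_zooms())
+
+ axis = 2
+ print("slices along axis", axis, ":", img.shape[axis])  # upper bound for nii_num_slices
+ ```
+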
467
+ #### Video Example
468
+
469
+ ```python
470
+ conversation = [
471
+ {
472
+ "role": "user",
473
+ "content": [
474
+ {
475
+ "type": "video",
476
+ "video": {
477
+ "video_path": "./demo/1min_demo.mp4",
478
+ "fps": 1,
479
+ "max_frames": 1800
480
+ }
481
+ },
482
+ {
483
+ "type": "text",
484
+ "text": "Describe this video in detail."
485
+ },
486
+ ]
487
+ }
488
+ ]
489
+
490
+ inputs = processor(
491
+ conversation=conversation,
492
+ add_system_prompt=True,
493
+ add_generation_prompt=True,
494
+ return_tensors="pt"
495
+ )
496
+
497
+ inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v
498
+ for k, v in inputs.items()}
499
+
500
+ if "pixel_values" in inputs:
501
+ inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
502
+
503
+ output_ids = model.generate(**inputs, max_new_tokens=1024)
504
+ outputs = processor.batch_decode(
505
+ output_ids,
506
+ skip_special_tokens=True,
507
+ use_think=False
508
+ )[0].strip()
509
+ print(outputs)
510
+ ```
511
+
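+ With `fps: 1` the processor samples one frame per second, capped by `max_frames`, so a 1-minute clip yields roughly 60 frames. To check a video's duration and native frame rate before picking these values, `decord` (installed above) works well:
+
+ ```python
+ from decord import VideoReader
+
+ # Inspect the clip to choose fps / max_frames for the processor.
+ vr = VideoReader("./demo/1min_demo.mp4")
+ native_fps = vr.get_avg_fps()
+ duration_s = len(vr) / native_fps
+ print(f"{len(vr)} frames at {native_fps:.1f} fps, about {duration_s:.1f} s")
+ print("frames sampled at fps=1:", int(duration_s))  # bounded by max_frames
+ ```
+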
512
+ **Understanding the `use_think` parameter:**
513
+ - `use_think=False`: Returns only the final answer (default for most use cases)
514
+ - `use_think=True`: Includes the model's internal reasoning/thinking process before the final answer (see the short decoding sketch below)
515
+
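+ For example, the same generation can be decoded both ways. The reasoning suffix quoted in the text-only example ("Please reason step by step, and put your final answer within \boxed{}.") encourages an explicit reasoning trace:
+
+ ```python
+ # Continuing from any example above (processor and output_ids already defined).
+ answer_only = processor.batch_decode(
+     output_ids, skip_special_tokens=True, use_think=False
+ )[0].strip()
+
+ with_reasoning = processor.batch_decode(
+     output_ids, skip_special_tokens=True, use_think=True
+ )[0].strip()
+
+ print(answer_only)      # final answer only
+ print(with_reasoning)   # reasoning trace followed by the final answer
+ ```
+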
516
+
517
+ ## 📊 Training
518
+
519
+ ### Data Preparation
520
+
521
+ Our training data consists of 16.7M samples across four categories:
522
+
523
+ - **Medical Multimodal Data** (9M samples): Covering 14 imaging modalities
524
+ - **Medical Text Data** (4.9M samples): Clinical notes, literature, QA pairs
525
+ - **General Multimodal Data** (1.3M samples): Enhancing generalization
526
+ - **General Text Data** (1.5M samples): Improving reasoning capabilities
527
+
528
+ Download and prepare the data:
529
+ Coming soon
530
+
531
+ ## 🏗️ Model Architecture
532
+
533
+ Hulu-Med consists of four core components:
534
+
535
+ 1. **Vision Encoder**: SigLIP-based encoder with 2D RoPE for unified 2D/3D/video processing
536
+ 2. **Multimodal Projector**: Projects visual tokens into language model space
537
+ 3. **LLM Decoder**: Qwen-based decoder for generating responses
538
+ 4. **Medical-Aware Token Reduction**: Efficient processing with ~55% token reduction (a toy illustration of the general idea follows below)
539
+
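+ The actual reduction algorithm is described in the paper. Purely to illustrate the general idea (this is **not** the Hulu-Med implementation), redundant visual tokens can be dropped when consecutive tokens are nearly identical:
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def drop_redundant_tokens(tokens: torch.Tensor, threshold: float = 0.95) -> torch.Tensor:
+     """Toy illustration only: keep a token if it differs enough from the last kept one.
+
+     tokens: (num_tokens, hidden_dim) visual tokens, e.g. from consecutive video frames.
+     """
+     kept = [tokens[0]]
+     for tok in tokens[1:]:
+         if F.cosine_similarity(tok, kept[-1], dim=0) < threshold:  # different enough -> keep
+             kept.append(tok)
+     return torch.stack(kept)
+
+ # 100 highly similar tokens collapse to a handful.
+ x = torch.randn(1, 64) + 0.01 * torch.randn(100, 64)
+ print(drop_redundant_tokens(x).shape)
+ ```
+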
540
+ ## 📋 Supported Tasks
541
+
542
+ - ✅ Visual Question Answering (2D/3D/Video)
543
+ - ✅ Medical Report Generation
544
+ - ✅ Disease Diagnosis
545
+ - ✅ Anatomical Understanding
546
+ - ✅ Surgical Phase Recognition
547
+ - ✅ Clinical Dialogue
548
+ - ✅ Medical Text Reasoning
549
+ - ✅ Multilingual Medical QA
550
+ - ✅ Rare Disease Diagnosis
551
+ - ✅ And more
552
+
553
+ ## 📄 Citation
554
+
555
+ If you find Hulu-Med useful in your research, please cite:
556
+ ```bibtex
557
+ @misc{jiang2025hulumedtransparentgeneralistmodel,
558
+ title={Hulu-Med: A Transparent Generalist Model towards Holistic Medical Vision-Language Understanding},
559
+ author={Songtao Jiang and Yuan Wang and Sibo Song and Tianxiang Hu and Chenyi Zhou and Bin Pu and Yan Zhang and Zhibo Yang and Yang Feng and Joey Tianyi Zhou and Jin Hao and Zijian Chen and Ruijia Wu and Tao Tang and Junhui Lv and Hongxia Xu and Hongwei Wang and Jun Xiao and Bin Feng and Fudong Zhu and Kenli Li and Weidi Xie and Jimeng Sun and Jian Wu and Zuozhu Liu},
560
+ year={2025},
561
+ eprint={2510.08668},
562
+ archivePrefix={arXiv},
563
+ primaryClass={cs.CV},
564
+ url={https://arxiv.org/abs/2510.08668},
565
+ }
566
+ ```
567
+
568
+ ## 📜 License
569
+
570
+ This project is released under the [Apache 2.0 License](LICENSE).
added_tokens.json ADDED
@@ -0,0 +1,27 @@
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<image>": 151665,
4
+ "<tool_call>": 151657,
5
+ "<|box_end|>": 151649,
6
+ "<|box_start|>": 151648,
7
+ "<|endoftext|>": 151643,
8
+ "<|file_sep|>": 151664,
9
+ "<|fim_middle|>": 151660,
10
+ "<|fim_pad|>": 151662,
11
+ "<|fim_prefix|>": 151659,
12
+ "<|fim_suffix|>": 151661,
13
+ "<|im_end|>": 151645,
14
+ "<|im_start|>": 151644,
15
+ "<|image_pad|>": 151655,
16
+ "<|object_ref_end|>": 151647,
17
+ "<|object_ref_start|>": 151646,
18
+ "<|quad_end|>": 151651,
19
+ "<|quad_start|>": 151650,
20
+ "<|repo_name|>": 151663,
21
+ "<|stream_end|>": 151667,
22
+ "<|stream_start|>": 151666,
23
+ "<|video_pad|>": 151656,
24
+ "<|vision_end|>": 151653,
25
+ "<|vision_pad|>": 151654,
26
+ "<|vision_start|>": 151652
27
+ }
chat_template.json ADDED
@@ -0,0 +1,3 @@
1
+ {
2
+ "chat_template": "\n{%- set identifier = 'im' %}\n{% for message in messages %}\n {% if add_system_prompt and loop.first and message['role'] != 'system' %}\n {{- '<|im_start|>system\nYou are Hulu-Med, a helpful health assistant that can understand text, 2D images, videos, and 3D images.<|im_end|>\n' -}}\n {% endif %}\n {% if message['role'] == 'stream' %}\n {% set identifier = 'stream' %}\n {% else %}\n {% set identifier = 'im' %}\n {% endif %}\n {{- '<|' + identifier + '_start|>' + message['role'] + '\n' -}}\n {% if message['content'] is string %}\n {{- message['content'] + '<|' + identifier + '_end|>\n' -}}\n {% else %}\n {% for content in message['content'] %}\n {% if content is string %}\n {{- content -}}\n {% elif content['type'] == 'text' or 'text' in content %}\n {{- content['text'] -}}\n {% elif content['type'] == 'image' or 'image' in content %}\n {% if 'timestamp' in content %}\n {{- 'Time ' + content['timestamp'] | round(1) | string + 's: ' -}}\n {% endif %}\n {{- image_token + '\n' -}}\n {% elif content['type'] == 'video' or 'video' in content %}\n {% for i in range(content['num_frames']) %}\n {% if 'timestamps' in content %}\n {{- 'Time ' + content['timestamps'][i] | round(1) | string + 's:' -}}\n {% endif %}\n {% if i < content['num_frames'] - 1 %}\n {{- image_token + ',' -}}\n {% else %}\n {{- image_token + '\n' -}}\n {% endif %}\n {% endfor %}\n {% endif %}\n {% endfor %}\n {% if identifier == 'stream' %}\n {{- '<|' + identifier + '_end|>' -}}\n {% else %}\n {{- '<|' + identifier + '_end|>\n' -}}\n {% endif %}\n {% endif %}\n{% endfor %}\n{% if add_generation_prompt %}\n {{- '<|im_start|>assistant\n' -}}\n{% endif %}\n"
3
+ }
config.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "architectures": [
3
+ "HulumedQwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_hulumed_qwen2.HulumedQwen2Config",
8
+ "AutoModelForCausalLM": "modeling_hulumed_qwen2.HulumedQwen2ForCausalLM"
9
+ },
10
+ "bos_token_id": 151643,
11
+ "eos_token_id": 151645,
12
+ "hidden_act": "silu",
13
+ "hidden_size": 3584,
14
+ "image_token_index": 151665,
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": 18944,
17
+ "max_position_embeddings": 32768,
18
+ "max_window_layers": 28,
19
+ "mm_projector_type": "mlp2x_gelu",
20
+ "model_type": "hulumed_qwen2",
21
+ "num_attention_heads": 28,
22
+ "num_hidden_layers": 28,
23
+ "num_key_value_heads": 4,
24
+ "rms_norm_eps": 1e-06,
25
+ "rope_scaling": null,
26
+ "rope_theta": 1000000.0,
27
+ "sliding_window": null,
28
+ "tie_word_embeddings": false,
29
+ "torch_dtype": "bfloat16",
30
+ "transformers_version": "4.51.2",
31
+ "use_cache": true,
32
+ "use_sliding_window": false,
33
+ "use_token_compression": false,
34
+ "vision_encoder_config": {
35
+ "hidden_size": 1152,
36
+ "intermediate_size": 4304,
37
+ "model_type": "hulumed_vision_encoder",
38
+ "num_attention_heads": 16,
39
+ "num_hidden_layers": 27,
40
+ "patch_size": 14
41
+ },
42
+ "vocab_size": 152064
43
+ }
44
+
configuration.json ADDED
@@ -0,0 +1 @@
1
+ {"framework": "pytorch", "task": "text-generation", "allow_remote": true}
configuration_hulumed_encoder.py ADDED
@@ -0,0 +1,49 @@
1
+ # Adopted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/siglip/configuration_siglip.py.
2
+ # Below is the original copyright:
3
+ # coding=utf-8
4
+ # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+ """HuluMed vision encoder model configuration."""
18
+
19
+ from transformers import PretrainedConfig
20
+
21
+
22
+ class HulumedVisionEncoderConfig(PretrainedConfig):
23
+
24
+ model_type = "hulumed_vision_encoder"
25
+
26
+ def __init__(
27
+ self,
28
+ hidden_size=768,
29
+ intermediate_size=3072,
30
+ num_hidden_layers=12,
31
+ num_attention_heads=12,
32
+ num_channels=3,
33
+ patch_size=16,
34
+ hidden_act="gelu_pytorch_tanh",
35
+ layer_norm_eps=1e-6,
36
+ attention_dropout=0.0,
37
+ **kwargs,
38
+ ):
39
+ super().__init__(**kwargs)
40
+
41
+ self.hidden_size = hidden_size
42
+ self.intermediate_size = intermediate_size
43
+ self.num_hidden_layers = num_hidden_layers
44
+ self.num_attention_heads = num_attention_heads
45
+ self.num_channels = num_channels
46
+ self.patch_size = patch_size
47
+ self.attention_dropout = attention_dropout
48
+ self.layer_norm_eps = layer_norm_eps
49
+ self.hidden_act = hidden_act
configuration_hulumed_qwen2.py ADDED
@@ -0,0 +1,83 @@
1
+ """HuluMed model configuration."""
2
+
3
+ import importlib.util
4
+ import os.path as osp
5
+ from typing import Optional, Dict, Any
6
+
7
+ from transformers import AutoConfig, AutoModel, PretrainedConfig, Qwen2Config
8
+
9
+ try:
10
+ from .configuration_hulumed_encoder import HulumedVisionEncoderConfig
11
+ except ModuleNotFoundError:
12
+ spec = importlib.util.spec_from_file_location(
13
+ "configuration_hulumed_encoder",
14
+ osp.join(osp.dirname(__file__), "configuration_hulumed_encoder.py"),
15
+ )
16
+ configuration_hulumed_encoder = importlib.util.module_from_spec(spec)
17
+ spec.loader.exec_module(configuration_hulumed_encoder)
18
+ HulumedVisionEncoderConfig = getattr(
19
+ configuration_hulumed_encoder,
20
+ "HulumedVisionEncoderConfig",
21
+ )
22
+
23
+ try:
24
+ from .modeling_hulumed_encoder import HulumedVisionEncoderModel
25
+ except ModuleNotFoundError:
26
+ spec = importlib.util.spec_from_file_location(
27
+ "modeling_hulumed_encoder",
28
+ osp.join(osp.dirname(__file__), "modeling_hulumed_encoder.py"),
29
+ )
30
+ modeling_hulumed_encoder = importlib.util.module_from_spec(spec)
31
+ spec.loader.exec_module(modeling_hulumed_encoder)
32
+ HulumedVisionEncoderModel = getattr(
33
+ modeling_hulumed_encoder,
34
+ "HulumedVisionEncoderModel",
35
+ )
36
+
37
+ AutoConfig.register("hulumed_vision_encoder", HulumedVisionEncoderConfig)
38
+ AutoModel.register(HulumedVisionEncoderConfig, HulumedVisionEncoderModel)
39
+
40
+
41
+ class HulumedQwen2Config(Qwen2Config):
42
+ """
43
+ HuluMed model configuration.
44
+
45
+ This configuration class extends Qwen2Config to store the configuration of a HuluMed model.
46
+ It includes configuration for the vision encoder and multimodal projector.
47
+ """
48
+
49
+ model_type = "hulumed_qwen2"
50
+ sub_configs = {"vision_encoder_config": HulumedVisionEncoderConfig}
51
+
52
+ def __init__(
53
+ self,
54
+ vision_encoder: Optional[str] = None,
55
+ vision_encoder_config: Dict[str, Any] = {},
56
+ mm_projector_type: str = "mlp2x_gelu",
57
+ use_token_compression: bool = True,
58
+ image_token_index: int = -1,
59
+ **kwargs,
60
+ ):
61
+ """
62
+ Initialize HuluMed configuration.
63
+
64
+ Args:
65
+ vision_encoder (str, optional): Path or identifier of the vision encoder.
66
+ vision_encoder_config (dict, optional): Configuration for the vision encoder.
67
+ mm_projector_type (str): Type of multimodal projector. Default is "mlp2x_gelu".
68
+ use_token_compression (bool): Whether to use token compression for videos. Default is True.
69
+ image_token_index (int): Token index for image placeholders. Default is -1.
70
+ **kwargs: Additional arguments passed to Qwen2Config.
71
+ """
72
+ super().__init__(**kwargs)
73
+ self.model_type = "hulumed_qwen2"
74
+
75
+ self.vision_encoder = vision_encoder
76
+
77
+ if vision_encoder_config is not None and not isinstance(vision_encoder_config, PretrainedConfig):
78
+ vision_encoder_config = HulumedVisionEncoderConfig(**vision_encoder_config)
79
+
80
+ self.vision_encoder_config = vision_encoder_config
81
+ self.mm_projector_type = mm_projector_type
82
+ self.use_token_compression = use_token_compression
83
+ self.image_token_index = image_token_index
generation_config.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.05,
10
+ "temperature": 0.7,
11
+ "top_k": 20,
12
+ "top_p": 0.8,
13
+ "transformers_version": "4.51.2"
14
+ }
image_processing_hulumed.py ADDED
@@ -0,0 +1,485 @@
1
+ # Adopted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py.
2
+ # Below is the original copyright:
3
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
6
+ # and OPT implementations in this library. It has been modified from its
7
+ # original forms to accommodate minor architectural differences compared
8
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
9
+ #
10
+ # Licensed under the Apache License, Version 2.0 (the "License");
11
+ # you may not use this file except in compliance with the License.
12
+ # You may obtain a copy of the License at
13
+ #
14
+ # http://www.apache.org/licenses/LICENSE-2.0
15
+ #
16
+ # Unless required by applicable law or agreed to in writing, software
17
+ # distributed under the License is distributed on an "AS IS" BASIS,
18
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19
+ # See the License for the specific language governing permissions and
20
+ # limitations under the License.
21
+ """Image processor class for HuluMed."""
22
+
23
+ import math
24
+ from typing import Dict, List, Optional, Union
25
+
26
+ import numpy as np
27
+
28
+ import torch
29
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
30
+ from transformers.image_utils import ImageInput
31
+ from transformers.image_transforms import (
32
+ convert_to_rgb,
33
+ resize,
34
+ to_channel_dimension_format,
35
+ )
36
+ from transformers.image_utils import (
37
+ OPENAI_CLIP_MEAN,
38
+ OPENAI_CLIP_STD,
39
+ ChannelDimension,
40
+ ImageInput,
41
+ PILImageResampling,
42
+ get_image_size,
43
+ infer_channel_dimension_format,
44
+ is_scaled_image,
45
+ is_valid_image,
46
+ make_list_of_images,
47
+ to_numpy_array,
48
+ )
49
+ try:
50
+ from transformers.video_utils import VideoInput
51
+ except ImportError:
52
+ from transformers.image_utils import VideoInput
53
+
54
+ from transformers.utils import TensorType, is_vision_available, logging
55
+
56
+
57
+ logger = logging.get_logger(__name__)
58
+
59
+
60
+ if is_vision_available():
61
+ from PIL import Image
62
+
63
+
64
+ def is_valid_video(video) -> bool:
65
+ if isinstance(video, (list, tuple)):
66
+ return all(is_valid_image(frame) for frame in video)
67
+ elif isinstance(video, np.ndarray):
68
+ return video.ndim == 4
69
+ elif isinstance(video, torch.Tensor):
70
+ return video.ndim == 4
71
+ return False
72
+
73
+
74
+ def make_batched_images(images) -> List[List[ImageInput]]:
75
+ """
76
+ Accepts images in list or nested list format, and makes a list of images for preprocessing.
77
+
78
+ Args:
79
+ images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
80
+ The input image.
81
+
82
+ Returns:
83
+ list: A list of images.
84
+ """
85
+ if isinstance(images, (list, tuple)):
86
+ # list of images/videos
87
+ if not all(is_valid_video(image) or is_valid_image(image) for image in images):
88
+ raise ValueError(f"Could not make batched images from {images}")
89
+ return images
90
+ elif is_valid_video(images) or is_valid_image(images):
91
+ # single image/video
92
+ return [images]
93
+
94
+ raise ValueError(f"Could not make batched images from {images}")
95
+
96
+
97
+ def simple_batched_resize(
98
+ images, factor: int = 28, min_tokens: int = 4 * 4, max_tokens: int = 16384, input_data_format: str = None
99
+ ):
100
+ min_pixels = min_tokens * factor * factor
101
+ max_pixels = max_tokens * factor * factor
102
+
103
+ num_images = 0
104
+ for image in images:
105
+ if is_valid_video(image):
106
+ num_images += len(image)
107
+ else:
108
+ num_images += 1
109
+
110
+ image_sizes = []
111
+ for image in images:
112
+ if is_valid_video(image):
113
+ image = image[0]
114
+ if isinstance(image, Image.Image):
115
+ height, width = image.size
116
+ else:
117
+ height, width = get_image_size(image, channel_dim=input_data_format)
118
+ image_sizes.append([height, width])
119
+
120
+ tmp_image_sizes = []
121
+ for height, width in image_sizes:
122
+ h_bar = round(height / factor) * factor
123
+ w_bar = round(width / factor) * factor
124
+ if h_bar * w_bar > (max_pixels // num_images):
125
+ beta = math.sqrt((height * width) / (max_pixels // num_images))
126
+ h_bar = math.floor(height / beta / factor) * factor
127
+ w_bar = math.floor(width / beta / factor) * factor
128
+ # per image min_pixels
129
+ if h_bar * w_bar < min_pixels:
130
+ beta = math.sqrt(min_pixels / (height * width))
131
+ h_bar = math.ceil(height * beta / factor) * factor
132
+ w_bar = math.ceil(width * beta / factor) * factor
133
+ tmp_image_sizes.append((h_bar, w_bar))
134
+ image_sizes = tmp_image_sizes
135
+ return image_sizes
136
+
137
+
138
+ def batched_resize(
139
+ images, factors: List[int], min_tokens: int = 4 * 4, max_tokens: int = 16384, input_data_format: str = None
140
+ ):
141
+ image_sizes = []
142
+ for image in images:
143
+ if is_valid_video(image):
144
+ num_frame = len(image)
145
+ image = image[0]
146
+ else:
147
+ num_frame = 1
148
+ if isinstance(image, Image.Image):
149
+ height, width = image.size
150
+ else:
151
+ height, width = get_image_size(image, channel_dim=input_data_format)
152
+ image_sizes.append([num_frame, height, width])
153
+
154
+ # global max_pixels
155
+ smart_scale_factors = 1.0
156
+ total_tokens = 0
157
+ for (num_frame, height, width), factor in zip(image_sizes, factors):
158
+ total_tokens += num_frame * math.ceil(height / factor) * math.ceil(width / factor)
159
+
160
+ # TODO: add min_pixels
161
+ if total_tokens > max_tokens:
162
+ beta = math.sqrt(total_tokens / max_tokens)
163
+ tmp_image_sizes = []
164
+ for (_, height, width), factor in zip(image_sizes, factors):
165
+ h_bar = math.floor(height / beta / factor) * factor
166
+ w_bar = math.floor(width / beta / factor) * factor
167
+ tmp_image_sizes.append((h_bar, w_bar))
168
+ image_sizes = tmp_image_sizes
169
+ else:
170
+ tmp_image_sizes = []
171
+ for (_, height, width), factor in zip(image_sizes, factors):
172
+ height = round(height / factor) * factor
173
+ width = round(width / factor) * factor
174
+ tmp_image_sizes.append((height, width))
175
+ image_sizes = tmp_image_sizes
176
+
177
+ return image_sizes
178
+
179
+
180
+ class HulumedImageProcessor(BaseImageProcessor):
181
+ r"""
182
+ Constructs a HuluMed image processor that dynamically resizes images based on the original images.
183
+
184
+ Args:
185
+ do_resize (`bool`, *optional*, defaults to `True`):
186
+ Whether to resize the image's (height, width) dimensions.
187
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
188
+ Resampling filter to use when resizing the image.
189
+ do_rescale (`bool`, *optional*, defaults to `True`):
190
+ Whether to rescale the image by the specified scale `rescale_factor`.
191
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
192
+ Scale factor to use if rescaling the image.
193
+ do_normalize (`bool`, *optional*, defaults to `True`):
194
+ Whether to normalize the image.
195
+ image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
196
+ Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
197
+ image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
198
+ Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
199
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
200
+ Whether to convert the image to RGB.
201
+ min_pixels (`int`, *optional*, defaults to `56 * 56`):
202
+ The min pixels of the image to resize the image.
203
+ max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
204
+ The max pixels of the image to resize the image.
205
+ patch_size (`int`, *optional*, defaults to 14):
206
+ The spatial patch size of the vision encoder.
207
+ merge_size (`int`, *optional*, defaults to `None`):
208
+ The default merge size for processing. If None, no default merge size is applied.
209
+ """
210
+
211
+ model_input_names = ["pixel_values", "grid_sizes", "merge_sizes"]
212
+
213
+ def __init__(
214
+ self,
215
+ do_resize: bool = True,
216
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
217
+ do_rescale: bool = True,
218
+ rescale_factor: Union[int, float] = 1 / 255,
219
+ do_normalize: bool = True,
220
+ image_mean: Optional[Union[float, List[float]]] = None,
221
+ image_std: Optional[Union[float, List[float]]] = None,
222
+ do_convert_rgb: bool = True,
223
+ min_tokens: int = 4 * 4,
224
+ max_tokens: int = 16384,
225
+ patch_size: int = 14,
226
+ merge_size: Optional[int] = None,
227
+ **kwargs,
228
+ ) -> None:
229
+ super().__init__(**kwargs)
230
+ self.do_resize = do_resize
231
+ self.resample = resample
232
+ self.do_rescale = do_rescale
233
+ self.rescale_factor = rescale_factor
234
+ self.do_normalize = do_normalize
235
+ self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
236
+ self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
237
+ self.min_tokens = min_tokens
238
+ self.max_tokens = max_tokens
239
+ self.patch_size = patch_size
240
+ self.do_convert_rgb = do_convert_rgb
241
+ self.merge_size = merge_size
242
+
243
+ def _preprocess(
244
+ self,
245
+ images: Union[ImageInput, VideoInput],
246
+ target_size: List[int],
247
+ merge_size: int = 1,
248
+ do_resize: bool = None,
249
+ resample: PILImageResampling = None,
250
+ do_rescale: bool = None,
251
+ rescale_factor: float = None,
252
+ do_normalize: bool = None,
253
+ image_mean: Optional[Union[float, List[float]]] = None,
254
+ image_std: Optional[Union[float, List[float]]] = None,
255
+ do_convert_rgb: bool = None,
256
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
257
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
258
+ ):
259
+ """
260
+ Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
261
+
262
+ Args:
263
+ images (`ImageInput`):
264
+ Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
265
+ target_size (`List[int]`):
266
+ The target size to resize the image to. Should be a list of two integers: [target_height, target_width].
267
+ merge_size (`int`, *optional*, defaults to `1`):
268
+ The merge size after the vision encoder.
269
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
270
+ Whether to resize the image.
271
+ resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
272
+ Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
273
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
274
+ Whether to rescale the image.
275
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
276
+ Scale factor to use if rescaling the image.
277
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
278
+ Whether to normalize the image.
279
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
280
+ Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
281
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
282
+ Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
283
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
284
+ Whether to convert the image to RGB.
285
+ data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
286
+ The channel dimension format for the output image. Can be one of:
287
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
288
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
289
+ - Unset: Use the channel dimension format of the input image.
290
+ input_data_format (`ChannelDimension` or `str`, *optional*):
291
+ The channel dimension format for the input image. Can be one of:
292
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
293
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
294
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
295
+ """
296
+ images = make_list_of_images(images)
297
+
298
+ if do_convert_rgb:
299
+ images = [convert_to_rgb(image) for image in images]
300
+
301
+ # All transformations expect numpy arrays.
302
+ images = [to_numpy_array(image) for image in images]
303
+
304
+ if is_scaled_image(images[0]) and do_rescale:
305
+ logger.warning_once(
306
+ "It looks like you are trying to rescale already rescaled images. If the input"
307
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
308
+ )
309
+ if input_data_format is None:
310
+ # We assume that all images have the same channel dimension format.
311
+ input_data_format = infer_channel_dimension_format(images[0])
312
+
313
+ height, width = get_image_size(images[0], channel_dim=input_data_format)
314
+ resized_height, resized_width = height, width
315
+ processed_images = []
316
+ for image in images:
317
+ if do_resize:
318
+ resized_height, resized_width = target_size
319
+ image = resize(
320
+ image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format
321
+ )
322
+
323
+ if do_rescale:
324
+ image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)
325
+
326
+ if do_normalize:
327
+ image = self.normalize(
328
+ image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
329
+ )
330
+
331
+ image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
332
+ processed_images.append(image)
333
+
334
+ patches = np.array(processed_images)
335
+ if data_format == ChannelDimension.LAST:
336
+ patches = patches.transpose(0, 3, 1, 2)
337
+ t = patches.shape[0]
338
+ channel = patches.shape[1]
339
+ grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
340
+ patches = patches.reshape(
341
+ t,
342
+ channel,
343
+ grid_h // merge_size,
344
+ merge_size,
345
+ self.patch_size,
346
+ grid_w // merge_size,
347
+ merge_size,
348
+ self.patch_size,
349
+ )
350
+ patches = patches.transpose(0, 2, 5, 3, 6, 1, 4, 7)
351
+ flatten_patches = patches.reshape(
352
+ t * grid_h * grid_w, channel * self.patch_size * self.patch_size
353
+ )
354
+
355
+ return flatten_patches, (t, grid_h, grid_w)
356
+
357
+ def preprocess(
358
+ self,
359
+ images: ImageInput,
360
+ do_resize: bool = None,
361
+ resample: PILImageResampling = None,
362
+ do_rescale: bool = None,
363
+ rescale_factor: float = None,
364
+ do_normalize: bool = None,
365
+ image_mean: Optional[Union[float, List[float]]] = None,
366
+ image_std: Optional[Union[float, List[float]]] = None,
367
+ do_convert_rgb: bool = None,
368
+ merge_size: Optional[Union[int, List[int]]] = None,
369
+ return_tensors: Optional[Union[str, TensorType]] = None,
370
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
371
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
372
+ ):
373
+ """
374
+ Args:
375
+ images (`ImageInput`):
376
+ Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
377
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
378
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
379
+ Whether to resize the image.
380
+ resample (`int`, *optional*, defaults to `self.resample`):
381
+ Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
382
+ has an effect if `do_resize` is set to `True`.
383
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
384
+ Whether to rescale the image.
385
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
386
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
387
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+     Whether to normalize the image.
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+     Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+     Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
+     `True`.
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+     Whether to convert the image to RGB.
+ merge_size (`int` or `List[int]`, *optional*, defaults to `self.merge_size`):
+     The merge size for processing. Can be a single value or a list of values (one per image).
+ return_tensors (`str` or `TensorType`, *optional*):
+     The type of tensors to return. Can be one of:
+     - Unset: Return a list of `np.ndarray`.
+     - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+     - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+     - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+     - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+     The channel dimension format for the output image. Can be one of:
+     - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+     - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+     - Unset: Use the channel dimension format of the input image.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+     The channel dimension format for the input image. If unset, the channel dimension format is inferred
+     from the input image. Can be one of:
+     - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+     - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+     - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+ """
+ do_resize = do_resize if do_resize is not None else self.do_resize
+ resample = resample if resample is not None else self.resample
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+ image_mean = image_mean if image_mean is not None else self.image_mean
+ image_std = image_std if image_std is not None else self.image_std
+ do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+
+ # Handle merge_size: use provided value, or fall back to instance default, or use 1
+ if merge_size is None:
+     merge_size = self.merge_size if self.merge_size is not None else 1
+
+ images = make_batched_images(images)
+
+ if isinstance(merge_size, (list, tuple)):
+     assert len(merge_size) == len(images), "Merge size must be the same length as images."
+     merge_sizes = merge_size
+ else:
+     merge_sizes = [merge_size for _ in images]
+ if all(merge_size == merge_sizes[0] for merge_size in merge_sizes):
+     target_sizes = simple_batched_resize(
+         images,
+         factor=self.patch_size * merge_sizes[0],
+         min_tokens=self.min_tokens,
+         max_tokens=self.max_tokens,
+         input_data_format=input_data_format,
+     )
+ else:
+     target_sizes = batched_resize(
+         images,
+         factors=[self.patch_size * merge_size for merge_size in merge_sizes],
+         min_tokens=self.min_tokens,
+         max_tokens=self.max_tokens,
+         input_data_format=input_data_format,
+     )
+
+ pixel_values, grid_sizes = [], []
+ for image, merge_size, target_size in zip(images, merge_sizes, target_sizes):
+     patches, grid_size = self._preprocess(
+         image,
+         target_size=target_size,
+         merge_size=merge_size,
+         do_resize=do_resize,
+         resample=resample,
+         do_rescale=do_rescale,
+         rescale_factor=rescale_factor,
+         do_normalize=do_normalize,
+         image_mean=image_mean,
+         image_std=image_std,
+         data_format=data_format,
+         do_convert_rgb=do_convert_rgb,
+         input_data_format=input_data_format,
+     )
+     pixel_values.append(patches)
+     grid_sizes.append(grid_size)
+
+ pixel_values = np.concatenate(pixel_values, axis=0)
+ grid_sizes = np.array(grid_sizes)
+ merge_sizes = np.array(merge_sizes)
+
+ data = {
+     "pixel_values": pixel_values,
+     "grid_sizes": grid_sizes,
+     "merge_sizes": merge_sizes,
+ }
+
+ return BatchFeature(data=data, tensor_type=return_tensors)
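Note on the block above: `preprocess` concatenates every image's patches into one flat `pixel_values` array and keeps one `grid_sizes` row per image, which is how variable-resolution images end up sharing a single batch. The snippet below is a minimal sketch of that bookkeeping, not code from this upload; the assumption that the product of a `grid_sizes` row equals that image's patch count is inferred from the concatenation above.

```python
# Hypothetical sketch (not from this repo): split the flat pixel_values array
# produced by `preprocess` back into per-image patch groups using grid_sizes.
# Assumption: each grid_sizes row multiplies out to that image's patch count.
import numpy as np

def split_pixel_values(pixel_values: np.ndarray, grid_sizes: np.ndarray) -> list[np.ndarray]:
    """Split the concatenated patch array back into one array per image."""
    counts = [int(np.prod(grid)) for grid in grid_sizes]  # patches contributed by each image
    offsets = np.cumsum(counts)[:-1]                      # split points between images
    return np.split(pixel_values, offsets, axis=0)

# Toy example with two fake images: a 2x3 and a 4x4 patch grid, 8-dim patch vectors.
grid_sizes = np.array([[2, 3], [4, 4]])
pixel_values = np.random.rand(2 * 3 + 4 * 4, 8).astype(np.float32)

chunks = split_pixel_values(pixel_values, grid_sizes)
print([c.shape for c in chunks])  # [(6, 8), (16, 8)]
```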
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:360c4f7335d9cf1edd72f3f4eb0f95dcccbadcfb2bf964ac2c60e7aae39a93b8
+ size 5343777696
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6af174d7d5af3d13b926f8112871892b68e6a44710b74c94b4fa799227219e6f
+ size 5263077248
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f8d08aa83687fc5365c09223d42a1cb4059977aecb4d373a62852688652806b9
+ size 4392737312
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:95316e809197f95a3543696ae49a8ab3894c9d7eae20d8c0168406a10edbb63e
+ size 1089994848
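The four `.safetensors` entries above are Git LFS pointer files (a version line, the blob's `oid sha256:`, and its byte `size`); the actual shards live in LFS storage. The `model.safetensors.index.json` added next maps every parameter name to the shard that stores it. As an illustration only (the directory layout and this use of the `safetensors` API are assumptions about a typical sharded checkpoint, not something verified against this upload), a single tensor can be resolved through that index like so:

```python
# Hypothetical sketch: look up one tensor in a sharded safetensors checkpoint
# via the weight_map in model.safetensors.index.json.
# Assumes the LFS shards have already been pulled into `checkpoint_dir`.
import json
from pathlib import Path

from safetensors import safe_open  # pip install safetensors

def load_tensor(checkpoint_dir: str, name: str):
    index_path = Path(checkpoint_dir) / "model.safetensors.index.json"
    index = json.loads(index_path.read_text())
    shard_file = index["weight_map"][name]  # e.g. "model-00001-of-00004.safetensors"
    with safe_open(str(Path(checkpoint_dir) / shard_file), framework="pt") as f:
        return f.get_tensor(name)

# Example (requires the files locally; path and tensor name are illustrative):
# w = load_tensor("./HuLuLLM", "model.embed_tokens.weight")
# print(w.shape)
```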
model.safetensors.index.json ADDED
@@ -0,0 +1,786 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 16089489888
4
+ },
5
+ "weight_map": {
6
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
7
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
8
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
18
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
19
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
20
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
26
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
27
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
28
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
29
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
30
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
31
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
32
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
33
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
34
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
35
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
36
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
37
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
38
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
39
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
40
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
41
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
42
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
43
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
44
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
45
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
46
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
47
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
48
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
49
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
50
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
51
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
52
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
53
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
54
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
55
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
56
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
57
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
58
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
59
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
60
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
61
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
62
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
63
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
64
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
65
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
66
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
67
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
68
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
69
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
70
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
71
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
72
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
73
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
74
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
75
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
76
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
77
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
78
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
79
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
80
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
81
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
82
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
83
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
84
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
85
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
86
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
87
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
88
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
89
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
90
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
91
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
92
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
93
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
94
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
95
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
96
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
97
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
98
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
99
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
100
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
101
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
102
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
103
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
104
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
105
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
106
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
107
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
108
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
109
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
110
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
111
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
112
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
113
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
114
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
115
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
116
+ "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
117
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
118
+ "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
119
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
120
+ "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
121
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
122
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
123
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
124
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
125
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
126
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
127
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
128
+ "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
129
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
130
+ "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
131
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
132
+ "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
133
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
134
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
135
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
136
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
137
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
138
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
139
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
140
+ "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
141
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
142
+ "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
143
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
144
+ "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
145
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
146
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
147
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
148
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
149
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
150
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
151
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
152
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
153
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
154
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
155
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
156
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
157
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
158
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
159
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
160
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
161
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
162
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
163
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
164
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
165
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
166
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
167
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
168
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
169
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
170
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
171
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
172
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
173
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
174
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
175
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
176
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
177
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
178
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
179
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
180
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
181
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
182
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
183
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
184
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
185
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
186
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
187
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
188
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
189
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
190
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
191
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
192
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
193
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
194
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
195
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
196
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
197
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
198
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
199
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
200
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
201
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
202
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
203
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
204
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
205
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
206
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
207
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
208
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
209
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
210
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
211
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
212
+ "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
213
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
214
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
215
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
216
+ "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
217
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
218
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
219
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
220
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
221
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
222
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
223
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
224
+ "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
225
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
226
+ "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
227
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
228
+ "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
229
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
230
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
231
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
232
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
233
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
234
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
235
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
236
+ "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
237
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
238
+ "model.layers.19.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
239
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
240
+ "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
241
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
242
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
243
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
244
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
245
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
246
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
247
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
248
+ "model.layers.20.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
249
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
250
+ "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
251
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
252
+ "model.layers.20.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
253
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
254
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
255
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
256
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
257
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
258
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
259
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
260
+ "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
261
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
262
+ "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
263
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
264
+ "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
265
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
266
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
267
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
268
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
269
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
270
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
271
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
272
+ "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
273
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
274
+ "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
275
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
276
+ "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
277
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
278
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
279
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
280
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
281
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
282
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
283
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
284
+ "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
285
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
286
+ "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
287
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
288
+ "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
289
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
290
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
291
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
292
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
293
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
294
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
295
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
296
+ "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
297
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
298
+ "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
299
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
300
+ "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
301
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
302
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
303
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
304
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
305
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
306
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
307
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
308
+ "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
309
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
310
+ "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
311
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
312
+ "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
313
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
314
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
315
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
316
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
317
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
318
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
319
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
320
+ "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
321
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
322
+ "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
323
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
324
+ "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
325
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
326
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
327
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
328
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
329
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
330
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
331
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
332
+ "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
333
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
334
+ "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
335
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
336
+ "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
337
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
338
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
339
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
340
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
341
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
342
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
343
+ "model.norm.weight": "model-00003-of-00004.safetensors",
344
+ "model.vision_encoder.embeddings.patch_embedding.weight": "model-00003-of-00004.safetensors",
345
+ "model.vision_encoder.embeddings.patch_embedding.bias": "model-00003-of-00004.safetensors",
346
+ "model.vision_encoder.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
347
+ "model.vision_encoder.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
348
+ "model.vision_encoder.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
349
+ "model.vision_encoder.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
350
+ "model.vision_encoder.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
351
+ "model.vision_encoder.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
352
+ "model.vision_encoder.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
353
+ "model.vision_encoder.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
354
+ "model.vision_encoder.encoder.layers.0.layer_norm1.weight": "model-00003-of-00004.safetensors",
355
+ "model.vision_encoder.encoder.layers.0.layer_norm1.bias": "model-00003-of-00004.safetensors",
356
+ "model.vision_encoder.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00004.safetensors",
357
+ "model.vision_encoder.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00004.safetensors",
358
+ "model.vision_encoder.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00004.safetensors",
359
+ "model.vision_encoder.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00004.safetensors",
360
+ "model.vision_encoder.encoder.layers.0.layer_norm2.weight": "model-00003-of-00004.safetensors",
361
+ "model.vision_encoder.encoder.layers.0.layer_norm2.bias": "model-00003-of-00004.safetensors",
362
+ "model.vision_encoder.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
363
+ "model.vision_encoder.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
364
+ "model.vision_encoder.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
365
+ "model.vision_encoder.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
366
+ "model.vision_encoder.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
367
+ "model.vision_encoder.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
368
+ "model.vision_encoder.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
369
+ "model.vision_encoder.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
370
+ "model.vision_encoder.encoder.layers.1.layer_norm1.weight": "model-00003-of-00004.safetensors",
371
+ "model.vision_encoder.encoder.layers.1.layer_norm1.bias": "model-00003-of-00004.safetensors",
372
+ "model.vision_encoder.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00004.safetensors",
373
+ "model.vision_encoder.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00004.safetensors",
374
+ "model.vision_encoder.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00004.safetensors",
375
+ "model.vision_encoder.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00004.safetensors",
376
+ "model.vision_encoder.encoder.layers.1.layer_norm2.weight": "model-00003-of-00004.safetensors",
377
+ "model.vision_encoder.encoder.layers.1.layer_norm2.bias": "model-00003-of-00004.safetensors",
378
+ "model.vision_encoder.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
379
+ "model.vision_encoder.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
380
+ "model.vision_encoder.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
381
+ "model.vision_encoder.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
382
+ "model.vision_encoder.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
383
+ "model.vision_encoder.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
384
+ "model.vision_encoder.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
385
+ "model.vision_encoder.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
386
+ "model.vision_encoder.encoder.layers.2.layer_norm1.weight": "model-00003-of-00004.safetensors",
387
+ "model.vision_encoder.encoder.layers.2.layer_norm1.bias": "model-00003-of-00004.safetensors",
388
+ "model.vision_encoder.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00004.safetensors",
389
+ "model.vision_encoder.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00004.safetensors",
390
+ "model.vision_encoder.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00004.safetensors",
391
+ "model.vision_encoder.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00004.safetensors",
392
+ "model.vision_encoder.encoder.layers.2.layer_norm2.weight": "model-00003-of-00004.safetensors",
393
+ "model.vision_encoder.encoder.layers.2.layer_norm2.bias": "model-00003-of-00004.safetensors",
394
+ "model.vision_encoder.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
395
+ "model.vision_encoder.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
396
+ "model.vision_encoder.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
397
+ "model.vision_encoder.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
398
+ "model.vision_encoder.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
399
+ "model.vision_encoder.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
400
+ "model.vision_encoder.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
401
+ "model.vision_encoder.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
402
+ "model.vision_encoder.encoder.layers.3.layer_norm1.weight": "model-00003-of-00004.safetensors",
403
+ "model.vision_encoder.encoder.layers.3.layer_norm1.bias": "model-00003-of-00004.safetensors",
404
+ "model.vision_encoder.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00004.safetensors",
405
+ "model.vision_encoder.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00004.safetensors",
406
+ "model.vision_encoder.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00004.safetensors",
407
+ "model.vision_encoder.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00004.safetensors",
408
+ "model.vision_encoder.encoder.layers.3.layer_norm2.weight": "model-00003-of-00004.safetensors",
409
+ "model.vision_encoder.encoder.layers.3.layer_norm2.bias": "model-00003-of-00004.safetensors",
410
+ "model.vision_encoder.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
411
+ "model.vision_encoder.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
412
+ "model.vision_encoder.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
413
+ "model.vision_encoder.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
414
+ "model.vision_encoder.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
415
+ "model.vision_encoder.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
416
+ "model.vision_encoder.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
417
+ "model.vision_encoder.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
418
+ "model.vision_encoder.encoder.layers.4.layer_norm1.weight": "model-00003-of-00004.safetensors",
419
+ "model.vision_encoder.encoder.layers.4.layer_norm1.bias": "model-00003-of-00004.safetensors",
420
+ "model.vision_encoder.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00004.safetensors",
421
+ "model.vision_encoder.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00004.safetensors",
422
+ "model.vision_encoder.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00004.safetensors",
423
+ "model.vision_encoder.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00004.safetensors",
424
+ "model.vision_encoder.encoder.layers.4.layer_norm2.weight": "model-00003-of-00004.safetensors",
425
+ "model.vision_encoder.encoder.layers.4.layer_norm2.bias": "model-00003-of-00004.safetensors",
426
+ "model.vision_encoder.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
427
+ "model.vision_encoder.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
428
+ "model.vision_encoder.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
429
+ "model.vision_encoder.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
430
+ "model.vision_encoder.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
431
+ "model.vision_encoder.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
432
+ "model.vision_encoder.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
433
+ "model.vision_encoder.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
434
+ "model.vision_encoder.encoder.layers.5.layer_norm1.weight": "model-00003-of-00004.safetensors",
435
+ "model.vision_encoder.encoder.layers.5.layer_norm1.bias": "model-00003-of-00004.safetensors",
436
+ "model.vision_encoder.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00004.safetensors",
437
+ "model.vision_encoder.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00004.safetensors",
438
+ "model.vision_encoder.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00004.safetensors",
439
+ "model.vision_encoder.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00004.safetensors",
440
+ "model.vision_encoder.encoder.layers.5.layer_norm2.weight": "model-00003-of-00004.safetensors",
441
+ "model.vision_encoder.encoder.layers.5.layer_norm2.bias": "model-00003-of-00004.safetensors",
442
+ "model.vision_encoder.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
443
+ "model.vision_encoder.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
444
+ "model.vision_encoder.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
445
+ "model.vision_encoder.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
446
+ "model.vision_encoder.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
447
+ "model.vision_encoder.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
448
+ "model.vision_encoder.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
449
+ "model.vision_encoder.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
450
+ "model.vision_encoder.encoder.layers.6.layer_norm1.weight": "model-00003-of-00004.safetensors",
451
+ "model.vision_encoder.encoder.layers.6.layer_norm1.bias": "model-00003-of-00004.safetensors",
452
+ "model.vision_encoder.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00004.safetensors",
453
+ "model.vision_encoder.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00004.safetensors",
454
+ "model.vision_encoder.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00004.safetensors",
455
+ "model.vision_encoder.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00004.safetensors",
456
+ "model.vision_encoder.encoder.layers.6.layer_norm2.weight": "model-00003-of-00004.safetensors",
457
+ "model.vision_encoder.encoder.layers.6.layer_norm2.bias": "model-00003-of-00004.safetensors",
458
+ "model.vision_encoder.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
459
+ "model.vision_encoder.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
460
+ "model.vision_encoder.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
461
+ "model.vision_encoder.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
462
+ "model.vision_encoder.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
463
+ "model.vision_encoder.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
464
+ "model.vision_encoder.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
465
+ "model.vision_encoder.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
466
+ "model.vision_encoder.encoder.layers.7.layer_norm1.weight": "model-00003-of-00004.safetensors",
467
+ "model.vision_encoder.encoder.layers.7.layer_norm1.bias": "model-00003-of-00004.safetensors",
468
+ "model.vision_encoder.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00004.safetensors",
469
+ "model.vision_encoder.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00004.safetensors",
470
+ "model.vision_encoder.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00004.safetensors",
471
+ "model.vision_encoder.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00004.safetensors",
472
+ "model.vision_encoder.encoder.layers.7.layer_norm2.weight": "model-00003-of-00004.safetensors",
473
+ "model.vision_encoder.encoder.layers.7.layer_norm2.bias": "model-00003-of-00004.safetensors",
474
+ "model.vision_encoder.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
475
+ "model.vision_encoder.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
476
+ "model.vision_encoder.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
477
+ "model.vision_encoder.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
478
+ "model.vision_encoder.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
479
+ "model.vision_encoder.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
480
+ "model.vision_encoder.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
481
+ "model.vision_encoder.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
482
+ "model.vision_encoder.encoder.layers.8.layer_norm1.weight": "model-00003-of-00004.safetensors",
483
+ "model.vision_encoder.encoder.layers.8.layer_norm1.bias": "model-00003-of-00004.safetensors",
484
+ "model.vision_encoder.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00004.safetensors",
485
+ "model.vision_encoder.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00004.safetensors",
486
+ "model.vision_encoder.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00004.safetensors",
487
+ "model.vision_encoder.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00004.safetensors",
488
+ "model.vision_encoder.encoder.layers.8.layer_norm2.weight": "model-00003-of-00004.safetensors",
489
+ "model.vision_encoder.encoder.layers.8.layer_norm2.bias": "model-00003-of-00004.safetensors",
490
+ "model.vision_encoder.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
491
+ "model.vision_encoder.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
492
+ "model.vision_encoder.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
493
+ "model.vision_encoder.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
494
+ "model.vision_encoder.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
495
+ "model.vision_encoder.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
496
+ "model.vision_encoder.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
497
+ "model.vision_encoder.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
498
+ "model.vision_encoder.encoder.layers.9.layer_norm1.weight": "model-00003-of-00004.safetensors",
499
+ "model.vision_encoder.encoder.layers.9.layer_norm1.bias": "model-00003-of-00004.safetensors",
500
+ "model.vision_encoder.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00004.safetensors",
501
+ "model.vision_encoder.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00004.safetensors",
502
+ "model.vision_encoder.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00004.safetensors",
503
+ "model.vision_encoder.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00004.safetensors",
504
+ "model.vision_encoder.encoder.layers.9.layer_norm2.weight": "model-00003-of-00004.safetensors",
505
+ "model.vision_encoder.encoder.layers.9.layer_norm2.bias": "model-00003-of-00004.safetensors",
506
+ "model.vision_encoder.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
507
+ "model.vision_encoder.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
508
+ "model.vision_encoder.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
509
+ "model.vision_encoder.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
510
+ "model.vision_encoder.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
511
+ "model.vision_encoder.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
512
+ "model.vision_encoder.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
513
+ "model.vision_encoder.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
514
+ "model.vision_encoder.encoder.layers.10.layer_norm1.weight": "model-00003-of-00004.safetensors",
515
+ "model.vision_encoder.encoder.layers.10.layer_norm1.bias": "model-00003-of-00004.safetensors",
516
+ "model.vision_encoder.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00004.safetensors",
517
+ "model.vision_encoder.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00004.safetensors",
518
+ "model.vision_encoder.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00004.safetensors",
519
+ "model.vision_encoder.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00004.safetensors",
520
+ "model.vision_encoder.encoder.layers.10.layer_norm2.weight": "model-00003-of-00004.safetensors",
521
+ "model.vision_encoder.encoder.layers.10.layer_norm2.bias": "model-00003-of-00004.safetensors",
522
+ "model.vision_encoder.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
523
+ "model.vision_encoder.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
524
+ "model.vision_encoder.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
525
+ "model.vision_encoder.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
526
+ "model.vision_encoder.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
527
+ "model.vision_encoder.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
528
+ "model.vision_encoder.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
529
+ "model.vision_encoder.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
530
+ "model.vision_encoder.encoder.layers.11.layer_norm1.weight": "model-00003-of-00004.safetensors",
531
+ "model.vision_encoder.encoder.layers.11.layer_norm1.bias": "model-00003-of-00004.safetensors",
532
+ "model.vision_encoder.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00004.safetensors",
533
+ "model.vision_encoder.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00004.safetensors",
534
+ "model.vision_encoder.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00004.safetensors",
535
+ "model.vision_encoder.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00004.safetensors",
536
+ "model.vision_encoder.encoder.layers.11.layer_norm2.weight": "model-00003-of-00004.safetensors",
537
+ "model.vision_encoder.encoder.layers.11.layer_norm2.bias": "model-00003-of-00004.safetensors",
538
+ "model.vision_encoder.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
539
+ "model.vision_encoder.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
540
+ "model.vision_encoder.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
541
+ "model.vision_encoder.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
542
+ "model.vision_encoder.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
543
+ "model.vision_encoder.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
544
+ "model.vision_encoder.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
545
+ "model.vision_encoder.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
546
+ "model.vision_encoder.encoder.layers.12.layer_norm1.weight": "model-00003-of-00004.safetensors",
547
+ "model.vision_encoder.encoder.layers.12.layer_norm1.bias": "model-00003-of-00004.safetensors",
548
+ "model.vision_encoder.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00004.safetensors",
549
+ "model.vision_encoder.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00004.safetensors",
550
+ "model.vision_encoder.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00004.safetensors",
551
+ "model.vision_encoder.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00004.safetensors",
552
+ "model.vision_encoder.encoder.layers.12.layer_norm2.weight": "model-00003-of-00004.safetensors",
553
+ "model.vision_encoder.encoder.layers.12.layer_norm2.bias": "model-00003-of-00004.safetensors",
554
+ "model.vision_encoder.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
555
+ "model.vision_encoder.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
556
+ "model.vision_encoder.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
557
+ "model.vision_encoder.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
558
+ "model.vision_encoder.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
559
+ "model.vision_encoder.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
560
+ "model.vision_encoder.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
561
+ "model.vision_encoder.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
562
+ "model.vision_encoder.encoder.layers.13.layer_norm1.weight": "model-00003-of-00004.safetensors",
563
+ "model.vision_encoder.encoder.layers.13.layer_norm1.bias": "model-00003-of-00004.safetensors",
564
+ "model.vision_encoder.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00004.safetensors",
565
+ "model.vision_encoder.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00004.safetensors",
566
+ "model.vision_encoder.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00004.safetensors",
567
+ "model.vision_encoder.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00004.safetensors",
568
+ "model.vision_encoder.encoder.layers.13.layer_norm2.weight": "model-00003-of-00004.safetensors",
569
+ "model.vision_encoder.encoder.layers.13.layer_norm2.bias": "model-00003-of-00004.safetensors",
570
+ "model.vision_encoder.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
571
+ "model.vision_encoder.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
572
+ "model.vision_encoder.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
573
+ "model.vision_encoder.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
574
+ "model.vision_encoder.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
575
+ "model.vision_encoder.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
576
+ "model.vision_encoder.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
577
+ "model.vision_encoder.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
578
+ "model.vision_encoder.encoder.layers.14.layer_norm1.weight": "model-00003-of-00004.safetensors",
579
+ "model.vision_encoder.encoder.layers.14.layer_norm1.bias": "model-00003-of-00004.safetensors",
580
+ "model.vision_encoder.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00004.safetensors",
581
+ "model.vision_encoder.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00004.safetensors",
582
+ "model.vision_encoder.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00004.safetensors",
583
+ "model.vision_encoder.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00004.safetensors",
584
+ "model.vision_encoder.encoder.layers.14.layer_norm2.weight": "model-00003-of-00004.safetensors",
585
+ "model.vision_encoder.encoder.layers.14.layer_norm2.bias": "model-00003-of-00004.safetensors",
586
+ "model.vision_encoder.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
587
+ "model.vision_encoder.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
588
+ "model.vision_encoder.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
589
+ "model.vision_encoder.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
590
+ "model.vision_encoder.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
591
+ "model.vision_encoder.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
592
+ "model.vision_encoder.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
593
+ "model.vision_encoder.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
594
+ "model.vision_encoder.encoder.layers.15.layer_norm1.weight": "model-00003-of-00004.safetensors",
595
+ "model.vision_encoder.encoder.layers.15.layer_norm1.bias": "model-00003-of-00004.safetensors",
596
+ "model.vision_encoder.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00004.safetensors",
597
+ "model.vision_encoder.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00004.safetensors",
598
+ "model.vision_encoder.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00004.safetensors",
599
+ "model.vision_encoder.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00004.safetensors",
600
+ "model.vision_encoder.encoder.layers.15.layer_norm2.weight": "model-00003-of-00004.safetensors",
601
+ "model.vision_encoder.encoder.layers.15.layer_norm2.bias": "model-00003-of-00004.safetensors",
602
+ "model.vision_encoder.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
603
+ "model.vision_encoder.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
604
+ "model.vision_encoder.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
605
+ "model.vision_encoder.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
606
+ "model.vision_encoder.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
607
+ "model.vision_encoder.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
608
+ "model.vision_encoder.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
609
+ "model.vision_encoder.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
610
+ "model.vision_encoder.encoder.layers.16.layer_norm1.weight": "model-00003-of-00004.safetensors",
611
+ "model.vision_encoder.encoder.layers.16.layer_norm1.bias": "model-00003-of-00004.safetensors",
612
+ "model.vision_encoder.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00004.safetensors",
613
+ "model.vision_encoder.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00004.safetensors",
614
+ "model.vision_encoder.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00004.safetensors",
615
+ "model.vision_encoder.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00004.safetensors",
616
+ "model.vision_encoder.encoder.layers.16.layer_norm2.weight": "model-00003-of-00004.safetensors",
617
+ "model.vision_encoder.encoder.layers.16.layer_norm2.bias": "model-00003-of-00004.safetensors",
618
+ "model.vision_encoder.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
619
+ "model.vision_encoder.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
620
+ "model.vision_encoder.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
621
+ "model.vision_encoder.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
622
+ "model.vision_encoder.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
623
+ "model.vision_encoder.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
624
+ "model.vision_encoder.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
625
+ "model.vision_encoder.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
626
+ "model.vision_encoder.encoder.layers.17.layer_norm1.weight": "model-00003-of-00004.safetensors",
627
+ "model.vision_encoder.encoder.layers.17.layer_norm1.bias": "model-00003-of-00004.safetensors",
628
+ "model.vision_encoder.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00004.safetensors",
629
+ "model.vision_encoder.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00004.safetensors",
630
+ "model.vision_encoder.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00004.safetensors",
631
+ "model.vision_encoder.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00004.safetensors",
632
+ "model.vision_encoder.encoder.layers.17.layer_norm2.weight": "model-00003-of-00004.safetensors",
633
+ "model.vision_encoder.encoder.layers.17.layer_norm2.bias": "model-00003-of-00004.safetensors",
634
+ "model.vision_encoder.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
635
+ "model.vision_encoder.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
636
+ "model.vision_encoder.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
637
+ "model.vision_encoder.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
638
+ "model.vision_encoder.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
639
+ "model.vision_encoder.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
640
+ "model.vision_encoder.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
641
+ "model.vision_encoder.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
642
+ "model.vision_encoder.encoder.layers.18.layer_norm1.weight": "model-00003-of-00004.safetensors",
643
+ "model.vision_encoder.encoder.layers.18.layer_norm1.bias": "model-00003-of-00004.safetensors",
644
+ "model.vision_encoder.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00004.safetensors",
645
+ "model.vision_encoder.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00004.safetensors",
646
+ "model.vision_encoder.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00004.safetensors",
647
+ "model.vision_encoder.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00004.safetensors",
648
+ "model.vision_encoder.encoder.layers.18.layer_norm2.weight": "model-00003-of-00004.safetensors",
649
+ "model.vision_encoder.encoder.layers.18.layer_norm2.bias": "model-00003-of-00004.safetensors",
650
+ "model.vision_encoder.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
651
+ "model.vision_encoder.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
652
+ "model.vision_encoder.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
653
+ "model.vision_encoder.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
654
+ "model.vision_encoder.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
655
+ "model.vision_encoder.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
656
+ "model.vision_encoder.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
657
+ "model.vision_encoder.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
658
+ "model.vision_encoder.encoder.layers.19.layer_norm1.weight": "model-00003-of-00004.safetensors",
659
+ "model.vision_encoder.encoder.layers.19.layer_norm1.bias": "model-00003-of-00004.safetensors",
660
+ "model.vision_encoder.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00004.safetensors",
661
+ "model.vision_encoder.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00004.safetensors",
662
+ "model.vision_encoder.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00004.safetensors",
663
+ "model.vision_encoder.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00004.safetensors",
664
+ "model.vision_encoder.encoder.layers.19.layer_norm2.weight": "model-00003-of-00004.safetensors",
665
+ "model.vision_encoder.encoder.layers.19.layer_norm2.bias": "model-00003-of-00004.safetensors",
666
+ "model.vision_encoder.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
667
+ "model.vision_encoder.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
668
+ "model.vision_encoder.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
669
+ "model.vision_encoder.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
670
+ "model.vision_encoder.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
671
+ "model.vision_encoder.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
672
+ "model.vision_encoder.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
673
+ "model.vision_encoder.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
674
+ "model.vision_encoder.encoder.layers.20.layer_norm1.weight": "model-00003-of-00004.safetensors",
675
+ "model.vision_encoder.encoder.layers.20.layer_norm1.bias": "model-00003-of-00004.safetensors",
676
+ "model.vision_encoder.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00004.safetensors",
677
+ "model.vision_encoder.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00004.safetensors",
678
+ "model.vision_encoder.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00004.safetensors",
679
+ "model.vision_encoder.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00004.safetensors",
680
+ "model.vision_encoder.encoder.layers.20.layer_norm2.weight": "model-00003-of-00004.safetensors",
681
+ "model.vision_encoder.encoder.layers.20.layer_norm2.bias": "model-00003-of-00004.safetensors",
682
+ "model.vision_encoder.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
683
+ "model.vision_encoder.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
684
+ "model.vision_encoder.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
685
+ "model.vision_encoder.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
686
+ "model.vision_encoder.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
687
+ "model.vision_encoder.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
688
+ "model.vision_encoder.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
689
+ "model.vision_encoder.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
690
+ "model.vision_encoder.encoder.layers.21.layer_norm1.weight": "model-00003-of-00004.safetensors",
691
+ "model.vision_encoder.encoder.layers.21.layer_norm1.bias": "model-00003-of-00004.safetensors",
692
+ "model.vision_encoder.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00004.safetensors",
693
+ "model.vision_encoder.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00004.safetensors",
694
+ "model.vision_encoder.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00004.safetensors",
695
+ "model.vision_encoder.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00004.safetensors",
696
+ "model.vision_encoder.encoder.layers.21.layer_norm2.weight": "model-00003-of-00004.safetensors",
697
+ "model.vision_encoder.encoder.layers.21.layer_norm2.bias": "model-00003-of-00004.safetensors",
698
+ "model.vision_encoder.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
699
+ "model.vision_encoder.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
700
+ "model.vision_encoder.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
701
+ "model.vision_encoder.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
702
+ "model.vision_encoder.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
703
+ "model.vision_encoder.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
704
+ "model.vision_encoder.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
705
+ "model.vision_encoder.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
706
+ "model.vision_encoder.encoder.layers.22.layer_norm1.weight": "model-00003-of-00004.safetensors",
707
+ "model.vision_encoder.encoder.layers.22.layer_norm1.bias": "model-00003-of-00004.safetensors",
708
+ "model.vision_encoder.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00004.safetensors",
709
+ "model.vision_encoder.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00004.safetensors",
710
+ "model.vision_encoder.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00004.safetensors",
711
+ "model.vision_encoder.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00004.safetensors",
712
+ "model.vision_encoder.encoder.layers.22.layer_norm2.weight": "model-00003-of-00004.safetensors",
713
+ "model.vision_encoder.encoder.layers.22.layer_norm2.bias": "model-00003-of-00004.safetensors",
714
+ "model.vision_encoder.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
715
+ "model.vision_encoder.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
716
+ "model.vision_encoder.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
717
+ "model.vision_encoder.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
718
+ "model.vision_encoder.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
719
+ "model.vision_encoder.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
720
+ "model.vision_encoder.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
721
+ "model.vision_encoder.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
722
+ "model.vision_encoder.encoder.layers.23.layer_norm1.weight": "model-00003-of-00004.safetensors",
723
+ "model.vision_encoder.encoder.layers.23.layer_norm1.bias": "model-00003-of-00004.safetensors",
724
+ "model.vision_encoder.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00004.safetensors",
725
+ "model.vision_encoder.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00004.safetensors",
726
+ "model.vision_encoder.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00004.safetensors",
727
+ "model.vision_encoder.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00004.safetensors",
728
+ "model.vision_encoder.encoder.layers.23.layer_norm2.weight": "model-00003-of-00004.safetensors",
729
+ "model.vision_encoder.encoder.layers.23.layer_norm2.bias": "model-00003-of-00004.safetensors",
730
+ "model.vision_encoder.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
731
+ "model.vision_encoder.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
732
+ "model.vision_encoder.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
733
+ "model.vision_encoder.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
734
+ "model.vision_encoder.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
735
+ "model.vision_encoder.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
736
+ "model.vision_encoder.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
737
+ "model.vision_encoder.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
738
+ "model.vision_encoder.encoder.layers.24.layer_norm1.weight": "model-00003-of-00004.safetensors",
739
+ "model.vision_encoder.encoder.layers.24.layer_norm1.bias": "model-00003-of-00004.safetensors",
740
+ "model.vision_encoder.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00004.safetensors",
741
+ "model.vision_encoder.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00004.safetensors",
742
+ "model.vision_encoder.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00004.safetensors",
743
+ "model.vision_encoder.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00004.safetensors",
744
+ "model.vision_encoder.encoder.layers.24.layer_norm2.weight": "model-00003-of-00004.safetensors",
745
+ "model.vision_encoder.encoder.layers.24.layer_norm2.bias": "model-00003-of-00004.safetensors",
746
+ "model.vision_encoder.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
747
+ "model.vision_encoder.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
748
+ "model.vision_encoder.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
749
+ "model.vision_encoder.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
750
+ "model.vision_encoder.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
751
+ "model.vision_encoder.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
752
+ "model.vision_encoder.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
753
+ "model.vision_encoder.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
754
+ "model.vision_encoder.encoder.layers.25.layer_norm1.weight": "model-00003-of-00004.safetensors",
755
+ "model.vision_encoder.encoder.layers.25.layer_norm1.bias": "model-00003-of-00004.safetensors",
756
+ "model.vision_encoder.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00004.safetensors",
757
+ "model.vision_encoder.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00004.safetensors",
758
+ "model.vision_encoder.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00004.safetensors",
759
+ "model.vision_encoder.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00004.safetensors",
760
+ "model.vision_encoder.encoder.layers.25.layer_norm2.weight": "model-00003-of-00004.safetensors",
761
+ "model.vision_encoder.encoder.layers.25.layer_norm2.bias": "model-00003-of-00004.safetensors",
762
+ "model.vision_encoder.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
763
+ "model.vision_encoder.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
764
+ "model.vision_encoder.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
765
+ "model.vision_encoder.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
766
+ "model.vision_encoder.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
767
+ "model.vision_encoder.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
768
+ "model.vision_encoder.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
769
+ "model.vision_encoder.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
770
+ "model.vision_encoder.encoder.layers.26.layer_norm1.weight": "model-00003-of-00004.safetensors",
771
+ "model.vision_encoder.encoder.layers.26.layer_norm1.bias": "model-00003-of-00004.safetensors",
772
+ "model.vision_encoder.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00004.safetensors",
773
+ "model.vision_encoder.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00004.safetensors",
774
+ "model.vision_encoder.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00004.safetensors",
775
+ "model.vision_encoder.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00004.safetensors",
776
+ "model.vision_encoder.encoder.layers.26.layer_norm2.weight": "model-00003-of-00004.safetensors",
777
+ "model.vision_encoder.encoder.layers.26.layer_norm2.bias": "model-00003-of-00004.safetensors",
778
+ "model.vision_encoder.post_layernorm.weight": "model-00003-of-00004.safetensors",
779
+ "model.vision_encoder.post_layernorm.bias": "model-00003-of-00004.safetensors",
780
+ "model.mm_projector.readout.0.weight": "model-00003-of-00004.safetensors",
781
+ "model.mm_projector.readout.0.bias": "model-00003-of-00004.safetensors",
782
+ "model.mm_projector.readout.2.weight": "model-00003-of-00004.safetensors",
783
+ "model.mm_projector.readout.2.bias": "model-00003-of-00004.safetensors",
784
+ "lm_head.weight": "model-00004-of-00004.safetensors"
785
+ }
786
+ }
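
For reference, the weight map above can be used to locate a tensor's shard without materializing the whole checkpoint. Below is a minimal sketch, not part of the upload: the local path is hypothetical, the chosen key is one that appears in the map above, and it assumes the repository files have already been downloaded (e.g. with `huggingface_hub.snapshot_download`).

```python
import json
from safetensors import safe_open

repo_dir = "./HuLuLLM"  # hypothetical local path to the downloaded snapshot

# Read the shard index shown above.
with open(f"{repo_dir}/model.safetensors.index.json") as fp:
    index = json.load(fp)

key = "model.vision_encoder.post_layernorm.weight"   # listed in the weight_map above
shard = index["weight_map"][key]                     # e.g. "model-00003-of-00004.safetensors"

# Load only that tensor from its shard.
with safe_open(f"{repo_dir}/{shard}", framework="pt", device="cpu") as f:
    tensor = f.get_tensor(key)

print(key, tuple(tensor.shape), "->", shard)
```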
modeling_hulumed_encoder.py ADDED
@@ -0,0 +1,534 @@
1
+ # Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py.
2
+ # Below is the original copyright:
3
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
6
+ # and OPT implementations in this library. It has been modified from its
7
+ # original forms to accommodate minor architectural differences compared
8
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
9
+ #
10
+ # Licensed under the Apache License, Version 2.0 (the "License");
11
+ # you may not use this file except in compliance with the License.
12
+ # You may obtain a copy of the License at
13
+ #
14
+ # http://www.apache.org/licenses/LICENSE-2.0
15
+ #
16
+ # Unless required by applicable law or agreed to in writing, software
17
+ # distributed under the License is distributed on an "AS IS" BASIS,
18
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19
+ # See the License for the specific language governing permissions and
20
+ # limitations under the License.
21
+ """PyTorch HuluMed vision encoder model."""
22
+
23
+ import importlib.util
24
+ import os.path as osp
25
+ import math
26
+ import warnings
27
+
28
+ import torch
29
+ import torch.nn as nn
30
+ import torch.nn.functional as F
31
+ import torch.utils.checkpoint
32
+ from torch.nn.init import _calculate_fan_in_and_fan_out
33
+
34
+ from transformers.activations import ACT2FN
35
+ from transformers.modeling_utils import PreTrainedModel
36
+ from transformers.utils import is_flash_attn_2_available
37
+
38
+ if is_flash_attn_2_available():
39
+ from flash_attn import flash_attn_varlen_func
40
+ else:
41
+ flash_attn_varlen_func = None
42
+
43
+ try:
44
+ from .configuration_hulumed_encoder import HulumedVisionEncoderConfig
45
+ except ImportError:
46
+ spec = importlib.util.spec_from_file_location(
47
+ "configuration_hulumed_encoder",
48
+ osp.join(osp.dirname(__file__), "configuration_hulumed_encoder.py"),
49
+ )
50
+ configuration_hulumed_encoder = importlib.util.module_from_spec(spec)
51
+ spec.loader.exec_module(configuration_hulumed_encoder)
52
+ HulumedVisionEncoderConfig = getattr(
53
+ configuration_hulumed_encoder,
54
+ "HulumedVisionEncoderConfig",
55
+ )
56
+
57
+
58
+ def _trunc_normal_(tensor, mean, std, a, b):
59
+ # Cut & paste from PyTorch official master until it's in a few official releases - RW
60
+ # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
61
+ def norm_cdf(x):
62
+ # Computes standard normal cumulative distribution function
63
+ return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
64
+
65
+ if (mean < a - 2 * std) or (mean > b + 2 * std):
66
+ warnings.warn(
67
+ "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
68
+ "The distribution of values may be incorrect.",
69
+ stacklevel=2,
70
+ )
71
+
72
+ # Values are generated by using a truncated uniform distribution and
73
+ # then using the inverse CDF for the normal distribution.
74
+ # Get upper and lower cdf values
75
+ l = norm_cdf((a - mean) / std)
76
+ u = norm_cdf((b - mean) / std)
77
+
78
+ # Uniformly fill tensor with values from [l, u], then translate to
79
+ # [2l-1, 2u-1].
80
+ tensor.uniform_(2 * l - 1, 2 * u - 1)
81
+
82
+ # Use inverse cdf transform for normal distribution to get truncated
83
+ # standard normal
84
+ tensor.erfinv_()
85
+
86
+ # Transform to proper mean, std
87
+ tensor.mul_(std * math.sqrt(2.0))
88
+ tensor.add_(mean)
89
+
90
+ # Clamp to ensure it's in the proper range
91
+ tensor.clamp_(min=a, max=b)
92
+
93
+
94
+ def trunc_normal_tf_(
95
+ tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
96
+ ) -> torch.Tensor:
97
+ """Fills the input Tensor with values drawn from a truncated
98
+ normal distribution. The values are effectively drawn from the
99
+ normal distribution :math:`\\mathcal{N}(\\text{mean}, \\text{std}^2)`
100
+ with values outside :math:`[a, b]` redrawn until they are within
101
+ the bounds. The method used for generating the random values works
102
+ best when :math:`a \\leq \\text{mean} \\leq b`.
103
+
104
+ NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
105
+ bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
106
+ and the result is subsequently scaled and shifted by the mean and std args.
107
+
108
+ Args:
109
+ tensor: an n-dimensional `torch.Tensor`
110
+ mean: the mean of the normal distribution
111
+ std: the standard deviation of the normal distribution
112
+ a: the minimum cutoff value
113
+ b: the maximum cutoff value
114
+ """
115
+ with torch.no_grad():
116
+ _trunc_normal_(tensor, 0, 1.0, a, b)
117
+ tensor.mul_(std).add_(mean)
118
+
119
+
120
+ def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
121
+ fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
122
+ if mode == "fan_in":
123
+ denom = fan_in
124
+ elif mode == "fan_out":
125
+ denom = fan_out
126
+ elif mode == "fan_avg":
127
+ denom = (fan_in + fan_out) / 2
128
+
129
+ variance = scale / denom
130
+
131
+ if distribution == "truncated_normal":
132
+ # constant is stddev of standard normal truncated to (-2, 2)
133
+ trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
134
+ elif distribution == "normal":
135
+ with torch.no_grad():
136
+ tensor.normal_(std=math.sqrt(variance))
137
+ elif distribution == "uniform":
138
+ bound = math.sqrt(3 * variance)
139
+ with torch.no_grad():
140
+ tensor.uniform_(-bound, bound)
141
+ else:
142
+ raise ValueError(f"invalid distribution {distribution}")
143
+
144
+
145
+ def lecun_normal_(tensor):
146
+ variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
147
+
148
+
149
+ def default_flax_embed_init(tensor):
150
+ variance_scaling_(tensor, mode="fan_in", distribution="normal")
151
+
152
+
153
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
154
+ def rotate_half(x):
155
+ """Rotates half the hidden dims of the input."""
156
+ x1 = x[..., : x.shape[-1] // 2]
157
+ x2 = x[..., x.shape[-1] // 2 :]
158
+ return torch.cat((-x2, x1), dim=-1)
159
+
160
+
161
+ def apply_rotary_pos_emb_vision(tensor: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
162
+ orig_dtype = tensor.dtype
163
+ tensor = tensor.float()
164
+ cos = freqs.cos()
165
+ sin = freqs.sin()
166
+ cos = cos.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
167
+ sin = sin.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
168
+ output = (tensor * cos) + (rotate_half(tensor) * sin)
169
+ output = output.to(orig_dtype)
170
+ return output
171
+
172
+
173
+ class VisionRotaryEmbedding(nn.Module):
174
+
175
+ def __init__(self, dim: int, theta: float = 10000.0) -> None:
176
+ super().__init__()
177
+ inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
178
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
179
+
180
+ def forward(self, seqlen: int) -> torch.Tensor:
181
+ seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
182
+ freqs = torch.outer(seq, self.inv_freq)
183
+ return freqs
184
+
185
+
186
+ class HulumedVisionEmbeddings(nn.Module):
187
+
188
+ def __init__(self, config: HulumedVisionEncoderConfig):
189
+ super().__init__()
190
+ self.config = config
191
+ self.embed_dim = config.hidden_size
192
+ self.patch_size = config.patch_size
193
+
194
+ self.patch_embedding = nn.Conv2d(
195
+ in_channels=config.num_channels,
196
+ out_channels=self.embed_dim,
197
+ kernel_size=self.patch_size,
198
+ stride=self.patch_size,
199
+ padding="valid",
200
+ )
201
+
202
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
203
+ hidden_states = hidden_states.view(
204
+ -1, self.config.num_channels, self.patch_size, self.patch_size
205
+ )
206
+ patch_embeds = self.patch_embedding(hidden_states) # shape = [*, width, grid, grid]
207
+ # embeddings = patch_embeds.flatten(2).transpose(1, 2)
208
+ embeddings = patch_embeds.view(-1, self.embed_dim)
209
+
210
+ return embeddings
211
+
212
+
213
+ class VisionAttention(nn.Module):
214
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
215
+
216
+ # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
217
+ def __init__(self, config):
218
+ super().__init__()
219
+ self.config = config
220
+ self.embed_dim = config.hidden_size
221
+ self.num_heads = config.num_attention_heads
222
+ self.head_dim = self.embed_dim // self.num_heads
223
+ if self.head_dim * self.num_heads != self.embed_dim:
224
+ raise ValueError(
225
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
226
+ f" {self.num_heads})."
227
+ )
228
+ self.scale = self.head_dim**-0.5
229
+ self.dropout = config.attention_dropout
230
+
231
+ self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
232
+ self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
233
+ self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
234
+ self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
235
+
236
+ def forward(
237
+ self,
238
+ hidden_states: torch.Tensor,
239
+ cu_seqlens: torch.Tensor,
240
+ rotary_pos_emb: torch.Tensor = None,
241
+ ) -> torch.Tensor:
242
+ """Input shape: Time x Channel"""
243
+
244
+ q_len, _ = hidden_states.size()
245
+
246
+ query_states = self.q_proj(hidden_states)
247
+ key_states = self.k_proj(hidden_states)
248
+ value_states = self.v_proj(hidden_states)
249
+
250
+ query_states = query_states.view(q_len, self.num_heads, self.head_dim)
251
+ key_states = key_states.view(q_len, self.num_heads, self.head_dim)
252
+ value_states = value_states.view(q_len, self.num_heads, self.head_dim)
253
+
254
+ query_states = apply_rotary_pos_emb_vision(query_states.unsqueeze(0), rotary_pos_emb).squeeze(0)
255
+ key_states = apply_rotary_pos_emb_vision(key_states.unsqueeze(0), rotary_pos_emb).squeeze(0)
256
+
257
+ attention_mask = torch.full([1, q_len, q_len], torch.finfo(query_states.dtype).min, device=query_states.device, dtype=query_states.dtype)  # additive mask: -inf outside each sequence block
258
+ for i in range(1, len(cu_seqlens)):
259
+ attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = 0
260
+
261
+ query_states = query_states.transpose(0, 1)
262
+ key_states = key_states.transpose(0, 1)
263
+ value_states = value_states.transpose(0, 1)
264
+
265
+ attn_weights = torch.matmul(query_states, key_states.transpose(1, 2)) / math.sqrt(self.head_dim)
266
+ attn_weights = attn_weights + attention_mask
267
+
268
+ # upcast attention to fp32
269
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
270
+ attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
271
+ attn_output = torch.matmul(attn_weights, value_states)
272
+
273
+ attn_output = attn_output.transpose(0, 1)
274
+ attn_output = attn_output.reshape(q_len, -1)
275
+ attn_output = self.out_proj(attn_output)
276
+
277
+ return attn_output
278
+
279
+
280
+ class VisionFlashAttention2(VisionAttention):
281
+
282
+ def __init__(self, *args, **kwargs):
283
+ super().__init__(*args, **kwargs)
284
+
285
+ # Adapted from transformers.models.llama.modeling_llama.LlamaFlashAttention2.forward
286
+ def forward(
287
+ self,
288
+ hidden_states: torch.Tensor,
289
+ cu_seqlens: torch.Tensor,
290
+ rotary_pos_emb: torch.Tensor = None,
291
+ ) -> torch.Tensor:
292
+ q_len, _ = hidden_states.size()
293
+
294
+ query_states = self.q_proj(hidden_states)
295
+ key_states = self.k_proj(hidden_states)
296
+ value_states = self.v_proj(hidden_states)
297
+
298
+ # Flash attention requires the input to have the shape
299
+ # batch_size x seq_length x head_dim x hidden_dim
300
+ # therefore we just need to keep the original shape
301
+ query_states = query_states.view(q_len, self.num_heads, self.head_dim)
302
+ key_states = key_states.view(q_len, self.num_heads, self.head_dim)
303
+ value_states = value_states.view(q_len, self.num_heads, self.head_dim)
304
+ query_states = apply_rotary_pos_emb_vision(query_states.unsqueeze(0), rotary_pos_emb).squeeze(0)
305
+ key_states = apply_rotary_pos_emb_vision(key_states.unsqueeze(0), rotary_pos_emb).squeeze(0)
306
+
307
+ max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
308
+ attn_output = flash_attn_varlen_func(query_states, key_states, value_states, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen).reshape(
309
+ q_len, -1
310
+ )
311
+ attn_output = self.out_proj(attn_output)
312
+
313
+ return attn_output
314
+
315
+
316
+ class VisionSdpaAttention(VisionAttention):
317
+
318
+ def forward(
319
+ self,
320
+ hidden_states: torch.Tensor,
321
+ cu_seqlens: torch.Tensor,
322
+ rotary_pos_emb: torch.Tensor = None,
323
+ ) -> torch.Tensor:
324
+ seq_length = hidden_states.shape[0]
325
+ query_states = self.q_proj(hidden_states)
326
+ key_states = self.k_proj(hidden_states)
327
+ value_states = self.v_proj(hidden_states)
328
+
329
+ query_states = query_states.view(seq_length, self.num_heads, self.head_dim)
330
+ key_states = key_states.view(seq_length, self.num_heads, self.head_dim)
331
+ value_states = value_states.view(seq_length, self.num_heads, self.head_dim)
332
+
333
+ query_states = apply_rotary_pos_emb_vision(query_states.unsqueeze(0), rotary_pos_emb).squeeze(0)
334
+ key_states = apply_rotary_pos_emb_vision(key_states.unsqueeze(0), rotary_pos_emb).squeeze(0)
335
+
336
+ attention_mask = torch.zeros([1, seq_length, seq_length], device=query_states.device, dtype=torch.bool)
337
+ for i in range(1, len(cu_seqlens)):
338
+ attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = True
339
+
340
+ query_states = query_states.transpose(0, 1)
341
+ key_states = key_states.transpose(0, 1)
342
+ value_states = value_states.transpose(0, 1)
343
+ attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attention_mask, dropout_p=0.0)
344
+ attn_output = attn_output.transpose(0, 1)
345
+ attn_output = attn_output.reshape(seq_length, -1)
346
+ attn_output = self.out_proj(attn_output)
347
+ return attn_output
348
+
349
+
350
+ VISION_ATTENTION_CLASSES = {
351
+ "eager": VisionAttention,
352
+ "flash_attention_2": VisionFlashAttention2,
353
+ "sdpa": VisionSdpaAttention,
354
+ }
355
+
356
+
357
+ # Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Hulumed
358
+ class HulumedVisionMLP(nn.Module):
359
+
360
+ def __init__(self, config):
361
+ super().__init__()
362
+ self.config = config
363
+ self.activation_fn = ACT2FN[config.hidden_act]
364
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
365
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
366
+
367
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
368
+ hidden_states = self.fc1(hidden_states)
369
+ hidden_states = self.activation_fn(hidden_states)
370
+ hidden_states = self.fc2(hidden_states)
371
+ return hidden_states
372
+
373
+
374
+ class HulumedVisionEncoderLayer(nn.Module):
375
+
376
+ def __init__(self, config: HulumedVisionEncoderConfig):
377
+ super().__init__()
378
+ self.embed_dim = config.hidden_size
379
+ self.self_attn = VISION_ATTENTION_CLASSES[config._attn_implementation](config=config)
380
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
381
+ self.mlp = HulumedVisionMLP(config)
382
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
383
+
384
+ # Ignore copy
385
+ def forward(self, hidden_states, cu_seqlens, rotary_pos_emb) -> torch.Tensor:
386
+ hidden_states = hidden_states + self.self_attn(
387
+ self.layer_norm1(hidden_states), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb
388
+ )
389
+ hidden_states = hidden_states + self.mlp(self.layer_norm2(hidden_states))
390
+ return hidden_states
391
+
392
+
393
+ class HulumedVisionTransformerEncoder(nn.Module):
394
+
395
+ def __init__(self, config: HulumedVisionEncoderConfig):
396
+ super().__init__()
397
+ self.config = config
398
+ head_dim = config.hidden_size // config.num_attention_heads
399
+ self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)
400
+ self.layers = nn.ModuleList([HulumedVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
401
+ self.gradient_checkpointing = False
402
+
403
+ def rot_pos_emb(self, grid_sizes, merge_sizes):
404
+ pos_ids = []
405
+ for (t, h, w), merge_size in zip(grid_sizes, merge_sizes):
406
+ hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
407
+ hpos_ids = hpos_ids.reshape(
408
+ h // merge_size,
409
+ merge_size,
410
+ w // merge_size,
411
+ merge_size,
412
+ )
413
+ hpos_ids = hpos_ids.permute(0, 2, 1, 3)
414
+ hpos_ids = hpos_ids.flatten()
415
+
416
+ wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
417
+ wpos_ids = wpos_ids.reshape(
418
+ h // merge_size,
419
+ merge_size,
420
+ w // merge_size,
421
+ merge_size,
422
+ )
423
+ wpos_ids = wpos_ids.permute(0, 2, 1, 3)
424
+ wpos_ids = wpos_ids.flatten()
425
+ pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
426
+
427
+ pos_ids = torch.cat(pos_ids, dim=0)
428
+ max_grid_size = grid_sizes[:, 1:].max()
429
+ rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
430
+ rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
431
+
432
+ return rotary_pos_emb
433
+
434
+ def forward(self, hidden_states, grid_sizes, merge_sizes) -> torch.Tensor:
435
+ rotary_pos_emb = self.rot_pos_emb(grid_sizes, merge_sizes)
436
+
437
+ cu_seqlens = torch.repeat_interleave(grid_sizes[:, 1] * grid_sizes[:, 2], grid_sizes[:, 0]).cumsum(dim=0, dtype=torch.int32)
438
+ cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
439
+
440
+ for blk in self.layers:
441
+ if self.gradient_checkpointing and self.training:
442
+ hidden_states = self._gradient_checkpointing_func(
443
+ blk.__call__,
444
+ hidden_states,
445
+ cu_seqlens,
446
+ rotary_pos_emb
447
+ )
448
+ else:
449
+ hidden_states = blk(hidden_states, cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb)
450
+
451
+ return hidden_states
452
+
453
+
454
+ class HulumedVisionEncoderModel(PreTrainedModel):
455
+
456
+ config_class = HulumedVisionEncoderConfig
457
+ base_model_prefix = "hulumed"
458
+ main_input_name = "pixel_values"
459
+ supports_gradient_checkpointing = True
460
+ _no_split_modules = [
461
+ "HulumedVisionEncoderLayer",
462
+ "HulumedVisionEmbeddings",
463
+ ]
464
+ _supports_flash_attn_2 = True
465
+ _supports_sdpa = True
466
+
467
+ def __init__(self, config: HulumedVisionEncoderConfig):
468
+ super().__init__(config=config)
469
+ embed_dim = config.hidden_size
470
+
471
+ self.embeddings = HulumedVisionEmbeddings(config)
472
+ self.encoder = HulumedVisionTransformerEncoder(config)
473
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
474
+
475
+ self.post_init()
476
+
477
+ def forward(self, pixel_values, grid_sizes, merge_sizes=None) -> torch.Tensor:
478
+ hidden_states = self.embeddings(pixel_values)
479
+ hidden_states = self.encoder(hidden_states, grid_sizes, merge_sizes)
480
+ hidden_states = self.post_layernorm(hidden_states)
481
+
482
+ hidden_states_chunks = hidden_states.split(grid_sizes.prod(dim=1).tolist(), dim=0)
483
+ outputs = []
484
+
485
+ for hidden_states, grid_size, merge_size in zip(hidden_states_chunks, grid_sizes, merge_sizes):
486
+ # NOTE: previous implementation, which supports downsampling with any factor
487
+ c = hidden_states.shape[-1]
488
+ hidden_states = hidden_states.view(
489
+ grid_size[0], grid_size[1] // merge_size, grid_size[2] // merge_size, merge_size, merge_size, c
490
+ ).permute(0, 1, 3, 2, 4, 5)
491
+ hidden_states = hidden_states.reshape(
492
+ grid_size[0], grid_size[1], grid_size[2], c
493
+ ).permute(0, 3, 1, 2)
494
+ hidden_states = torch.nn.functional.interpolate(
495
+ hidden_states,
496
+ size=(grid_size[1] // merge_size, grid_size[2] // merge_size),
497
+ mode='bilinear'
498
+ )
499
+ hidden_states = hidden_states.permute(0, 2, 3, 1).view(-1, c)
500
+
501
+ # NOTE: simplified implementation, which only supports downsampling with integer factor
502
+ # NOTE: this implementation is mathematically equivalent to the previous one when merge_size is 1 or 2, but floating-point rounding may yield slightly different results
503
+ # hidden_states = hidden_states.view(-1, merge_size * merge_size, hidden_states.size(-1))
504
+ # hidden_states = hidden_states.mean(dim=1)
505
+
506
+ outputs.append(hidden_states)
507
+
508
+ return torch.cat(outputs, dim=0)
509
+
510
+ def _init_weights(self, module):
511
+ """Initialize the weights"""
512
+ if isinstance(module, nn.Embedding):
513
+ default_flax_embed_init(module.weight)
514
+ elif isinstance(module, VisionAttention):
515
+ nn.init.xavier_uniform_(module.q_proj.weight)
516
+ nn.init.xavier_uniform_(module.k_proj.weight)
517
+ nn.init.xavier_uniform_(module.v_proj.weight)
518
+ nn.init.xavier_uniform_(module.out_proj.weight)
519
+ nn.init.zeros_(module.q_proj.bias)
520
+ nn.init.zeros_(module.k_proj.bias)
521
+ nn.init.zeros_(module.v_proj.bias)
522
+ nn.init.zeros_(module.out_proj.bias)
523
+ elif isinstance(module, HulumedVisionMLP):
524
+ nn.init.xavier_uniform_(module.fc1.weight)
525
+ nn.init.xavier_uniform_(module.fc2.weight)
526
+ nn.init.normal_(module.fc1.bias, std=1e-6)
527
+ nn.init.normal_(module.fc2.bias, std=1e-6)
528
+ elif isinstance(module, (nn.Linear, nn.Conv2d)):
529
+ lecun_normal_(module.weight)
530
+ if module.bias is not None:
531
+ nn.init.zeros_(module.bias)
532
+ elif isinstance(module, nn.LayerNorm):
533
+ module.bias.data.zero_()
534
+ module.weight.data.fill_(1.0)
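
The encoder above consumes flattened patches together with explicit `grid_sizes` and `merge_sizes`, and its forward emits one token per `merge_size x merge_size` patch group. The sketch below only illustrates that tensor bookkeeping with made-up sizes; in practice the values come from the processor and `HulumedVisionEncoderConfig`, and nothing here invokes the model itself.

```python
import torch

# Illustrative sizes only; real values come from the processor / config.
patch_size, num_channels = 14, 3
t, h, w = 1, 16, 16          # one grid_sizes row: frames x patch rows x patch cols
merge_size = 2               # one merge_sizes entry: spatial pooling factor per side

num_patches = t * h * w
# HulumedVisionEmbeddings reshapes this to (-1, C, patch, patch) internally,
# so each row holds one flattened patch.
pixel_values = torch.randn(num_patches, num_channels * patch_size * patch_size)
grid_sizes = torch.tensor([[t, h, w]])
merge_sizes = torch.tensor([merge_size])

# The encoder's forward returns one token per merge_size x merge_size patch group:
num_output_tokens = t * (h // merge_size) * (w // merge_size)
print(num_patches, "input patches ->", num_output_tokens, "visual tokens")
```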
modeling_hulumed_qwen2.py ADDED
@@ -0,0 +1,525 @@
1
+ # Adapted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch HuluMed model."""
16
+
17
+ import importlib.util
18
+ import os.path as osp
19
+ import re
20
+ from abc import ABC, abstractmethod
21
+ from typing import List, Optional, Tuple, Union
22
+
23
+ import torch
24
+ import torch.nn as nn
25
+ import torch.utils.checkpoint
26
+ from transformers import AutoModel, Qwen2ForCausalLM, Qwen2Model
27
+ from transformers.generation.utils import GenerateOutput
28
+ from transformers.modeling_outputs import CausalLMOutputWithPast
29
+
30
+ CONTROLLER_HEART_BEAT_EXPIRATION = 30
31
+ WORKER_HEART_BEAT_INTERVAL = 15
32
+
33
+ LOGDIR = "."
34
+
35
+ # Model Constants
36
+ IGNORE_INDEX = -100
37
+
38
+ # Image arguments
39
+ IMAGE_TOKEN_INDEX = -200
40
+ DEFAULT_IMAGE_TOKEN = "<image>"
41
+ DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
42
+ DEFAULT_IM_START_TOKEN = "<im_start>"
43
+ DEFAULT_IM_END_TOKEN = "<im_end>"
44
+ IMAGE_PLACEHOLDER = "<image-placeholder>"
45
+
46
+ # Video arguments
47
+ VIDEO_TOKEN_INDEX = -201
48
+ DEFAULT_VIDEO_TOKEN = "<video>"
49
+ NUM_FRAMES = 128
50
+ MAX_FRAMES = 768
51
+ NUM_FRAMES_PER_SECOND = 1
52
+
53
+ # Audio arguments
54
+ AUDIO_TOKEN_INDEX = -202
55
+ DEFAULT_AUDIO_TOKEN = "<audio>"
56
+
57
+ # Stream arguments
58
+ STREAM_START_TOKEN = "<|stream_start|>"
59
+ STREAM_END_TOKEN = "<|stream_end|>"
60
+ STREAM_MAX_FRAMES = 400
61
+
62
+ MODAL_INDEX_MAP = {
63
+ "<image>": -200,
64
+ "<video>": -201,
65
+ "<audio>": -202,
66
+ }
67
+
68
+ subimage_token_num = 196
69
+ try:
70
+ from .configuration_hulumed_qwen2 import HulumedQwen2Config
71
+ except ModuleNotFoundError:
72
+ spec = importlib.util.spec_from_file_location(
73
+ "configuration_hulumed_qwen2",
74
+ osp.join(osp.dirname(__file__), "configuration_hulumed_qwen2.py"),
75
+ )
76
+ configuration_hulumed_qwen2 = importlib.util.module_from_spec(spec)
77
+ spec.loader.exec_module(configuration_hulumed_qwen2)
78
+ HulumedQwen2Config = getattr(
79
+ configuration_hulumed_qwen2,
80
+ "HulumedQwen2Config",
81
+ )
82
+
83
+
84
+ def build_mlp(depth, hidden_size, output_hidden_size):
85
+ """Build MLP layers for projection."""
86
+ modules = [nn.Linear(hidden_size, output_hidden_size)]
87
+ for _ in range(1, depth):
88
+ modules.append(nn.GELU())
89
+ modules.append(nn.Linear(output_hidden_size, output_hidden_size))
90
+ return nn.Sequential(*modules)
91
+
92
+
93
+ def build_vision_projector(config, delay_load=False, **kwargs):
94
+ """Build vision projector based on config."""
95
+ projector_type = getattr(config, 'mm_projector_type', 'linear')
96
+
97
+ if projector_type == "linear":
98
+ return nn.Linear(config.vision_encoder_config.hidden_size, config.hidden_size)
99
+ elif projector_type.startswith("mlp"):
100
+ return MlpGeluProjector(config, projector_type)
101
+ else:
102
+ raise ValueError(f'Unknown projector type: {projector_type}')
103
+
104
+
105
+ class MlpGeluProjector(nn.Module):
106
+ """MLP projector with GELU activation."""
107
+
108
+ def __init__(self, config, projector_type):
109
+ super().__init__()
110
+
111
+ mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", projector_type)
112
+ if mlp_gelu_match is None:
113
+ raise ValueError(f"Invalid projector type format: {projector_type}")
114
+ mlp_depth = int(mlp_gelu_match.group(1))
115
+
116
+ self.readout = build_mlp(
117
+ mlp_depth,
118
+ config.vision_encoder_config.hidden_size,
119
+ config.hidden_size
120
+ )
121
+
122
+ def forward(self, x):
123
+ return self.readout(x)
124
+
125
+
126
+ class HulumedMetaModel:
127
+ """Meta model for HuluMed that handles vision encoder initialization."""
128
+
129
+ def __init__(self, config):
130
+ super(HulumedMetaModel, self).__init__(config)
131
+ print('config.vision_encoder',config.vision_encoder)
132
+ if config.vision_encoder is not None:
133
+ # Load from pretrained path
134
+ print('Load from pretrained path')
135
+ self.vision_encoder = AutoModel.from_pretrained(
136
+ config.vision_encoder,
137
+ attn_implementation=self.config._attn_implementation,
138
+ torch_dtype=self.dtype,
139
+ )
140
+ self.config.vision_encoder_config = self.vision_encoder.config
141
+ self.config.vision_encoder = None
142
+ elif config.vision_encoder_config is not None:
143
+ # Build from config
144
+ print('Build from config')
145
+ self.vision_encoder = AutoModel.from_config(
146
+ self.config.vision_encoder_config,
147
+ attn_implementation=self.config._attn_implementation,
148
+ torch_dtype=self.dtype,
149
+ )
150
+ else:
151
+ raise ValueError("Vision encoder is not provided in config")
152
+
153
+ self.mm_projector = build_vision_projector(config)
154
+
155
+ def get_vision_encoder(self):
156
+ return self.vision_encoder
157
+
158
+ def get_mm_projector(self):
159
+ return self.mm_projector
160
+
161
+
162
+ class HulumedQwen2Model(HulumedMetaModel, Qwen2Model):
163
+ """HuluMed Qwen2 Model."""
164
+
165
+ config_class = HulumedQwen2Config
166
+
167
+ def __init__(self, config: HulumedQwen2Config):
168
+ super(HulumedQwen2Model, self).__init__(config)
169
+
170
+
171
+ class HulumedMetaForCausalLM(ABC):
172
+ """Meta class for HuluMed Causal LM with multimodal support."""
173
+
174
+ @abstractmethod
175
+ def get_model(self):
176
+ pass
177
+
178
+ def get_vision_encoder(self):
179
+ return self.get_model().get_vision_encoder()
180
+
181
+ def get_mm_projector(self):
182
+ return self.get_model().get_mm_projector()
183
+
184
+ def encode_images(
185
+ self,
186
+ pixel_values: torch.FloatTensor,
187
+ grid_sizes: torch.LongTensor,
188
+ merge_sizes: torch.LongTensor,
189
+ ) -> torch.FloatTensor:
190
+ """Encode images using vision encoder and projector."""
191
+ mm_features = self.get_model().get_vision_encoder()(
192
+ pixel_values=pixel_values,
193
+ grid_sizes=grid_sizes,
194
+ merge_sizes=merge_sizes,
195
+ )
196
+ mm_features = self.get_model().mm_projector(mm_features)
197
+ return mm_features
198
+
199
+ def _get_valid_visual_tokens(
200
+ self,
201
+ mm_features: torch.FloatTensor,
202
+ batched_num_patches: torch.LongTensor,
203
+ modals: List[str],
204
+ ):
205
+ """Filter out text-only samples and keep only valid visual tokens."""
206
+ valid_masks = []
207
+ for num_patches, modal in zip(batched_num_patches, modals):
208
+ valid_mask = torch.full(
209
+ (num_patches,),
210
+ modal != "text",
211
+ dtype=torch.bool,
212
+ device=mm_features.device
213
+ )
214
+ valid_masks.append(valid_mask)
215
+ mm_features = mm_features[torch.cat(valid_masks)]
216
+ return mm_features
217
+
218
+ def _maybe_truncate_visual_tokens(
219
+ self,
220
+ mm_features: torch.FloatTensor,
221
+ compression_mask: torch.BoolTensor,
222
+ batched_num_patches: torch.LongTensor,
223
+ modals: List[str],
224
+ input_ids: torch.LongTensor,
225
+ position_ids: Optional[torch.LongTensor] = None,
226
+ ):
227
+ """Truncate visual tokens if necessary based on position_ids."""
228
+ if position_ids is None or mm_features.shape[0] == input_ids.eq(self.config.image_token_index).sum():
229
+ return mm_features, compression_mask
230
+
231
+ truncation_mask = []
232
+ for num_patches, modal in zip(batched_num_patches, modals):
233
+ if modal == "text":
234
+ truncation_mask.append(torch.ones((0,), dtype=torch.bool, device=input_ids.device))
235
+ else:
236
+ truncation_mask.append(torch.ones((num_patches,), dtype=torch.bool, device=input_ids.device))
237
+
238
+ seq_end_indices = torch.nonzero(position_ids == 0)[:, 0]
239
+ seq_end_indices = seq_end_indices[seq_end_indices > 0].tolist() + [len(input_ids)]
240
+ seq_start_indices = [0] + seq_end_indices[:-1]
241
+ num_visual_tokens = [
242
+ input_ids[start:end].eq(self.config.image_token_index).sum()
243
+ for start, end in zip(seq_start_indices, seq_end_indices)
244
+ ]
245
+
246
+ for n, mask in zip(num_visual_tokens, truncation_mask):
247
+ if len(mask) > 0:
248
+ mask[n:] = False
249
+ truncation_mask = torch.cat(truncation_mask)
250
+
251
+ return mm_features[truncation_mask], compression_mask[truncation_mask]
252
+
253
+ def _get_compression_mask(
254
+ self,
255
+ pixel_values: torch.FloatTensor,
256
+ batched_num_patches: torch.LongTensor,
257
+ grid_sizes: torch.LongTensor,
258
+ merge_sizes: torch.LongTensor,
259
+ modals: List[str],
260
+ threshold: float = 0.1,
261
+ min_tokens: int = 1,
262
+ ) -> torch.BoolTensor:
263
+ """Get compression mask for video tokens based on frame differences."""
264
+ batched_images = pixel_values.split(grid_sizes.prod(dim=1).tolist(), dim=0)
265
+ compression_masks = []
266
+
267
+ for images, num_patches, grid_size, merge_size, modal in zip(
268
+ batched_images, batched_num_patches, grid_sizes, merge_sizes, modals
269
+ ):
270
+ t, h, w = grid_size
271
+ if modal == "image" or (modal == "video" and t == 1):
272
+ compression_masks.append(torch.ones((num_patches,), dtype=torch.bool, device=images.device))
273
+
274
+ elif modal == "video":
275
+ # Video token compression based on pixel differences
276
+ images = images.view(t, (h // merge_size) * (w // merge_size), -1)
277
+
278
+ pixel_diff = images[1:] - images[:-1]
279
+ pixel_diff = torch.abs(pixel_diff).mean(dim=-1) * 255
280
+ pixel_diff = torch.cat([torch.full_like(pixel_diff[0:1], threshold + 1), pixel_diff], dim=0)
281
+ mask = (pixel_diff / 255.0) > threshold
282
+ padding_ids = torch.nonzero(mask.sum(dim=1) < min_tokens)[:, 0]
283
+ mask[padding_ids, :min_tokens] = 1
284
+ compression_masks.append(mask.flatten())
285
+
286
+ else:
287
+ # Pseudo image case
288
+ compression_masks.append(torch.ones((0,), dtype=torch.bool, device=images.device))
289
+
290
+ return torch.cat(compression_masks)
291
+
292
+ def _compress_visual_tokens(
293
+ self,
294
+ compression_mask: torch.BoolTensor,
295
+ mm_features: torch.FloatTensor,
296
+ input_ids: torch.LongTensor,
297
+ attention_mask: Optional[torch.Tensor] = None,
298
+ position_ids: Optional[torch.LongTensor] = None,
299
+ labels: Optional[torch.LongTensor] = None,
300
+ ):
301
+ """Compress visual tokens based on compression mask."""
302
+ mm_features = mm_features[compression_mask]
303
+ image_selected = (input_ids == self.config.image_token_index)
304
+
305
+ text_masks = torch.logical_not(image_selected)
306
+ text_masks[image_selected] = compression_mask
307
+ input_ids = input_ids[text_masks]
308
+
309
+ if attention_mask is not None:
310
+ attention_mask = attention_mask[text_masks]
311
+ if labels is not None:
312
+ labels = labels[text_masks]
313
+ if position_ids is not None:
314
+ position_ids = position_ids[text_masks]
315
+ pos_start = [0] + torch.nonzero(position_ids == 0)[:, 0].tolist()
316
+ pos_end = pos_start[1:] + [len(input_ids)]
317
+ position_ids = torch.cat([
318
+ torch.arange(end - start, device=input_ids.device)
319
+ for start, end in zip(pos_start, pos_end)
320
+ ])
321
+
322
+ return mm_features, input_ids, attention_mask, position_ids, labels
323
+
324
+ def prepare_inputs_labels_for_multimodal(
325
+ self,
326
+ input_ids: torch.LongTensor = None,
327
+ attention_mask: Optional[torch.Tensor] = None,
328
+ position_ids: Optional[torch.LongTensor] = None,
329
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
330
+ labels: Optional[torch.LongTensor] = None,
331
+ pixel_values: Optional[torch.FloatTensor] = None,
332
+ grid_sizes: Optional[torch.LongTensor] = None,
333
+ merge_sizes: Optional[torch.LongTensor] = None,
334
+ modals: Optional[List[str]] = None,
335
+ ):
336
+ """Prepare inputs and labels for multimodal training/inference."""
337
+ vision_encoder = self.get_vision_encoder()
338
+
339
+ # Text-only situation
340
+ if vision_encoder is None or pixel_values is None or input_ids.shape[1] == 1:
341
+ return input_ids, attention_mask, position_ids, past_key_values, None, labels
342
+
343
+ # 1. Flatten text inputs
344
+ B, N = input_ids.shape
345
+ input_ids = input_ids.view(B * N)
346
+ if attention_mask is not None:
347
+ attention_mask = attention_mask.view(B * N)
348
+ if position_ids is not None:
349
+ position_ids = position_ids.view(B * N)
350
+ if labels is not None:
351
+ labels = labels.view(B * N)
352
+
353
+ # 2. Embed visual tokens
354
+ batched_num_patches = grid_sizes.prod(dim=1).div(merge_sizes ** 2).long()
355
+ mm_features = self.encode_images(pixel_values, grid_sizes, merge_sizes).to(input_ids.device)
356
+ mm_features = self._get_valid_visual_tokens(mm_features, batched_num_patches, modals)
357
+
358
+ compression_mask = self._get_compression_mask(
359
+ pixel_values, batched_num_patches, grid_sizes, merge_sizes, modals
360
+ )
361
+ mm_features, compression_mask = self._maybe_truncate_visual_tokens(
362
+ mm_features, compression_mask, batched_num_patches, modals, input_ids, position_ids
363
+ )
364
+
365
+ # 3. Compress visual tokens if enabled
366
+ if self.config.use_token_compression:
367
+ assert B == 1, "Token compression is only supported for batch_size=1"
368
+ mm_features, input_ids, attention_mask, position_ids, labels = self._compress_visual_tokens(
369
+ compression_mask, mm_features, input_ids, attention_mask, position_ids, labels
370
+ )
371
+
372
+ # 4. Embed text tokens
373
+ inputs_embeds = self.get_model().embed_tokens(input_ids).clone()
374
+
375
+ # 5. Replace multimodal tokens with features
376
+ image_selected = (input_ids == self.config.image_token_index)
377
+ inputs_embeds[image_selected] = inputs_embeds[image_selected] * 0.0 + mm_features
378
+
379
+ # 6. Reshape back to batched format
380
+ C = inputs_embeds.shape[-1]
381
+ inputs_embeds = inputs_embeds.reshape(B, -1, C)
382
+ if attention_mask is not None:
383
+ attention_mask = attention_mask.view(B, -1)
384
+ if labels is not None:
385
+ labels = labels.view(B, -1)
386
+ if position_ids is not None:
387
+ position_ids = position_ids.view(B, -1)
388
+
389
+ return None, attention_mask, position_ids, past_key_values, inputs_embeds, labels
390
+
391
+
392
+ class HulumedQwen2ForCausalLM(Qwen2ForCausalLM, HulumedMetaForCausalLM):
393
+ """HuluMed Qwen2 model for causal language modeling with multimodal support."""
394
+
395
+ config_class = HulumedQwen2Config
396
+
397
+ def __init__(self, config, **kwargs):
398
+ super(Qwen2ForCausalLM, self).__init__(config)
399
+ self.model = HulumedQwen2Model(config)
400
+ self.vocab_size = config.vocab_size
401
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
402
+
403
+ # Initialize weights and apply final processing
404
+ self.post_init()
405
+
406
+ def get_model(self):
407
+ return self.model
408
+
409
+ def forward(
410
+ self,
411
+ input_ids: torch.LongTensor = None,
412
+ attention_mask: Optional[torch.Tensor] = None,
413
+ position_ids: Optional[torch.LongTensor] = None,
414
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
415
+ inputs_embeds: Optional[torch.FloatTensor] = None,
416
+ labels: Optional[torch.LongTensor] = None,
417
+ use_cache: Optional[bool] = None,
418
+ output_attentions: Optional[bool] = None,
419
+ output_hidden_states: Optional[bool] = None,
420
+ return_dict: Optional[bool] = None,
421
+ cache_position: Optional[torch.LongTensor] = None,
422
+ num_logits_to_keep: int = 0,
423
+ # Multimodal inputs
424
+ pixel_values: Optional[torch.FloatTensor] = None,
425
+ grid_sizes: Optional[torch.LongTensor] = None,
426
+ merge_sizes: Optional[torch.LongTensor] = None,
427
+ modals: Optional[List[str]] = None,
428
+ **loss_kwargs,
429
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
430
+ """Forward pass with multimodal support."""
431
+ if inputs_embeds is None:
432
+ (
433
+ input_ids,
434
+ attention_mask,
435
+ position_ids,
436
+ past_key_values,
437
+ inputs_embeds,
438
+ labels,
439
+ ) = self.prepare_inputs_labels_for_multimodal(
440
+ input_ids=input_ids,
441
+ attention_mask=attention_mask,
442
+ position_ids=position_ids,
443
+ past_key_values=past_key_values,
444
+ labels=labels,
445
+ pixel_values=pixel_values,
446
+ grid_sizes=grid_sizes,
447
+ merge_sizes=merge_sizes,
448
+ modals=modals,
449
+ )
450
+
451
+ return super().forward(
452
+ input_ids=input_ids,
453
+ attention_mask=attention_mask,
454
+ position_ids=position_ids,
455
+ past_key_values=past_key_values,
456
+ inputs_embeds=inputs_embeds,
457
+ labels=labels,
458
+ use_cache=use_cache,
459
+ output_attentions=output_attentions,
460
+ output_hidden_states=output_hidden_states,
461
+ return_dict=return_dict,
462
+ cache_position=cache_position,
463
+ num_logits_to_keep=num_logits_to_keep,
464
+ **loss_kwargs,
465
+ )
466
+
467
+ @torch.no_grad()
468
+ def generate(
469
+ self,
470
+ # Multimodal inputs
471
+ pixel_values: Optional[torch.FloatTensor] = None,
472
+ grid_sizes: Optional[torch.LongTensor] = None,
473
+ merge_sizes: Optional[torch.LongTensor] = None,
474
+ modals: Optional[List[str]] = None,
475
+ **kwargs,
476
+ ) -> Union[GenerateOutput, torch.LongTensor]:
477
+ """Generate with multimodal support."""
478
+ input_ids = kwargs.pop("input_ids", None)
479
+ attention_mask = kwargs.pop("attention_mask", None)
480
+ position_ids = kwargs.pop("position_ids", None)
481
+ past_key_values = kwargs.pop("past_key_values", None)
482
+
483
+ if "inputs_embeds" in kwargs:
484
+ raise NotImplementedError("`inputs_embeds` is not supported")
485
+
486
+ if pixel_values is not None:
487
+ (
488
+ input_ids,
489
+ attention_mask,
490
+ position_ids,
491
+ past_key_values,
492
+ inputs_embeds,
493
+ labels,
494
+ ) = self.prepare_inputs_labels_for_multimodal(
495
+ input_ids=input_ids,
496
+ attention_mask=attention_mask,
497
+ position_ids=position_ids,
498
+ past_key_values=past_key_values,
499
+ labels=None,
500
+ pixel_values=pixel_values,
501
+ grid_sizes=grid_sizes,
502
+ merge_sizes=merge_sizes,
503
+ modals=modals,
504
+ )
505
+ else:
506
+ inputs_embeds = self.get_model().embed_tokens(input_ids)
507
+
508
+ return super().generate(
509
+ position_ids=position_ids,
510
+ attention_mask=attention_mask,
511
+ inputs_embeds=inputs_embeds,
512
+ **kwargs
513
+ )
514
+
515
+ def prepare_inputs_for_generation(
516
+ self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs
517
+ ):
518
+ """Prepare inputs for generation."""
519
+ images = kwargs.pop("images", None)
520
+ _inputs = super().prepare_inputs_for_generation(
521
+ input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
522
+ )
523
+ if images is not None:
524
+ _inputs['images'] = images
525
+ return _inputs
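
Taken together, generate() only needs what HulumedProcessor produces: input_ids, attention_mask, pixel_values, grid_sizes, merge_sizes and modals. The following is a minimal inference sketch rather than repository code — the repo id "Flare77/HuLuLLM" and the image path are placeholders, and it assumes the checkpoint's config.json wires the classes up through auto_map so that trust_remote_code=True can resolve them.

import torch
from transformers import AutoModelForCausalLM, AutoProcessor

repo_id = "Flare77/HuLuLLM"  # placeholder repo id
processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_id, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="auto"
)

conversation = [
    {"role": "user", "content": [
        {"type": "image", "image": {"image_path": "chest_xray.png"}},  # hypothetical local file
        {"type": "text", "text": "Describe the findings in this image."},
    ]},
]

inputs = processor(conversation=conversation, add_generation_prompt=True, return_tensors="pt")
inputs = {k: v.to(model.device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
if "pixel_values" in inputs:
    inputs["pixel_values"] = inputs["pixel_values"].to(model.dtype)  # match the vision tower's dtype

# generate() pops input_ids/attention_mask from kwargs, runs
# prepare_inputs_labels_for_multimodal, and delegates to Qwen2's generate with inputs_embeds.
output_ids = model.generate(**inputs, max_new_tokens=256, do_sample=False)
print(processor.batch_decode(output_ids)[0])

Note that prepare_inputs_labels_for_multimodal asserts batch_size == 1 when use_token_compression is set, so batched multimodal generation is not expected to work with compression enabled.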
preprocessor_config.json ADDED
@@ -0,0 +1,27 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "image_processing_hulumed.HulumedImageProcessor",
4
+ "AutoProcessor": "processing_hulumed.HulumedProcessor"
5
+ },
6
+ "do_convert_rgb": true,
7
+ "do_normalize": true,
8
+ "do_rescale": true,
9
+ "do_resize": true,
10
+ "image_mean": [
11
+ 0.5,
12
+ 0.5,
13
+ 0.5
14
+ ],
15
+ "image_processor_type": "HulumedImageProcessor",
16
+ "image_std": [
17
+ 0.5,
18
+ 0.5,
19
+ 0.5
20
+ ],
21
+ "max_tokens": 16384,
22
+ "min_tokens": 16,
23
+ "patch_size": 14,
24
+ "processor_class": "HulumedProcessor",
25
+ "resample": 3,
26
+ "rescale_factor": 0.00392156862745098
27
+ }
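
For reference, these values describe the usual rescale-then-normalize pipeline on 14x14 patches: x -> (x * rescale_factor - image_mean) / image_std per channel. A quick sanity check of the arithmetic (not part of the repository):

import numpy as np

rescale_factor, mean, std = 0.00392156862745098, 0.5, 0.5  # values from preprocessor_config.json
pixels = np.array([0.0, 128.0, 255.0])
print((pixels * rescale_factor - mean) / std)  # approx. [-1.0, 0.004, 1.0]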
processing_hulumed.py ADDED
@@ -0,0 +1,873 @@
1
+ """Processor class for HuluMed with 3D support."""
2
+
3
+ import copy
4
+ import importlib.util
5
+ import os
6
+ import os.path as osp
7
+ import warnings
8
+ from collections import defaultdict
9
+ from typing import Any, List, Union, Dict, Optional, Tuple, TypedDict
10
+
11
+ import cv2
12
+ import ffmpeg
13
+ import imageio
14
+ import json
15
+ import numpy as np
16
+ import torch
17
+ import transformers
18
+ from decord import VideoReader, cpu
19
+ from PIL import Image
20
+ from transformers.feature_extraction_utils import BatchFeature
21
+ from transformers.image_utils import ImageInput
22
+ from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
23
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
24
+
25
+ try:
26
+ import nibabel as nib
27
+ NIBABEL_AVAILABLE = True
28
+ except ImportError:
29
+ NIBABEL_AVAILABLE = False
30
+ warnings.warn("nibabel is not installed. 3D medical imaging support will be limited. Install with: pip install nibabel")
31
+
32
+ try:
33
+ from . import image_processing_hulumed
34
+ from .image_processing_hulumed import (
35
+ is_valid_image, is_valid_video,
36
+ )
37
+ except ModuleNotFoundError:
38
+ spec = importlib.util.spec_from_file_location(
39
+ "image_processing_hulumed",
40
+ osp.join(osp.dirname(__file__), "image_processing_hulumed.py"),
41
+ )
42
+ image_processing_hulumed = importlib.util.module_from_spec(spec)
43
+ spec.loader.exec_module(image_processing_hulumed)
44
+ is_valid_image = getattr(image_processing_hulumed, "is_valid_image")
45
+ is_valid_video = getattr(image_processing_hulumed, "is_valid_video")
46
+
47
+ DEFAULT_IMAGE_TOKEN = "<image>"
48
+ IGNORE_INDEX = -100
49
+
50
+ Conversation = List[Dict[str, Any]]
51
+ SingleImage = Union[Image.Image, np.ndarray, torch.Tensor]
52
+ SingleVideo = Union[List[SingleImage], np.ndarray, torch.Tensor]
53
+ BatchedImage = List[Union[SingleImage, SingleVideo]]
54
+ BatchedNamedImage = List[Tuple[str, Union[SingleImage, SingleVideo]]]
55
+
56
+
57
+ def _custom_import(class_name: str):
58
+ try:
59
+ attribute_class = getattr(transformers, class_name)
60
+ except AttributeError:
61
+ attribute_class = getattr(image_processing_hulumed, class_name)
62
+ return attribute_class
63
+
64
+
65
+ def is_named_image(image) -> bool:
66
+ return isinstance(image, (list, tuple)) and \
67
+ len(image) == 2 and \
68
+ isinstance(image[0], str) and \
69
+ image[0] in ["image", "video", "3d"] and \
70
+ (is_valid_image(image[1]) or is_valid_video(image[1]))
71
+
72
+
73
+ def make_batched_images(images) -> Tuple[List[str], List[ImageInput]]:
74
+ if isinstance(images, (list, tuple)) and all(is_named_image(image) for image in images):
75
+ modals = [image[0] if image[0] != "3d" else "video" for image in images]
76
+ data = [image[1] for image in images]
77
+ return modals, data
78
+ elif isinstance(images, (list, tuple)) and all(is_valid_image(image) or is_valid_video(image) for image in images):
79
+ batch = []
80
+ for image in images:
81
+ if is_valid_video(image):
82
+ batch.append(("video", image))
83
+ elif is_valid_image(image):
84
+ batch.append(("image", image))
85
+ else:
86
+ raise ValueError(f"Could not make batched images from {images}")
87
+ return [x[0] for x in batch], [x[1] for x in batch]
88
+ elif is_named_image(images):
89
+ modal = images[0] if images[0] != "3d" else "video"
90
+ return [modal], [images[1]]
91
+ elif is_valid_video(images):
92
+ return ["video"], [images]
93
+ elif is_valid_image(images):
94
+ return ["image"], [images]
95
+
96
+ raise ValueError(f"Could not make batched images from {images}")
97
+
98
+
99
+ def frame_sample(duration, mode='uniform', num_frames=None, vid_fps=None, fps=None):
100
+ if mode == 'uniform':
101
+ assert num_frames is not None, "Number of frames must be provided for uniform sampling."
102
+ if duration <= num_frames:
103
+ return np.arange(duration).astype(int)
104
+ return np.linspace(0, duration-1, num_frames, dtype=int)
105
+ elif mode == 'fps':
106
+ assert vid_fps is not None, "FPS must be provided for FPS sampling."
107
+ assert fps is not None, "FPS must be provided for FPS sampling."
108
+ segment_len = min(vid_fps // fps, duration)
109
+ return np.arange(segment_len // 2, duration, segment_len, dtype=int)
110
+ else:
111
+ raise ValueError(f'Unsupported frame sampling mode: {mode}')
112
+
113
+
114
+ def load_video_from_ids(video_path, s=None, e=None, fps=None, max_frames=128, temporal_factor=1):
115
+ if s is not None and e is not None:
116
+ s = s if s >= 0. else 0.
117
+ e = e if e >= 0. else 0.
118
+ if s > e:
119
+ s, e = e, s
120
+ elif s == e:
121
+ e = s + 1
122
+
123
+ if os.path.isdir(video_path):
124
+ frame_files = sorted(os.listdir(video_path))
125
+ vid_fps = 3
126
+ num_frames_of_video = len(frame_files)
127
+ elif video_path.endswith('.gif'):
128
+ gif_reader = imageio.get_reader(video_path)
129
+ vid_fps = 25
130
+ num_frames_of_video = len(gif_reader)
131
+ else:
132
+ vreader = VideoReader(video_path, ctx=cpu(0), num_threads=2)
133
+ vid_fps = vreader.get_avg_fps()
134
+ num_frames_of_video = len(vreader)
135
+
136
+ f_start = 0 if s is None else max(int(s * vid_fps) - 1, 0)
137
+ f_end = num_frames_of_video - 1 if e is None else min(int(e * vid_fps) - 1, num_frames_of_video - 1)
138
+ frame_indices = list(range(f_start, f_end + 1))
139
+
140
+ duration = len(frame_indices)
141
+ if fps is not None and duration / vid_fps < max_frames:
142
+ sampled_frame_indices = [frame_indices[i] for i in frame_sample(duration, mode='fps', vid_fps=vid_fps, fps=fps)]
143
+ else:
144
+ sampled_frame_indices = [frame_indices[i] for i in frame_sample(duration, mode='uniform', num_frames=max_frames)]
145
+
146
+ if os.path.isdir(video_path):
147
+ frames = np.array([cv2.cvtColor(cv2.imread(os.path.join(video_path, frame_files[frame_idx])), cv2.COLOR_BGR2RGB) for frame_idx in sampled_frame_indices])
148
+ elif video_path.endswith('.gif'):
149
+ frames = np.array([cv2.cvtColor(frame, cv2.COLOR_RGBA2RGB) for idx, frame in enumerate(gif_reader) if idx in sampled_frame_indices])
150
+ else:
151
+ frames = vreader.get_batch(sampled_frame_indices).asnumpy()
152
+
153
+ frames = frames.transpose(0, 3, 1, 2)
154
+ timestamps = [x / vid_fps for x in sampled_frame_indices]
155
+
156
+ if temporal_factor > 1:
157
+ pad_length = temporal_factor - len(frames) % temporal_factor
158
+ frames = np.concatenate([frames, frames[-1:].repeat(pad_length, axis=0)])
159
+ for _ in range(pad_length): timestamps.append(timestamps[-1] + 1 / fps)
160
+
161
+ frames = [frame for frame in frames]
162
+
163
+ return frames, timestamps
164
+
165
+
166
+ class ChatTemplateKwargs(TypedDict, total=False):
167
+ chat_template: Optional[str]
168
+ add_system_prompt: Optional[bool]
169
+ add_generation_prompt: Optional[bool]
170
+
171
+
172
+ class HulumedProcessorKwargs(ProcessingKwargs, ChatTemplateKwargs, total=False):
173
+ chat_template_kwargs: ChatTemplateKwargs = {
174
+ **ChatTemplateKwargs.__annotations__,
175
+ }
176
+
177
+ _defaults = {
178
+ "text_kwargs": {
179
+ "padding": False,
180
+ },
181
+ "images_kwargs": {
182
+
183
+ },
184
+ "chat_template_kwargs": {
185
+ "chat_template": None,
186
+ "add_system_prompt": False,
187
+ "add_generation_prompt": False,
188
+ },
189
+ }
190
+
191
+
192
+ class HulumedProcessor(ProcessorMixin):
193
+ attributes = ["image_processor", "tokenizer"]
194
+ image_processor_class = "HulumedImageProcessor"
195
+ tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
196
+ valid_kwargs = ["chat_template", "image_merge_size", "video_merge_size", "fps", "max_frames"]
197
+
198
+ def __init__(
199
+ self,
200
+ image_processor=None,
201
+ tokenizer=None,
202
+ chat_template: str = None,
203
+ image_merge_size: int = 1,
204
+ video_merge_size: int = 2,
205
+ fps: Optional[int] = 1,
206
+ max_frames: Optional[int] = 128,
207
+ ):
208
+ self.image_processor = image_processor
209
+ self.tokenizer = tokenizer
210
+ if chat_template is None:
211
+ chat_template = self.tokenizer.chat_template
212
+ self.chat_template = chat_template
213
+
214
+ self.image_merge_size = image_merge_size
215
+ self.video_merge_size = video_merge_size
216
+ self.fps = fps
217
+ self.max_frames = max_frames
218
+
219
+ self.generation_prompt = self._infer_generation_prompt()
220
+ self.generation_prompt_ids = self.tokenizer.encode(self.generation_prompt, return_tensors="pt")
221
+ self.generation_prompt_length = len(self.generation_prompt_ids[0])
222
+ self.image_token_id = self.tokenizer.convert_tokens_to_ids(DEFAULT_IMAGE_TOKEN)
223
+ self.eos_token_id = self.tokenizer.eos_token_id
224
+
225
+ @classmethod
226
+ def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
227
+ args = []
228
+ for attribute_name in cls.attributes:
229
+ class_name = getattr(cls, f"{attribute_name}_class")
230
+ if isinstance(class_name, tuple):
231
+ classes = tuple(_custom_import(n) if n is not None else None for n in class_name)
232
+ use_fast = kwargs.get("use_fast", True)
233
+ if use_fast and classes[1] is not None:
234
+ attribute_class = classes[1]
235
+ else:
236
+ attribute_class = classes[0]
237
+ else:
238
+ attribute_class = _custom_import(class_name)
239
+
240
+ args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs))
241
+ return args
242
+
243
+ def get_generation_prompt(self):
244
+ return self.generation_prompt
245
+
246
+ def get_generation_prompt_ids(self):
247
+ return self.generation_prompt_ids
248
+
249
+ def _infer_generation_prompt(self):
250
+ pseudo_message = [{"role": "user", "content": ""}]
251
+ instruction = self.apply_chat_template(pseudo_message, tokenize=False, add_generation_prompt=True)
252
+ conversation = self.apply_chat_template(pseudo_message, tokenize=False, add_generation_prompt=False)
253
+ return instruction.replace(conversation, "")
254
+
255
+ def _get_downsampled_grid_sizes(self, image_inputs: Dict[str, Any]):
256
+ grid_sizes = []
257
+ for grid_size, merge_size in zip(image_inputs.get("grid_sizes", []), image_inputs.get("merge_sizes", [])):
258
+ if not torch.all(grid_size[1:] % merge_size == 0):
259
+ warnings.warn(f"Grid size {grid_size} is not divisible by merge size {merge_size}. This may cause unexpected errors.")
260
+ if grid_size[0] == 1:
261
+ grid_sizes.append(grid_size[1:] / merge_size)
262
+ elif grid_size[0] > 1:
263
+ grid_sizes.extend([grid_size[1:] / merge_size] * grid_size[0])
264
+ return grid_sizes
265
+
266
+ def _get_visual_seq_len(self, grid_size: torch.Tensor):
267
+ num_tokens = int(grid_size.prod().item())
268
+ return num_tokens
269
+
270
+ def load_images(self, image_path: Union[str, List[str], Image.Image, List[Image.Image]]):
271
+ if isinstance(image_path, str) and os.path.isfile(image_path):
272
+ images = [Image.open(image_path).convert('RGB')]
273
+ elif isinstance(image_path, str) and os.path.isdir(image_path):
274
+ images = [Image.open(os.path.join(image_path, f)).convert('RGB') for f in sorted(os.listdir(image_path))]
275
+ elif isinstance(image_path, list) and isinstance(image_path[0], str):
276
+ images = [Image.open(f).convert('RGB') for f in image_path]
277
+ elif isinstance(image_path, list) and isinstance(image_path[0], Image.Image):
278
+ images = [np.array(x) for x in image_path]
279
+ elif isinstance(image_path, Image.Image):
280
+ images = [np.array(image_path)]
281
+ else:
282
+ raise ValueError(f"Unsupported image path type: {type(image_path)}")
283
+ return images
284
+
285
+ def load_nii(
286
+ self,
287
+ nii_path: str,
288
+ num_slices: Optional[int] = None,
289
+ axis: int = 2,
290
+ window_center: Optional[float] = None,
291
+ window_width: Optional[float] = None,
292
+ normalize: bool = True,
293
+ ):
294
+ if not NIBABEL_AVAILABLE:
295
+ raise ImportError("nibabel is required for NIfTI support. Install with: pip install nibabel")
296
+
297
+ if not os.path.exists(nii_path):
298
+ raise FileNotFoundError(f"NIfTI file not found: {nii_path}")
299
+
300
+ nii_img = nib.load(nii_path)
301
+ volume = nii_img.get_fdata()
302
+
303
+ if axis == 0:
304
+ slices = [volume[i, :, :] for i in range(volume.shape[0])]
305
+ elif axis == 1:
306
+ slices = [volume[:, i, :] for i in range(volume.shape[1])]
307
+ elif axis == 2:
308
+ slices = [volume[:, :, i] for i in range(volume.shape[2])]
309
+ else:
310
+ raise ValueError(f"Invalid axis: {axis}. Must be 0, 1, or 2.")
311
+
312
+ if num_slices is not None and num_slices < len(slices):
313
+ indices = np.linspace(0, len(slices) - 1, num_slices, dtype=int)
314
+ slices = [slices[i] for i in indices]
315
+
316
+ processed_slices = []
317
+ for slice_2d in slices:
318
+ if window_center is not None and window_width is not None:
319
+ lower = window_center - window_width / 2
320
+ upper = window_center + window_width / 2
321
+ slice_2d = np.clip(slice_2d, lower, upper)
322
+
323
+ if normalize:
324
+ slice_min = slice_2d.min()
325
+ slice_max = slice_2d.max()
326
+ if slice_max > slice_min:
327
+ slice_2d = (slice_2d - slice_min) / (slice_max - slice_min) * 255.0
328
+ else:
329
+ slice_2d = np.zeros_like(slice_2d)
330
+
331
+ slice_2d = slice_2d.astype(np.uint8)
332
+ slice_rgb = np.stack([slice_2d] * 3, axis=0)
333
+
334
+ processed_slices.append(slice_rgb)
335
+
336
+ return processed_slices
337
+
338
+ def load_video(
339
+ self,
340
+ video_path: str,
341
+ start_time: Optional[float] = None,
342
+ end_time: Optional[float] = None,
343
+ fps: Optional[float] = None,
344
+ max_frames: Optional[float] = None,
345
+ size: Optional[int] = None,
346
+ size_divisible: int = 1,
347
+ precise_time: bool = False,
348
+ verbose: bool = False,
349
+ temporal_factor: int = 1
350
+ ):
351
+ fps = self.fps if fps is None else fps
352
+ max_frames = self.max_frames if max_frames is None else max_frames
353
+
354
+ if start_time is not None and end_time is not None and end_time - start_time < 1:
355
+ return load_video_from_ids(video_path, start_time, end_time, fps=fps, max_frames=max_frames)
356
+ if os.path.isdir(video_path):
357
+ return load_video_from_ids(video_path, start_time, end_time, fps=fps, max_frames=max_frames)
358
+ if video_path.endswith('.gif'):
359
+ return load_video_from_ids(video_path, start_time, end_time, fps=fps, max_frames=max_frames)
360
+
361
+ probe = ffmpeg.probe(video_path)
362
+ duration = float(probe['format']['duration'])
363
+ video_stream = next((stream for stream in probe['streams'] if stream['codec_type'] == 'video'), None)
364
+ w, h = int(video_stream['width']), int(video_stream['height'])
365
+
366
+ kwargs, input_kwargs, output_kwargs = {}, {}, {}
367
+ do_trim = start_time is not None or end_time is not None
368
+ if start_time is not None:
369
+ new_start_time = max(float(video_stream['start_time']), start_time)
370
+ duration -= new_start_time - start_time
371
+ start_time = new_start_time
372
+ else:
373
+ start_time = float(video_stream['start_time'])
374
+ if end_time is not None:
375
+ duration = min(duration, end_time - start_time)
376
+ if do_trim:
377
+ kwargs = {'ss': start_time, 't': duration}
378
+ if precise_time:
379
+ output_kwargs.update(kwargs)
380
+ else:
381
+ input_kwargs.update(kwargs)
382
+
383
+ if size is not None:
384
+ scale_factor = size / min(w, h)
385
+ new_w, new_h = round(w * scale_factor), round(h * scale_factor)
386
+ else:
387
+ new_w, new_h = w, h
388
+ new_w = new_w // size_divisible * size_divisible
389
+ new_h = new_h // size_divisible * size_divisible
390
+
391
+ stream = ffmpeg.input(video_path, **input_kwargs)
392
+ if fps is not None:
393
+ stream = ffmpeg.filter(stream, "fps", fps=fps, round="down")
394
+ if new_w != w or new_h != h:
395
+ stream = ffmpeg.filter(stream, 'scale', new_w, new_h)
396
+ stream = ffmpeg.output(stream, "pipe:", format="rawvideo", pix_fmt="rgb24", **output_kwargs)
397
+ out, _ = ffmpeg.run(stream, capture_stdout=True, quiet=not verbose)
398
+
399
+ frames = np.frombuffer(out, np.uint8).reshape([-1, new_h, new_w, 3]).transpose([0, 3, 1, 2])
400
+
401
+ if fps is not None:
402
+ timestamps = np.arange(start_time, start_time + duration + 1 / fps, 1 / fps)[:len(frames)]
403
+ else:
404
+ timestamps = np.linspace(start_time, start_time + duration, len(frames))
405
+
406
+ if max_frames is not None and len(frames) > max_frames:
407
+ indices = np.linspace(0, len(frames) - 1, max_frames, dtype=int)
408
+ frames = frames[indices]
409
+ timestamps = timestamps[indices]
410
+
411
+ if temporal_factor > 1:
412
+ pad_length = temporal_factor - len(frames) % temporal_factor
413
+ frames = np.concatenate([frames, frames[-1:].repeat(pad_length, axis=0)])
414
+ timestamps = np.concatenate([timestamps, timestamps[-1:].repeat(pad_length) + np.arange(1, pad_length + 1) / fps])
415
+
416
+ frames = [frame for frame in frames]
417
+ timestamps = [timestamp for timestamp in timestamps]
418
+
419
+ return frames, timestamps
420
+
421
+ def _load_multimodal_data(self, conversation: Conversation):
422
+ multimodal_info = defaultdict(list)
423
+ new_conversation = []
424
+ for message in conversation:
425
+ new_message = {"role": message["role"]}
426
+ if not isinstance(message["content"], (list, tuple)):
427
+ new_message["content"] = message["content"]
428
+ new_conversation.append(new_message)
429
+ continue
430
+
431
+ new_contents = []
432
+ for content in message["content"]:
433
+ if not isinstance(content, dict):
434
+ new_contents.append(content)
435
+ continue
436
+ assert "type" in content, "Content must have 'type' field."
437
+
438
+ if content["type"] in ["image", "video", "3d"] and content["type"] in content and isinstance(content[content["type"]], dict):
439
+ load_args = content[content["type"]]
440
+ data_id = json.dumps({k: v for k, v in load_args.items() if k not in ["start_time", "end_time"]})
441
+ new_content = copy.deepcopy(content)
442
+ multimodal_info[data_id].append(new_content)
443
+ new_contents.append(new_content)
444
+ else:
445
+ new_contents.append(content)
446
+
447
+ new_message["content"] = new_contents
448
+ new_conversation.append(new_message)
449
+
450
+ for data_id, contents in multimodal_info.items():
451
+ data_type = contents[0]["type"]
452
+
453
+ if data_type == "image":
454
+ image = self.load_images(contents[0][data_type]["image_path"])[0]
455
+ for content in contents:
456
+ content["image"] = [image.copy()]
457
+
458
+ elif data_type == "3d":
459
+ load_args = contents[0]["3d"]
460
+ nii_path = load_args["image_path"]
461
+ num_slices = load_args.get("nii_num_slices", None)
462
+ axis = load_args.get("nii_axis", 2)
463
+ window_center = load_args.get("window_center", None)
464
+ window_width = load_args.get("window_width", None)
465
+
466
+ slices = self.load_nii(
467
+ nii_path=nii_path,
468
+ num_slices=num_slices,
469
+ axis=axis,
470
+ window_center=window_center,
471
+ window_width=window_width,
472
+ )
473
+
474
+ for content in contents:
475
+ content["type"] = "video"
476
+ content["video"] = slices
477
+ content["num_frames"] = len(slices)
478
+ content.pop("3d", None)
479
+
480
+ elif data_type == "video":
481
+ start_times = [content["video"].get("start_time", 0.) for content in contents]
482
+ end_times = [content["video"].get("end_time", float("inf")) for content in contents]
483
+
484
+ load_args = contents[0][data_type]
485
+ start_time, end_time = min(start_times), max(end_times)
486
+ if start_time > 0:
487
+ load_args["start_time"] = start_time
488
+ if end_time < float("inf"):
489
+ load_args["end_time"] = end_time
490
+ images, timestamps = self.load_video(**load_args)
491
+
492
+ for content, start_time, end_time in zip(contents, start_times, end_times):
493
+ cur_images, cur_timestamps = [], []
494
+ for image, timestamp in zip(images, timestamps):
495
+ if start_time <= timestamp <= end_time:
496
+ cur_images.append(image.copy())
497
+ cur_timestamps.append(timestamp)
498
+
499
+ content[data_type] = cur_images
500
+ content["num_frames"] = len(cur_images)
501
+ content["timestamps"] = cur_timestamps
502
+
503
+ return new_conversation
504
+
505
+ def _gather_multimodal_data(self, conversation: Conversation):
506
+ images = []
507
+ for message in conversation:
508
+ if not isinstance(message["content"], (list, tuple)):
509
+ continue
510
+ for content in message["content"]:
511
+ if not isinstance(content, dict):
512
+ continue
513
+ if content["type"] == "video":
514
+ video = content["video"]
515
+ assert is_valid_video(video), f"Invalid video data: {video}."
516
+ images.append(("video", video))
517
+ elif content["type"] == "image":
518
+ image = content["image"]
519
+ images.append(("image", image))
520
+ images = images if len(images) > 0 else None
521
+ return images
522
+
523
+ def _process_conversation_with_label(
524
+ self,
525
+ conversation: Conversation,
526
+ image_inputs: Dict[str, Any],
527
+ **kwargs,
528
+ ):
529
+ assert kwargs.pop("return_tensors", "pt") == "pt", "Only PyTorch tensors are supported when return_labels=True."
530
+ assert "add_generation_prompt" not in kwargs, "'add_generation_prompt' argument is not supported when return_labels=True."
531
+
532
+ output_kwargs = self._merge_kwargs(
533
+ HulumedProcessorKwargs,
534
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
535
+ **kwargs,
536
+ )
537
+ output_kwargs["chat_template_kwargs"].pop("add_generation_prompt")
538
+
539
+ grid_sizes = self._get_downsampled_grid_sizes(image_inputs)
540
+ text_inputs = {"input_ids": [], "labels": []}
541
+ sample_types_list = []
542
+ image_idx = 0
543
+
544
+ for message_idx, message in enumerate(conversation):
545
+ prompt = self.apply_chat_template(
546
+ [message],
547
+ tokenize=False,
548
+ add_generation_prompt=False,
549
+ **output_kwargs["chat_template_kwargs"],
550
+ )
551
+ prompt_chunks = prompt.split(DEFAULT_IMAGE_TOKEN)
552
+ prompt = []
553
+ for chunk_idx in range(len(prompt_chunks) - 1):
554
+ prompt.append(prompt_chunks[chunk_idx])
555
+ num_tokens = self._get_visual_seq_len(grid_sizes[image_idx])
556
+ prompt.append(DEFAULT_IMAGE_TOKEN * num_tokens)
557
+ image_idx += 1
558
+ prompt.append(prompt_chunks[-1])
559
+ prompt = "".join(prompt)
560
+
561
+ input_ids = self.tokenizer.encode(prompt, return_tensors="pt", **output_kwargs["text_kwargs"])[0]
562
+ text_inputs["input_ids"].append(input_ids)
563
+
564
+ targets = torch.full_like(input_ids, IGNORE_INDEX)
565
+ sample_types = torch.full_like(input_ids, IGNORE_INDEX)
566
+ if message["role"] == "assistant":
567
+ targets[self.generation_prompt_length:-1] = input_ids[self.generation_prompt_length:-1].clone()
568
+ elif message["role"] == "stream":
569
+ diff = torch.diff((input_ids == self.image_token_id).float())
570
+ image_end_indices = torch.nonzero(diff < 0)[:, 0]
571
+ targets[image_end_indices + 1] = input_ids[image_end_indices + 1]
572
+ sample_types = targets.clone()
573
+ sample_types[torch.logical_and(sample_types > 0, sample_types != self.eos_token_id)] = 0
574
+ targets[-2] = input_ids[-2]
575
+
576
+ if message_idx > 0 and conversation[message_idx - 1]["role"] == "stream":
577
+ targets[0] = input_ids[0]
578
+ sample_types[0] = input_ids[0]
579
+
580
+ text_inputs["labels"].append(targets)
581
+ sample_types_list.append(sample_types)
582
+
583
+ text_inputs = {k: torch.cat(v) for k, v in text_inputs.items()}
584
+ sample_types = torch.cat(sample_types_list)
585
+ types, counts = torch.unique(sample_types[sample_types > -1], return_counts=True)
586
+
587
+ if len(types) > 0:
588
+ target_num_samples = counts.amin()
589
+ for type_id, type_count in zip(types, counts):
590
+ if type_count > target_num_samples:
591
+ indices = torch.nonzero(sample_types == type_id)[:, 0]
592
+ random_selector = torch.randperm(indices.size(0))[:-target_num_samples]
593
+ text_inputs["labels"][indices[random_selector]] = IGNORE_INDEX
594
+
595
+ assert len(grid_sizes) == image_idx, "Number of images does not match the number of image tokens in the text."
596
+
597
+ return text_inputs
598
+
599
+ def _process_conversation_without_label(
600
+ self,
601
+ conversation: Conversation,
602
+ image_inputs: Dict[str, Any],
603
+ **kwargs,
604
+ ):
605
+ output_kwargs = self._merge_kwargs(
606
+ HulumedProcessorKwargs,
607
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
608
+ **kwargs,
609
+ )
610
+ prompt = self.apply_chat_template(
611
+ conversation,
612
+ tokenize=False,
613
+ **output_kwargs["chat_template_kwargs"],
614
+ )
615
+ return self.process_text(prompt, image_inputs, **output_kwargs["text_kwargs"])
616
+
617
+ def _process_conversation(
618
+ self,
619
+ conversation: Conversation,
620
+ images: Optional[Union[BatchedImage, BatchedNamedImage]] = None,
621
+ return_labels: bool = False,
622
+ **kwargs: Unpack[HulumedProcessorKwargs],
623
+ ) -> BatchFeature:
624
+ assert isinstance(conversation, list), "Conversation must be a list of messages."
625
+
626
+ if images is None:
627
+ conversation = self._load_multimodal_data(conversation)
628
+ images = self._gather_multimodal_data(conversation)
629
+
630
+ if not images:
631
+ images = None
632
+ elif isinstance(images, (list, tuple)):
633
+ images = [img for img in images if img and (not isinstance(img, (list, tuple)) or len(img) > 0)]
634
+ if not images:
635
+ images = None
636
+ output_kwargs = self._merge_kwargs(
637
+ HulumedProcessorKwargs,
638
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
639
+ **kwargs,
640
+ )
641
+
642
+ if images is not None:
643
+ if "merge_size" not in output_kwargs["images_kwargs"]:
644
+ has_video_or_3d = any(
645
+ content.get("type") in ["video", "3d"] or "video" in content or "3d" in content
646
+ for message in conversation
647
+ if isinstance(message.get("content"), list)
648
+ for content in message["content"]
649
+ if isinstance(content, dict)
650
+ )
651
+
652
+ output_kwargs["images_kwargs"]["merge_size"] = 2 if has_video_or_3d else 1
653
+
654
+ image_inputs = self.process_images(images, **output_kwargs["images_kwargs"])
655
+ else:
656
+ image_inputs = {}
657
+
658
+ if return_labels:
659
+ text_inputs = self._process_conversation_with_label(conversation, image_inputs, **kwargs)
660
+ else:
661
+ text_inputs = self._process_conversation_without_label(conversation, image_inputs, **kwargs)
662
+
663
+ return BatchFeature(data={**text_inputs, **image_inputs})
664
+
665
+ def _process_plain(
666
+ self,
667
+ text: Union[TextInput, PreTokenizedInput] = None,
668
+ images: Optional[Union[BatchedImage, BatchedNamedImage]] = None,
669
+ return_labels: bool = False,
670
+ **kwargs: Unpack[HulumedProcessorKwargs],
671
+ ) -> BatchFeature:
672
+ if text is None:
673
+ raise ValueError("You must provide 'text' or 'conversation'.")
674
+ if return_labels:
675
+ raise ValueError("return_labels is not supported for plain text processing.")
676
+
677
+ output_kwargs = self._merge_kwargs(
678
+ HulumedProcessorKwargs,
679
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
680
+ **kwargs,
681
+ )
682
+
683
+ if images is not None:
684
+ image_inputs = self.process_images(images, **output_kwargs["images_kwargs"])
685
+ else:
686
+ image_inputs = {}
687
+
688
+ text_inputs = self.process_text(text, image_inputs, **output_kwargs["text_kwargs"])
689
+
690
+ return BatchFeature(data={**text_inputs, **image_inputs})
691
+
692
+ def process_images(self, images: Union[BatchedImage, BatchedNamedImage], **kwargs):
693
+ modals, images = make_batched_images(images)
694
+
695
+ if "merge_size" not in kwargs:
696
+ kwargs["merge_size"] = [
697
+ self.video_merge_size if modal == "video" else self.image_merge_size
698
+ for modal in modals
699
+ ]
700
+
701
+ image_inputs = self.image_processor(images=images, **kwargs)
702
+ image_inputs["modals"] = modals
703
+ return image_inputs
704
+
705
+ def process_text(
706
+ self,
707
+ text: TextInput,
708
+ image_inputs: Dict[str, Any],
709
+ **kwargs,
710
+ ):
711
+ grid_sizes = self._get_downsampled_grid_sizes(image_inputs)
712
+
713
+ kwargs.pop("padding", None)
714
+ kwargs.pop("padding_side", None)
715
+
716
+ if len(grid_sizes) > 0:
717
+ image_idx = 0
718
+ while DEFAULT_IMAGE_TOKEN in text:
719
+ num_tokens = self._get_visual_seq_len(grid_sizes[image_idx])
720
+ text = text.replace(DEFAULT_IMAGE_TOKEN, "<placeholder>" * num_tokens, 1)
721
+ image_idx += 1
722
+ text = text.replace("<placeholder>", DEFAULT_IMAGE_TOKEN)
723
+
724
+ assert len(grid_sizes) == image_idx, "Number of images does not match the number of image tokens in the text."
725
+
726
+ text_inputs = self.tokenizer(text, **kwargs)
727
+ return text_inputs
728
+
729
+ def __call__(
730
+ self,
731
+ text: Optional[TextInput] = None,
732
+ conversation: Optional[Conversation] = None,
733
+ images: Optional[Union[BatchedImage, BatchedNamedImage]] = None,
734
+ return_labels: bool = False,
735
+ **kwargs: Unpack[HulumedProcessorKwargs],
736
+ ) -> BatchFeature:
737
+ if conversation is not None:
738
+ if text is not None:
739
+ raise ValueError("You cannot provide both 'conversation' and 'text'.")
740
+ return self._process_conversation(conversation, images, return_labels, **kwargs)
741
+ return self._process_plain(text, images, return_labels, **kwargs)
742
+
743
+ def batch_decode(self, *args, skip_special_tokens=True, use_think=False, **kwargs):
744
+ outputs = self.tokenizer.batch_decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)
745
+
746
+ if not use_think:
747
+ outputs = [self._remove_think_tags(output) for output in outputs]
748
+
749
+ return outputs
750
+
751
+ def decode(self, *args, skip_special_tokens=True, use_think=False, **kwargs):
752
+ output = self.tokenizer.decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)
753
+
754
+ if not use_think:
755
+ output = self._remove_think_tags(output)
756
+
757
+ return output
758
+
759
+ def _remove_think_tags(self, text: str) -> str:
760
+ import re
761
+ pattern = r'<think>.*?</think>'
762
+ cleaned = re.sub(pattern, '', text, flags=re.DOTALL)
763
+ cleaned = re.sub(r'\n\s*\n', '\n\n', cleaned)
764
+ cleaned = cleaned.strip()
765
+ return cleaned
766
+
767
+ def apply_chat_template(
768
+ self,
769
+ conversation: Conversation,
770
+ chat_template: Optional[str] = None,
771
+ tokenize: bool = False,
772
+ add_system_prompt: bool = False,
773
+ add_generation_prompt: bool = False,
774
+ image_token: Optional[str] = DEFAULT_IMAGE_TOKEN,
775
+ **kwargs,
776
+ ) -> str:
777
+ if chat_template is None:
778
+ if self.chat_template is not None:
779
+ chat_template = self.chat_template
780
+ else:
781
+ raise ValueError(
782
+ "No chat template is set for this processor. Please either set the `chat_template` attribute, "
783
+ "or provide a chat template as an argument."
784
+ )
785
+ return self.tokenizer.apply_chat_template(
786
+ conversation,
787
+ chat_template=chat_template,
788
+ tokenize=tokenize,
789
+ add_system_prompt=add_system_prompt,
790
+ add_generation_prompt=add_generation_prompt,
791
+ image_token=image_token,
792
+ **kwargs
793
+ )
794
+
795
+ @property
796
+ def model_input_names(self):
797
+ tokenizer_input_names = self.tokenizer.model_input_names
798
+ image_processor_input_names = self.image_processor.model_input_names
799
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + ["modals"]
800
+
801
+ def _merge_kwargs(
802
+ self,
803
+ ModelProcessorKwargs: ProcessingKwargs,
804
+ tokenizer_init_kwargs: Optional[Dict] = None,
805
+ **kwargs,
806
+ ) -> Dict[str, Dict]:
807
+ output_kwargs = {
808
+ "text_kwargs": {},
809
+ "images_kwargs": {},
810
+ "audio_kwargs": {},
811
+ "videos_kwargs": {},
812
+ "chat_template_kwargs": {},
813
+ "common_kwargs": {},
814
+ }
815
+
816
+ default_kwargs = {
817
+ "text_kwargs": {},
818
+ "images_kwargs": {},
819
+ "audio_kwargs": {},
820
+ "videos_kwargs": {},
821
+ "chat_template_kwargs": {},
822
+ "common_kwargs": {},
823
+ }
824
+
825
+ used_keys = set()
826
+
827
+ for modality in default_kwargs:
828
+ default_kwargs[modality] = ModelProcessorKwargs._defaults.get(modality, {}).copy()
829
+ for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__.keys():
830
+ if modality_key in tokenizer_init_kwargs:
831
+ value = (
832
+ getattr(self.tokenizer, modality_key)
833
+ if hasattr(self.tokenizer, modality_key)
834
+ else tokenizer_init_kwargs[modality_key]
835
+ )
836
+ default_kwargs[modality][modality_key] = value
837
+
838
+ output_kwargs.update(default_kwargs)
839
+
840
+ non_modality_kwargs = set(kwargs) - set(output_kwargs)
841
+ for modality in output_kwargs:
842
+ for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__.keys():
843
+ if modality in kwargs:
844
+ kwarg_value = kwargs[modality].pop(modality_key, "__empty__")
845
+ if kwarg_value != "__empty__" and modality_key in non_modality_kwargs:
846
+ raise ValueError(
847
+ f"Keyword argument {modality_key} was passed twice: "
848
+ f"in a dictionary for {modality} and as a **kwarg."
849
+ )
850
+ elif modality_key in kwargs:
851
+ kwarg_value = kwargs.get(modality_key, "__empty__")
852
+ else:
853
+ kwarg_value = "__empty__"
854
+ if kwarg_value != "__empty__":
855
+ output_kwargs[modality][modality_key] = kwarg_value
856
+ used_keys.add(modality_key)
857
+
858
+ if any(key in default_kwargs for key in kwargs):
859
+ for modality, subdict in kwargs.items():
860
+ if modality in default_kwargs:
861
+ for subkey, subvalue in subdict.items():
862
+ if subkey not in used_keys:
863
+ output_kwargs[modality][subkey] = subvalue
864
+ used_keys.add(subkey)
865
+ else:
866
+ for key in kwargs:
867
+ if key not in used_keys:
868
+ output_kwargs["common_kwargs"][key] = kwargs[key]
869
+
870
+ for modality in output_kwargs:
871
+ output_kwargs[modality].update(output_kwargs["common_kwargs"])
872
+
873
+ return output_kwargs
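
Because _load_multimodal_data converts a "3d" entry into a "video" built from windowed, uniformly sampled slices, a NIfTI volume can be passed directly inside the conversation. A sketch with the processor loaded as in the earlier example; the file path and windowing values below are illustrative, not from the repository:

conversation = [
    {"role": "user", "content": [
        {"type": "3d", "3d": {
            "image_path": "ct_chest.nii.gz",   # hypothetical local NIfTI file
            "nii_num_slices": 32,              # uniformly sampled along nii_axis
            "nii_axis": 2,                     # axial slices
            "window_center": 40.0,             # illustrative soft-tissue window
            "window_width": 400.0,
        }},
        {"type": "text", "text": "Is there evidence of pleural effusion?"},
    ]},
]
inputs = processor(conversation=conversation, add_generation_prompt=True, return_tensors="pt")
# The slices are treated as video frames, so the video merge size (2) and the
# per-frame <image> expansion in the chat template apply to the volume.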
processor_config.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_hulumed.HulumedProcessor"
4
+ },
5
+ "fps": 1,
6
+ "image_merge_size": 1,
7
+ "max_frames": 128,
8
+ "processor_class": "HulumedProcessor",
9
+ "video_merge_size": 2
10
+ }
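
The fps and max_frames defaults interact as in load_video: frames are first sampled at fps, then uniformly capped at max_frames. Illustrative arithmetic, not repository code:

fps, max_frames = 1, 128           # defaults from processor_config.json
video_seconds = 600                # e.g. a 10-minute video
sampled = int(video_seconds * fps) # 600 frames at 1 fps
kept = min(sampled, max_frames)    # uniformly subsampled down to 128 frames
print(sampled, kept)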
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
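
A quick way to confirm that these special tokens resolve to the ids declared in tokenizer_config.json (a sketch; assumes the processor loaded earlier):

tok = processor.tokenizer
print(tok.convert_tokens_to_ids("<|im_end|>"))     # 151645, the eos token
print(tok.convert_tokens_to_ids("<|endoftext|>"))  # 151643, the pad token
print(tok.convert_tokens_to_ids("<image>"))        # 151665, the image placeholder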
tokenizer_config.json ADDED
@@ -0,0 +1,233 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<image>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "151666": {
190
+ "content": "<|stream_start|>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "151667": {
198
+ "content": "<|stream_end|>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": true
204
+ }
205
+ },
206
+ "additional_special_tokens": [
207
+ "<|im_start|>",
208
+ "<|im_end|>",
209
+ "<|object_ref_start|>",
210
+ "<|object_ref_end|>",
211
+ "<|box_start|>",
212
+ "<|box_end|>",
213
+ "<|quad_start|>",
214
+ "<|quad_end|>",
215
+ "<|vision_start|>",
216
+ "<|vision_end|>",
217
+ "<|vision_pad|>",
218
+ "<|image_pad|>",
219
+ "<|video_pad|>"
220
+ ],
221
+ "bos_token": null,
222
+ "chat_template": "\n{%- set identifier = 'im' %}\n{% for message in messages %}\n {% if message['role'] == 'stream' %}\n {% set identifier = 'stream' %}\n {% else %}\n {% set identifier = 'im' %}\n {% endif %}\n {{- '<|' + identifier + '_start|>' + message['role'] + '\n' -}}\n {% if message['content'] is string %}\n {{- message['content'] + '<|' + identifier + '_end|>\n' -}}\n {% else %}\n {% for content in message['content'] %}\n {% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}\n {% if 'time' in content %}\n {{- 'Time ' + content['time'] | round(1) | string + 's: ' -}}\n {% endif %}\n\n {{- '<image>\n' -}}\n\n {% elif content['type'] == 'video' or 'video' in content or 'video_url' in content %}\n {% for i in range(content['num_frames']) %}\n {% if 'timestamps' in content %}\n {{- 'Time ' + content['timestamps'][i] | round(1) | string + 's:' -}}\n {% endif %}\n {% if i < content['num_frames'] - 1 %}\n\n {{- '<image>,' -}}\n\n {% else %}\n\n {{- '<image>\n' -}}\n\n {% endif %}\n {% endfor %}\n {% elif content['type'] == 'text' or 'text' in content %}\n {{- content['text'] -}}\n {% endif %}\n {% endfor %}\n {{- '<|' + identifier + '_end|>\n' -}}\n {% endif %}\n{% endfor %}\n{% if add_generation_prompt %}\n {{- '<|im_start|>assistant\n' -}}\n{% endif %}\n",
223
+ "clean_up_tokenization_spaces": false,
224
+ "eos_token": "<|im_end|>",
225
+ "errors": "replace",
226
+ "extra_special_tokens": {},
227
+ "model_max_length": 16384,
228
+ "pad_token": "<|endoftext|>",
229
+ "padding_side": "right",
230
+ "split_special_tokens": false,
231
+ "tokenizer_class": "Qwen2Tokenizer",
232
+ "unk_token": null
233
+ }
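
The chat_template above is what HulumedProcessor.apply_chat_template renders before tokenization: one <image> placeholder per frame, an optional "Time ...s:" prefix when timestamps are present, and an <|im_start|>assistant header when add_generation_prompt is set. A small sketch to inspect the rendered prompt (message content is illustrative; no actual frames are needed just to render the template):

messages = [
    {"role": "user", "content": [
        {"type": "video", "num_frames": 2, "timestamps": [0.0, 1.0]},
        {"type": "text", "text": "What changes between these two frames?"},
    ]},
]
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)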
vocab.json ADDED
The diff for this file is too large to render. See raw diff