Flare77 commited on
Commit
b047851
·
verified ·
1 Parent(s): b79a0ed

Upload model HuLuLLM via Colab

Browse files
.gitattributes CHANGED
@@ -1,35 +1,9 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ model-00001-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
2
+ model-00002-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
3
+ model-00003-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
4
+ model-00004-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
5
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
6
+ model-00001-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text
7
+ model-00002-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text
8
+ model-00003-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text
9
+ model-00004-of-00003.safetensors filter=lfs diff=lfs merge=lfs -text
 
README.md ADDED
@@ -0,0 +1,570 @@
1
+ ---
2
+ license: apache-2.0
3
+ tags:
4
+ - medical
5
+ - multimodal
6
+ - vision-language-model
7
+ - image-to-text
8
+ - video-understanding
9
+ - 3d-understanding
10
+ - qwen
11
+ - pytorch
12
+ frameworks:
13
+ - pytorch
14
+ pipeline_tag: image-text-to-text
15
+ library_name: transformers
16
+ ---
17
+
18
+ <div style="display: flex; align-items: center; justify-content: center;">
19
+ <h1 style="margin: 0; text-align: left;">
20
+ Hulu-Med: A Transparent Generalist Model towards Holistic Medical Vision-Language Understanding
21
+ </h1>
22
+ </div>
23
+ <div align="center">
24
+
25
+ [![Paper](https://img.shields.io/badge/Paper-arXiv-red)](https://arxiv.org/abs/2510.08668)
26
+ [![HuggingFace](https://img.shields.io/badge/🤗%20Hugging%20Face-Models-yellow)](https://huggingface.co/ZJU-AI4H/Hulu-Med)
27
+ [![ModelScope](https://img.shields.io/badge/ModelScope-Models-blue)](https://modelscope.cn/models/Med-Team/Hulu-Med)
28
+ [![License](https://img.shields.io/badge/License-Apache%202.0-green.svg)](LICENSE)
29
+ [![GitHub](https://img.shields.io/badge/GitHub-Code-blue?logo=github)](https://github.com/ZJUI-AI4H/Hulu-Med)
30
+ ![Total Downloads](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fhuggingface.co%2Fapi%2Fmodels%2FZJU-AI4H%2FHulu-Med-7B%3Fexpand%255B%255D%3DdownloadsAllTime&query=%24.downloadsAllTime&label=Total%20Downloads&color=blue)
31
+
32
+ [📄 Paper](https://arxiv.org/abs/2510.08668) | [🤗 Hulu-Med-4B](https://huggingface.co/ZJU-AI4H/Hulu-Med-4B) | [🤗 Hulu-Med-7B](https://huggingface.co/ZJU-AI4H/Hulu-Med-7B) | [🤗 Hulu-Med-14B](https://huggingface.co/ZJU-AI4H/Hulu-Med-14B) | [🤗 Hulu-Med-32B](https://huggingface.co/ZJU-AI4H/Hulu-Med-32B) | [🔮 ModelScope Models](https://modelscope.cn/models/Med-Team/Hulu-Med) | [📊 Demo](#demo)
33
+
34
+ </div>
35
+
36
+ ## 🔥 News
37
+ - **[2025-11-27]** ⚡ **Hulu-Med** is now compatible with the latest **vLLM**, offering **faster inference** and **tensor-parallel** support! Thank you all for your patience and feedback 💪 **[See here for installation](#vllm-install)**
38
+
39
+ - **[2025-11-18]** 🎊 We released **Hulu-Med-4B**, a lightweight model with strong multimodal and text reasoning abilities that surpasses **MedGemma-4B** and **Lingshu-7B**!
40
+
41
+ - **[2025-11-01]** 📊 We are releasing our new evaluation code, **MedUniEval**! Built on MedEvalKit, MedUniEval is designed for comprehensive evaluation of medical vision-language models across modalities, including text, 2D, 3D, and video. More benchmarks are coming soon.
42
+
43
+ - **[2025-10-15]** 🎉 Hulu-Med now supports Transformers integration! HuggingFace-compatible models have been released with simplified loading and inference; vLLM integration is ongoing. *The HF models are now available in the **main branch** on Hugging Face*.
44
+ - The model has been updated in the main branch of our Hugging Face repository. You can now load it directly using `AutoModelForCausalLM.from_pretrained` - the weights will be automatically downloaded.
45
+
46
+ - **[2025-10-08]** Hulu-Med models and inference code released!
47
+
48
+ ## 📖 Overview
49
+
50
+ **Hulu-Med** is a transparent medical vision-language model that unifies understanding across diverse modalities including **medical text, 2D/3D images, and videos**. Built with a focus on transparency and accessibility, Hulu-Med achieves state-of-the-art performance on 30 medical benchmarks while being trained entirely on public data.
51
+
52
+ <div align="center">
53
+ <img src="https://cdn-uploads.huggingface.co/production/uploads/68e4dbf1beab849e9baa6e26/ckBITEJ6W_VszDKujCaMW.jpeg" width="100%">
54
+ </div>
55
+
56
+ ### Key Features
57
+
58
+ - 🌟 **Holistic Multimodal Understanding**: Seamlessly processes medical text, 2D images, 3D volumes, and surgical videos
59
+ - 🔓 **Fully Transparent**: Complete open-source pipeline including data curation, training code, and model weights
60
+ - 📊 **State-of-the-Art Performance**: Outperforms leading open-source models and competes with proprietary systems
61
+ - ⚡ **Efficient Training**: Only 4,000-40,000 GPU hours required for 7B-32B variants
62
+ - 🗂️ **Comprehensive Coverage**: Trained on 16.7M samples spanning 12 anatomical systems and 14 imaging modalities
63
+ - 🤗 **Transformers Native**: Now with native HuggingFace Transformers support for easier integration
64
+
65
+ ### Comprehensive Data Coverage
66
+
67
+ Our training corpus encompasses:
68
+
69
+ - **12 Major Anatomical Systems**: Multi-System, Skin/Integumentary, Respiratory, Cellular/Tissue Level, Digestive, Nervous, Cardiovascular, Musculoskeletal, Reproductive, Urinary, Whole Body, Endocrine, Immune/Lymphatic, and Hematologic systems
70
+ - **14 Medical Imaging Modalities**: CT, MRI, X-Ray, Ultrasound, PET, OCT, Endoscopy, Microscopy, Histopathology, Fundus, Dermoscopy, Angiography, Digital Photograph, and Medical Chart
71
+ - **Diverse Downstream Tasks**: Medical Dialogue, Anomaly Detection, Prognosis Prediction, Treatment Planning, Surgical Skill Assessment, Education, Medical Report Generation, Surgical Phase Recognition, Medical Computation, and more
72
+
73
+ ## 🏆 Performance Highlights
74
+
75
+ ### Medical Multimodal Benchmarks
76
+
77
+ Performance comparison on medical multimodal benchmarks (For the 'Medical VLM < 10B' subgroup, **bold** indicates the best method):
78
+
79
+ | Models | OM.VQA | PMC-VQA | VQA-RAD | SLAKE | PathVQA | MedXQA | MMMU-Med |
80
+ |--------|--------|---------|---------|-------|---------|--------|----------|
81
+ | **Proprietary Models** |
82
+ | GPT-4.1 | 75.5 | 55.2 | 65.0 | 72.2 | 55.5 | 45.2 | 75.2 |
83
+ | GPT-4o | 67.5 | 49.7 | 61.0 | 71.2 | 55.5 | 44.3 | 62.8 |
84
+ | Claude Sonnet 4 | 65.5 | 54.4 | 67.6 | 70.6 | 54.2 | 43.3 | 74.6 |
85
+ | Gemini-2.5-Flash | 71.0 | 55.4 | 68.5 | 75.8 | 55.4 | 52.8 | 76.9 |
86
+ | **General VLMs < 10B** |
87
+ | Qwen2.5VL-7B | 63.6 | 51.9 | 63.2 | 66.8 | 44.1 | 20.1 | 50.6 |
88
+ | InternVL2.5-8B | 81.3 | 51.3 | 59.4 | 69.0 | 42.1 | 21.7 | 53.5 |
89
+ | InternVL3-8B | 79.1 | 53.8 | 65.4 | 72.8 | 48.6 | 22.4 | 59.2 |
90
+ | **General VLMs > 10B** |
91
+ | InternVL3-14B | 78.9 | 54.1 | 66.3 | 72.8 | 48.0 | 23.1 | 63.1 |
92
+ | Qwen2.5VL-32B | 68.2 | 54.5 | 71.8 | 71.2 | 41.9 | 25.2 | 59.6 |
93
+ | InternVL3-38B | 79.8 | 56.6 | 65.4 | 72.7 | 51.0 | 25.2 | 65.2 |
94
+ | **Medical VLMs < 10B** |
95
+ | LLaVA-Med-7B | 34.8 | 22.7 | 46.6 | 51.9 | 35.2 | 20.8 | 28.1 |
96
+ | MedGemma-4B | 70.7 | 49.2 | 72.3 | 78.2 | 48.1 | 25.4 | 43.2 |
97
+ | HuatuoGPT-V-7B | 74.3 | 53.1 | 67.6 | 68.1 | 44.8 | 23.2 | 49.8 |
98
+ | Lingshu-7B | 82.9 | 56.3 | 67.9 | 83.1 | 61.9 | 26.7 | - |
99
+ | **Hulu-Med-4B** | **81.6** | **64.6** | **71.6** | **85.0** | **60.1** | **26.4** | **50.5** |
100
+ | **Hulu-Med-7B** | **84.2** | **66.8** | **78.0** | **86.8** | **65.6** | **29.0** | **51.4** |
101
+ | **Medical VLMs > 10B** |
102
+ | HealthGPT-14B | 75.2 | 56.4 | 65.0 | 66.1 | 56.7 | 24.7 | 49.6 |
103
+ | HuatuoGPT-V-34B | 74.0 | 56.6 | 61.4 | 69.5 | 44.4 | 22.1 | 51.8 |
104
+ | Lingshu-32B | 83.4 | 57.9 | 76.7 | 86.7 | 65.5 | 30.9 | - |
105
+ | **Hulu-Med-14B** | **85.1** | **68.9** | **76.1** | **86.5** | **64.4** | **30.0** | **54.8** |
106
+ | **Hulu-Med-32B** | **84.6** | **69.4** | **81.4** | **85.7** | **67.3** | **34.0** | **60.4** |
107
+
108
+ ### Medical Text Benchmarks
109
+
110
+ Performance comparison on medical text benchmarks (**bold** indicates the best method in each subgroup):
111
+
112
+ | Models | MMLU-Pro | MedXQA | Medbullets | SGPQA | PubMedQA | MedMCQA | MedQA | MMLU-Med |
113
+ |--------|----------|--------|------------|-------|----------|---------|-------|----------|
114
+ | **Proprietary Models** |
115
+ | GPT-4.1 | 78.0 | 30.9 | 77.0 | 49.9 | 75.6 | 77.7 | 89.1 | 89.6 |
116
+ | o3-mini | 78.1 | 35.4 | 83.7 | 50.1 | 73.6 | 60.6 | 74.5 | 87.0 |
117
+ | Claude Sonnet 4 | 79.5 | 33.6 | 80.2 | 56.3 | 78.6 | 79.3 | 92.1 | 91.3 |
118
+ | Gemini-2.5-Flash | 70.0 | 35.6 | 77.6 | 53.3 | 73.8 | 73.6 | 91.2 | 84.2 |
119
+ | **General VLMs < 10B** |
120
+ | Qwen2.5VL-7B | 50.5 | 12.8 | 42.1 | 26.3 | 76.4 | 52.6 | 57.3 | 73.4 |
121
+ | InternVL2.5-8B | 50.6 | 11.6 | 42.4 | 26.1 | 76.4 | 52.4 | 53.7 | 74.2 |
122
+ | InternVL3-8B | 57.9 | 13.1 | 48.5 | 31.2 | 75.4 | 57.7 | 62.1 | 77.5 |
123
+ | **General VLMs > 10B** |
124
+ | Qwen2.5VL-32B | 66.5 | 15.6 | 54.2 | 37.6 | 68.4 | 63.0 | 71.6 | 83.2 |
125
+ | InternVL3-14B | 65.4 | 14.1 | 49.5 | 37.9 | 77.2 | 62.0 | 70.1 | 81.7 |
126
+ | InternVL3-38B | 72.1 | 16.0 | 54.6 | 42.5 | 73.2 | 64.9 | 73.5 | 83.8 |
127
+ | **Medical VLMs < 10B** |
128
+ | LLaVA-Med-7B | 16.6 | 9.9 | 34.4 | 16.1 | 26.4 | 39.4 | 42.0 | 50.6 |
129
+ | MedGemma-4B | 38.6 | 12.8 | 45.6 | 21.6 | 72.2 | 52.2 | 56.2 | 66.7 |
130
+ | HuatuoGPT-V-7B | 44.6 | 10.1 | 40.9 | 21.9 | 72.8 | 51.2 | 52.9 | 69.3 |
131
+ | Lingshu-7B | 50.4 | 16.5 | 56.2 | 26.3 | 76.6 | 55.9 | 63.3 | 74.5 |
132
+ | **Hulu-Med-4B** | **58.6** | **16.8** | **59.4** | **29.5** | **77.6** | **64.8** | **71.9** | **78.6** |
133
+ | **Hulu-Med-7B** | **60.6** | **19.6** | **61.5** | **31.1** | **77.4** | **67.6** | **73.5** | **79.5** |
134
+ | **Medical VLMs > 10B** |
135
+ | HealthGPT-14B | 63.4 | 11.3 | 39.8 | 25.7 | 68.0 | 63.4 | 66.2 | 80.2 |
136
+ | Lingshu-32B | 70.2 | 22.7 | 65.4 | 41.1 | 77.8 | 66.1 | 74.7 | 84.7 |
137
+ | HuatuoGPT-V-34B | 51.8 | 11.4 | 42.7 | 26.5 | 72.2 | 54.7 | 58.8 | 74.7 |
138
+ | **Hulu-Med-14B** | **68.0** | **23.2** | **68.5** | **37.7** | **79.8** | **70.4** | **78.1** | **83.3** |
139
+ | **Hulu-Med-32B** | **72.9** | **24.2** | **68.8** | **41.8** | **80.8** | **72.8** | **80.4** | **85.6** |
140
+
141
+ ## 🚀 Model Zoo
142
+
143
+ We provide three model variants with different parameter scales:
144
+
145
+ | Model | Parameters | LLM Base | Training Cost | HuggingFace | ModelScope |
146
+ |-------|-----------|----------|---------------|-------------|------------|
147
+ | **Hulu-Med-7B** | 7B | Qwen2.5-7B | ~4,000 GPU hours | [🤗 Link](https://huggingface.co/ZJU-AI4H/Hulu-Med-7B) | [🔮 Link](https://modelscope.cn/models/Med-Team/Hulu-Med-7B) |
148
+ | **Hulu-Med-14B** | 14B | Qwen3-14B | ~8,000 GPU hours | [🤗 Link](https://huggingface.co/ZJU-AI4H/Hulu-Med-14B) | [🔮 Link](https://modelscope.cn/models/Med-Team/Hulu-Med-14B) |
149
+ | **Hulu-Med-32B** | 32B | Qwen2.5-32B | ~40,000 GPU hours | [🤗 Link](https://huggingface.co/ZJU-AI4H/Hulu-Med-32B) | [🔮 Link](https://modelscope.cn/models/Med-Team/Hulu-Med-32B) |
150
+
151
+ **Note**: HuggingFace-compatible versions (Hulu-Med-HF) are also available for easier integration with the Transformers library.
152
+
153
+ ## 🛠️ Installation
154
+
155
+ ```bash
156
+ # Clone the repository
157
+ git clone https://github.com/ZJUI-AI4H/Hulu-Med.git
158
+ cd Hulu-Med
159
+
160
+ # Create conda environment
161
+ conda create -n hulumed python=3.10
162
+ conda activate hulumed
163
+
164
+ # PyTorch and torchvision for CUDA 11.8
165
+ pip install torch==2.4.0 torchvision==0.19.0 --extra-index-url https://download.pytorch.org/whl/cu118
166
+
167
+ # Flash-attn pinned to a compatible version
168
+ pip install flash-attn==2.7.3 --no-build-isolation --upgrade
169
+
170
+ # Transformers and accelerate
171
+ pip install transformers==4.51.2 accelerate==1.7.0
172
+
173
+ # Video processing dependencies
174
+ pip install decord ffmpeg-python imageio opencv-python
175
+
176
+ # For 3D medical image processing (NIfTI files)
177
+ pip install nibabel
178
+
179
+ # Install other dependencies
180
+ pip install -r requirements.txt
181
+ ```
182
+
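+ After installing, a quick sanity check helps confirm that the pinned versions and CUDA are visible. This optional snippet is not part of the official setup:
+
+ ```python
+ # Optional sanity check for the environment installed above.
+ import torch
+ import transformers
+
+ print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())
+ print("transformers:", transformers.__version__)  # expected 4.51.2 per the pinned install
+
+ try:
+     import flash_attn
+     print("flash-attn:", flash_attn.__version__)  # expected 2.7.3
+ except ImportError:
+     print("flash-attn missing; attn_implementation='flash_attention_2' will not be usable")
+ ```
+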
183
+ <a id="vllm-install"></a>
184
+ ### 🧩 vLLM Installation
185
+
186
+ ```bash
187
+ pip install git+https://github.com/jiangsongtao/vllm.git
188
+
189
+ # or try this way
190
+ git clone https://github.com/jiangsongtao/vllm.git
191
+ cd vllm
192
+ export VLLM_USE_PRECOMPILED=1
193
+ rm -rf build/ .deps/
194
+ pip install -e .
195
+ pip uninstall flash-attn -y
196
+ pip install flash-attn --no-build-isolation
197
+ ```
198
+
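+ Once the fork is installed, text-only inference can go through vLLM's standard offline API. The sketch below is a minimal, unofficial example that assumes the fork registers the Hulu-Med architecture when `trust_remote_code=True`; for image, video, or 3D inputs, follow the fork's own multimodal input format:
+
+ ```python
+ from vllm import LLM, SamplingParams
+
+ # Minimal text-only sketch with the patched vLLM fork above.
+ # tensor_parallel_size shards the model across GPUs; adjust to your hardware.
+ llm = LLM(
+     model="ZJU-AI4H/Hulu-Med-7B",
+     trust_remote_code=True,
+     tensor_parallel_size=2,
+     dtype="bfloat16",
+ )
+
+ sampling_params = SamplingParams(temperature=0.6, top_p=0.8, max_tokens=512)
+ outputs = llm.generate(["What are common causes of chest pain?"], sampling_params)
+ print(outputs[0].outputs[0].text)
+ ```
+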
199
+
200
+ ## 💻 Quick Start
201
+
202
+ We provide two ways to use Hulu-Med:
203
+
204
+ ### Option 1: Using HuggingFace Transformers (Recommended for Hulu-Med-HF models)
205
+
206
+ For easier integration, use the HuggingFace-compatible models with native Transformers support:
207
+
208
+ ```python
209
+ from transformers import AutoModelForCausalLM, AutoProcessor
210
+ import torch
211
+
212
+ model_path = "ZJU-AI4H/Hulu-Med-32B"
213
+
214
+ # Load model and processor
215
+ model = AutoModelForCausalLM.from_pretrained(
216
+ model_path,
217
+ trust_remote_code=True,
218
+ torch_dtype="bfloat16",
219
+ device_map="auto",
220
+ attn_implementation="flash_attention_2",
221
+ )
222
+
223
+ processor = AutoProcessor.from_pretrained(
224
+ model_path,
225
+ trust_remote_code=True
226
+ )
227
+
228
+ tokenizer = processor.tokenizer
229
+ ```
230
+
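+ In bfloat16 the 32B checkpoint needs roughly 64 GB of GPU memory, so `device_map="auto"` will shard it across the available GPUs. For a single smaller GPU, 4-bit loading through `bitsandbytes` is one possible fallback; the snippet below is an untested sketch, not an officially supported path:
+
+ ```python
+ from transformers import AutoModelForCausalLM, AutoProcessor, BitsAndBytesConfig
+ import torch
+
+ # Untested sketch: 4-bit quantized loading for memory-constrained GPUs.
+ bnb_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_compute_dtype=torch.bfloat16,
+ )
+
+ model = AutoModelForCausalLM.from_pretrained(
+     "ZJU-AI4H/Hulu-Med-7B",
+     trust_remote_code=True,
+     quantization_config=bnb_config,
+     device_map="auto",
+ )
+ processor = AutoProcessor.from_pretrained("ZJU-AI4H/Hulu-Med-7B", trust_remote_code=True)
+ tokenizer = processor.tokenizer
+ ```
+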
231
+ #### Text-Only Example
232
+
233
+ ```python
234
+ conversation = [
235
+ {
236
+ "role": "user",
237
+ "content": [
238
+ {"type": "text", "text": "Hello, I have a headache, what should I eat?"},
239
+ ]
240
+ }
241
+ ]
242
+
243
+ modal = 'text'
244
+ inputs = processor(
245
+ conversation=conversation,
246
+ return_tensors="pt",
247
+ add_generation_prompt=True
248
+ )
249
+
250
+ inputs = {k: v.to(model.device) if isinstance(v, torch.Tensor) else v
251
+ for k, v in inputs.items()}
252
+
253
+ with torch.inference_mode():
254
+ output_ids = model.generate(
255
+ **inputs,
256
+ do_sample=True,
257
+ modals=[modal],
258
+ temperature=0.6,
259
+ max_new_tokens=4096,
260
+ use_cache=True,
261
+ pad_token_id=tokenizer.eos_token_id,
262
+ )
263
+
264
+ # Decode output
265
+ # Enable thinking mode by adding: "Please reason step by step, and put your final answer within \boxed{}."
266
+ # use_think=False: Only return the final answer without thinking process
267
+ # use_think=True: Include the model's reasoning/thinking process in the output
268
+ outputs = processor.batch_decode(
269
+ output_ids,
270
+ skip_special_tokens=True,
271
+ use_think=False # Set to True to see the thinking process
272
+ )[0].strip()
273
+ print(outputs)
274
+ ```
275
+
276
+ #### 2D Image Example
277
+
278
+ ```python
279
+ conversation = [
280
+ {
281
+ "role": "user",
282
+ "content": [
283
+ {
284
+ "type": "image",
285
+ "image": {
286
+ "image_path": "./demo/demo.jpg",
287
+ }
288
+ },
289
+ {
290
+ "type": "text",
291
+ "text": "Generate a medical report for this image."
292
+ },
293
+ ]
294
+ }
295
+ ]
296
+
297
+ inputs = processor(
298
+ conversation=conversation,
299
+ add_system_prompt=True,
300
+ add_generation_prompt=True,
301
+ return_tensors="pt"
302
+ )
303
+
304
+ inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v
305
+ for k, v in inputs.items()}
306
+
307
+ if "pixel_values" in inputs:
308
+ inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
309
+
310
+ output_ids = model.generate(**inputs, max_new_tokens=1024)
311
+ outputs = processor.batch_decode(
312
+ output_ids,
313
+ skip_special_tokens=True,
314
+ use_think=False
315
+ )[0].strip()
316
+ print(outputs)
317
+ ```
318
+ #### Multi-Image Example
319
+ ```python
320
+ conversation = [
321
+ {
322
+ "role": "user",
323
+ "content": [
324
+ {
325
+ "type": "image",
326
+ "image": {
327
+ "image_path": "./demo/demo1.jpg",
328
+ }
329
+ },
330
+ {
331
+ "type": "image",
332
+ "image": {
333
+ "image_path": "./demo/demo2.jpg",
334
+ }
335
+ },
336
+ {
337
+ "type": "text",
338
+ "text": "Are these two images the same?"
339
+ },
340
+ ]
341
+ }
342
+ ]
343
+
344
+ inputs = processor(
345
+ conversation=conversation,
346
+ add_system_prompt=True,
347
+ add_generation_prompt=True,
348
+ return_tensors="pt"
349
+ )
350
+
351
+ inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v
352
+ for k, v in inputs.items()}
353
+ if "pixel_values" in inputs:
354
+ inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
355
+
356
+ output_ids = model.generate(**inputs, max_new_tokens=1024)
357
+ outputs_no_think = processor.batch_decode(
358
+ output_ids,
359
+ skip_special_tokens=True,
360
+ use_think=False
361
+ )[0].strip()
362
+ print(outputs_no_think)
363
+ ```
364
+ #### Interleaved Example
365
+ ```python
366
+ conversation = [
367
+ {
368
+ "role": "user",
369
+ "content": [
370
+ {
371
+ "type": "text",
372
+ "text": "Image A:"
373
+ },
374
+ {
375
+ "type": "image",
376
+ "image": {
377
+ "image_path": "./demo/XRay.jpg",
378
+ }
379
+ },
380
+ {
381
+ "type": "text",
382
+ "text": "Image B:"
383
+ },
384
+ {
385
+ "type": "image",
386
+ "image": {
387
+ "image_path": "./demo/pathology.png",
388
+ }
389
+ },
390
+ {
391
+ "type": "text",
392
+ "text": "Which image is the pathology slide?"
393
+ },
394
+ ]
395
+ }
396
+ ]
397
+
398
+ inputs = processor(
399
+ conversation=conversation,
400
+ add_system_prompt=True,
401
+ add_generation_prompt=True,
402
+ return_tensors="pt"
403
+ )
404
+
405
+ inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v
406
+ for k, v in inputs.items()}
407
+ if "pixel_values" in inputs:
408
+ inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
409
+
410
+ output_ids = model.generate(**inputs, max_new_tokens=1024)
411
+ outputs_no_think = processor.batch_decode(
412
+ output_ids,
413
+ skip_special_tokens=True,
414
+ use_think=False
415
+ )[0].strip()
416
+ print(outputs_no_think)
417
+ # The pathology slide is Image B. It shows a microscopic view of tissue with various cellular structures and components, such as cells in different stages of maturation and areas of fibrous tissue. This type of image is typically used to examine the cellular architecture and identify any pathological changes within the tissue.
418
+ ```
419
+
420
+ #### 3D Medical Image Example
421
+
422
+ ```python
423
+ # Requires: pip install nibabel
424
+
425
+ conversation = [
426
+ {
427
+ "role": "user",
428
+ "content": [
429
+ {
430
+ "type": "3d",
431
+ "3d": {
432
+ "image_path": "./demo/amos_0013.nii",
433
+ "nii_num_slices": 180,
434
+ "nii_axis": 2, # 0=sagittal, 1=coronal, 2=axial
435
+ }
436
+ },
437
+ {
438
+ "type": "text",
439
+ "text": "Generate a medical report for this 3D CT scan."
440
+ },
441
+ ]
442
+ }
443
+ ]
444
+
445
+ inputs = processor(
446
+ conversation=conversation,
447
+ add_system_prompt=True,
448
+ add_generation_prompt=True,
449
+ return_tensors="pt"
450
+ )
451
+
452
+ inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v
453
+ for k, v in inputs.items()}
454
+
455
+ if "pixel_values" in inputs:
456
+ inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
457
+
458
+ output_ids = model.generate(**inputs, max_new_tokens=1024)
459
+ outputs = processor.batch_decode(
460
+ output_ids,
461
+ skip_special_tokens=True,
462
+ use_think=False
463
+ )[0].strip()
464
+ print(outputs)
465
+ ```
466
+
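+ Choosing `nii_num_slices` and `nii_axis` is easier after inspecting the volume. Below is a small check with `nibabel` (already required above); axis conventions vary between datasets, so treat the comment as a rule of thumb:
+
+ ```python
+ import nibabel as nib
+
+ # Inspect the NIfTI volume before choosing nii_axis / nii_num_slices.
+ img = nib.load("./demo/amos_0013.nii")
+ print("volume shape:", img.shape)               # e.g. (X, Y, Z); axis 2 is often axial
+ print("voxel spacing:", img.header.get_zooms())
+
+ axis = 2
+ print("slices along axis", axis, ":", img.shape[axis])  # upper bound for nii_num_slices
+ ```
+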
467
+ #### Video Example
468
+
469
+ ```python
470
+ conversation = [
471
+ {
472
+ "role": "user",
473
+ "content": [
474
+ {
475
+ "type": "video",
476
+ "video": {
477
+ "video_path": "./demo/1min_demo.mp4",
478
+ "fps": 1,
479
+ "max_frames": 1800
480
+ }
481
+ },
482
+ {
483
+ "type": "text",
484
+ "text": "Describe this video in detail."
485
+ },
486
+ ]
487
+ }
488
+ ]
489
+
490
+ inputs = processor(
491
+ conversation=conversation,
492
+ add_system_prompt=True,
493
+ add_generation_prompt=True,
494
+ return_tensors="pt"
495
+ )
496
+
497
+ inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v
498
+ for k, v in inputs.items()}
499
+
500
+ if "pixel_values" in inputs:
501
+ inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
502
+
503
+ output_ids = model.generate(**inputs, max_new_tokens=1024)
504
+ outputs = processor.batch_decode(
505
+ output_ids,
506
+ skip_special_tokens=True,
507
+ use_think=False
508
+ )[0].strip()
509
+ print(outputs)
510
+ ```
511
+
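+ With `fps: 1` the processor samples one frame per second, capped by `max_frames`, so a 1-minute clip yields roughly 60 frames. To check a video's duration and native frame rate before picking these values, `decord` (installed above) works well:
+
+ ```python
+ from decord import VideoReader
+
+ # Inspect the clip to choose fps / max_frames for the processor.
+ vr = VideoReader("./demo/1min_demo.mp4")
+ native_fps = vr.get_avg_fps()
+ duration_s = len(vr) / native_fps
+ print(f"{len(vr)} frames at {native_fps:.1f} fps, about {duration_s:.1f} s")
+ print("frames sampled at fps=1:", int(duration_s))  # bounded by max_frames
+ ```
+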
512
+ **Understanding the `use_think` parameter:**
513
+ - `use_think=False`: Returns only the final answer (default for most use cases)
514
+ - `use_think=True`: Includes the model's internal reasoning/thinking process before the final answer (see the short decoding sketch below)
515
+
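+ For example, the same generation can be decoded both ways. The reasoning suffix quoted in the text-only example ("Please reason step by step, and put your final answer within \boxed{}.") encourages an explicit reasoning trace:
+
+ ```python
+ # Continuing from any example above (processor and output_ids already defined).
+ answer_only = processor.batch_decode(
+     output_ids, skip_special_tokens=True, use_think=False
+ )[0].strip()
+
+ with_reasoning = processor.batch_decode(
+     output_ids, skip_special_tokens=True, use_think=True
+ )[0].strip()
+
+ print(answer_only)      # final answer only
+ print(with_reasoning)   # reasoning trace followed by the final answer
+ ```
+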
516
+
517
+ ## 📊 Training
518
+
519
+ ### Data Preparation
520
+
521
+ Our training data consists of 16.7M samples across four categories:
522
+
523
+ - **Medical Multimodal Data** (9M samples): Covering 14 imaging modalities
524
+ - **Medical Text Data** (4.9M samples): Clinical notes, literature, QA pairs
525
+ - **General Multimodal Data** (1.3M samples): Enhancing generalization
526
+ - **General Text Data** (1.5M samples): Improving reasoning capabilities
527
+
528
+ Download and prepare the data:
529
+ Coming soon
530
+
531
+ ## 🏗️ Model Architecture
532
+
533
+ Hulu-Med consists of four core components:
534
+
535
+ 1. **Vision Encoder**: SigLIP-based encoder with 2D RoPE for unified 2D/3D/video processing
536
+ 2. **Multimodal Projector**: Projects visual tokens into language model space
537
+ 3. **LLM Decoder**: Qwen-based decoder for generating responses
538
+ 4. **Medical-Aware Token Reduction**: Efficient processing with ~55% token reduction (a toy illustration of the general idea follows below)
539
+
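+ The actual reduction algorithm is described in the paper. Purely to illustrate the general idea (this is **not** the Hulu-Med implementation), redundant visual tokens can be dropped when consecutive tokens are nearly identical:
+
+ ```python
+ import torch
+ import torch.nn.functional as F
+
+ def drop_redundant_tokens(tokens: torch.Tensor, threshold: float = 0.95) -> torch.Tensor:
+     """Toy illustration only: keep a token if it differs enough from the last kept one.
+
+     tokens: (num_tokens, hidden_dim) visual tokens, e.g. from consecutive video frames.
+     """
+     kept = [tokens[0]]
+     for tok in tokens[1:]:
+         if F.cosine_similarity(tok, kept[-1], dim=0) < threshold:  # different enough -> keep
+             kept.append(tok)
+     return torch.stack(kept)
+
+ # 100 highly similar tokens collapse to a handful.
+ x = torch.randn(1, 64) + 0.01 * torch.randn(100, 64)
+ print(drop_redundant_tokens(x).shape)
+ ```
+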
540
+ ## 📋 Supported Tasks
541
+
542
+ - ✅ Visual Question Answering (2D/3D/Video)
543
+ - ✅ Medical Report Generation
544
+ - ✅ Disease Diagnosis
545
+ - ✅ Anatomical Understanding
546
+ - ✅ Surgical Phase Recognition
547
+ - ✅ Clinical Dialogue
548
+ - ✅ Medical Text Reasoning
549
+ - ✅ Multilingual Medical QA
550
+ - ✅ Rare Disease Diagnosis
551
+ - ✅ And more
552
+
553
+ ## 📄 Citation
554
+
555
+ If you find Hulu-Med useful in your research, please cite:
556
+ ```bibtex
557
+ @misc{jiang2025hulumedtransparentgeneralistmodel,
558
+ title={Hulu-Med: A Transparent Generalist Model towards Holistic Medical Vision-Language Understanding},
559
+ author={Songtao Jiang and Yuan Wang and Sibo Song and Tianxiang Hu and Chenyi Zhou and Bin Pu and Yan Zhang and Zhibo Yang and Yang Feng and Joey Tianyi Zhou and Jin Hao and Zijian Chen and Ruijia Wu and Tao Tang and Junhui Lv and Hongxia Xu and Hongwei Wang and Jun Xiao and Bin Feng and Fudong Zhu and Kenli Li and Weidi Xie and Jimeng Sun and Jian Wu and Zuozhu Liu},
560
+ year={2025},
561
+ eprint={2510.08668},
562
+ archivePrefix={arXiv},
563
+ primaryClass={cs.CV},
564
+ url={https://arxiv.org/abs/2510.08668},
565
+ }
566
+ ```
567
+
568
+ ## 📜 License
569
+
570
+ This project is released under the [Apache 2.0 License](LICENSE).
added_tokens.json ADDED
@@ -0,0 +1,27 @@
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<image>": 151665,
4
+ "<tool_call>": 151657,
5
+ "<|box_end|>": 151649,
6
+ "<|box_start|>": 151648,
7
+ "<|endoftext|>": 151643,
8
+ "<|file_sep|>": 151664,
9
+ "<|fim_middle|>": 151660,
10
+ "<|fim_pad|>": 151662,
11
+ "<|fim_prefix|>": 151659,
12
+ "<|fim_suffix|>": 151661,
13
+ "<|im_end|>": 151645,
14
+ "<|im_start|>": 151644,
15
+ "<|image_pad|>": 151655,
16
+ "<|object_ref_end|>": 151647,
17
+ "<|object_ref_start|>": 151646,
18
+ "<|quad_end|>": 151651,
19
+ "<|quad_start|>": 151650,
20
+ "<|repo_name|>": 151663,
21
+ "<|stream_end|>": 151667,
22
+ "<|stream_start|>": 151666,
23
+ "<|video_pad|>": 151656,
24
+ "<|vision_end|>": 151653,
25
+ "<|vision_pad|>": 151654,
26
+ "<|vision_start|>": 151652
27
+ }
chat_template.json ADDED
@@ -0,0 +1,3 @@
1
+ {
2
+ "chat_template": "\n{%- set identifier = 'im' %}\n{% for message in messages %}\n {% if add_system_prompt and loop.first and message['role'] != 'system' %}\n {{- '<|im_start|>system\nYou are Hulu-Med, a helpful health assistant that can understand text, 2D images, videos, and 3D images.<|im_end|>\n' -}}\n {% endif %}\n {% if message['role'] == 'stream' %}\n {% set identifier = 'stream' %}\n {% else %}\n {% set identifier = 'im' %}\n {% endif %}\n {{- '<|' + identifier + '_start|>' + message['role'] + '\n' -}}\n {% if message['content'] is string %}\n {{- message['content'] + '<|' + identifier + '_end|>\n' -}}\n {% else %}\n {% for content in message['content'] %}\n {% if content is string %}\n {{- content -}}\n {% elif content['type'] == 'text' or 'text' in content %}\n {{- content['text'] -}}\n {% elif content['type'] == 'image' or 'image' in content %}\n {% if 'timestamp' in content %}\n {{- 'Time ' + content['timestamp'] | round(1) | string + 's: ' -}}\n {% endif %}\n {{- image_token + '\n' -}}\n {% elif content['type'] == 'video' or 'video' in content %}\n {% for i in range(content['num_frames']) %}\n {% if 'timestamps' in content %}\n {{- 'Time ' + content['timestamps'][i] | round(1) | string + 's:' -}}\n {% endif %}\n {% if i < content['num_frames'] - 1 %}\n {{- image_token + ',' -}}\n {% else %}\n {{- image_token + '\n' -}}\n {% endif %}\n {% endfor %}\n {% endif %}\n {% endfor %}\n {% if identifier == 'stream' %}\n {{- '<|' + identifier + '_end|>' -}}\n {% else %}\n {{- '<|' + identifier + '_end|>\n' -}}\n {% endif %}\n {% endif %}\n{% endfor %}\n{% if add_generation_prompt %}\n {{- '<|im_start|>assistant\n' -}}\n{% endif %}\n"
3
+ }
config.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "architectures": [
3
+ "HulumedQwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_hulumed_qwen2.HulumedQwen2Config",
8
+ "AutoModelForCausalLM": "modeling_hulumed_qwen2.HulumedQwen2ForCausalLM"
9
+ },
10
+ "bos_token_id": 151643,
11
+ "eos_token_id": 151645,
12
+ "hidden_act": "silu",
13
+ "hidden_size": 3584,
14
+ "image_token_index": 151665,
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": 18944,
17
+ "max_position_embeddings": 32768,
18
+ "max_window_layers": 28,
19
+ "mm_projector_type": "mlp2x_gelu",
20
+ "model_type": "hulumed_qwen2",
21
+ "num_attention_heads": 28,
22
+ "num_hidden_layers": 28,
23
+ "num_key_value_heads": 4,
24
+ "rms_norm_eps": 1e-06,
25
+ "rope_scaling": null,
26
+ "rope_theta": 1000000.0,
27
+ "sliding_window": null,
28
+ "tie_word_embeddings": false,
29
+ "torch_dtype": "bfloat16",
30
+ "transformers_version": "4.51.2",
31
+ "use_cache": true,
32
+ "use_sliding_window": false,
33
+ "use_token_compression": false,
34
+ "vision_encoder_config": {
35
+ "hidden_size": 1152,
36
+ "intermediate_size": 4304,
37
+ "model_type": "hulumed_vision_encoder",
38
+ "num_attention_heads": 16,
39
+ "num_hidden_layers": 27,
40
+ "patch_size": 14
41
+ },
42
+ "vocab_size": 152064
43
+ }
44
+
configuration.json ADDED
@@ -0,0 +1 @@
1
+ {"framework": "pytorch", "task": "text-generation", "allow_remote": true}
configuration_hulumed_encoder.py ADDED
@@ -0,0 +1,49 @@
1
+ # Adopted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/siglip/configuration_siglip.py.
2
+ # Below is the original copyright:
3
+ # coding=utf-8
4
+ # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+ """HuluMed vision encoder model configuration."""
18
+
19
+ from transformers import PretrainedConfig
20
+
21
+
22
+ class HulumedVisionEncoderConfig(PretrainedConfig):
23
+
24
+ model_type = "hulumed_vision_encoder"
25
+
26
+ def __init__(
27
+ self,
28
+ hidden_size=768,
29
+ intermediate_size=3072,
30
+ num_hidden_layers=12,
31
+ num_attention_heads=12,
32
+ num_channels=3,
33
+ patch_size=16,
34
+ hidden_act="gelu_pytorch_tanh",
35
+ layer_norm_eps=1e-6,
36
+ attention_dropout=0.0,
37
+ **kwargs,
38
+ ):
39
+ super().__init__(**kwargs)
40
+
41
+ self.hidden_size = hidden_size
42
+ self.intermediate_size = intermediate_size
43
+ self.num_hidden_layers = num_hidden_layers
44
+ self.num_attention_heads = num_attention_heads
45
+ self.num_channels = num_channels
46
+ self.patch_size = patch_size
47
+ self.attention_dropout = attention_dropout
48
+ self.layer_norm_eps = layer_norm_eps
49
+ self.hidden_act = hidden_act
configuration_hulumed_qwen2.py ADDED
@@ -0,0 +1,83 @@
1
+ """HuluMed model configuration."""
2
+
3
+ import importlib.util
4
+ import os.path as osp
5
+ from typing import Optional, Dict, Any
6
+
7
+ from transformers import AutoConfig, AutoModel, PretrainedConfig, Qwen2Config
8
+
9
+ try:
10
+ from .configuration_hulumed_encoder import HulumedVisionEncoderConfig
11
+ except ModuleNotFoundError:
12
+ spec = importlib.util.spec_from_file_location(
13
+ "configuration_hulumed_encoder",
14
+ osp.join(osp.dirname(__file__), "configuration_hulumed_encoder.py"),
15
+ )
16
+ configuration_hulumed_encoder = importlib.util.module_from_spec(spec)
17
+ spec.loader.exec_module(configuration_hulumed_encoder)
18
+ HulumedVisionEncoderConfig = getattr(
19
+ configuration_hulumed_encoder,
20
+ "HulumedVisionEncoderConfig",
21
+ )
22
+
23
+ try:
24
+ from .modeling_hulumed_encoder import HulumedVisionEncoderModel
25
+ except ModuleNotFoundError:
26
+ spec = importlib.util.spec_from_file_location(
27
+ "modeling_hulumed_encoder",
28
+ osp.join(osp.dirname(__file__), "modeling_hulumed_encoder.py"),
29
+ )
30
+ modeling_hulumed_encoder = importlib.util.module_from_spec(spec)
31
+ spec.loader.exec_module(modeling_hulumed_encoder)
32
+ HulumedVisionEncoderModel = getattr(
33
+ modeling_hulumed_encoder,
34
+ "HulumedVisionEncoderModel",
35
+ )
36
+
37
+ AutoConfig.register("hulumed_vision_encoder", HulumedVisionEncoderConfig)
38
+ AutoModel.register(HulumedVisionEncoderConfig, HulumedVisionEncoderModel)
39
+
40
+
41
+ class HulumedQwen2Config(Qwen2Config):
42
+ """
43
+ HuluMed model configuration.
44
+
45
+ This configuration class extends Qwen2Config to store the configuration of a HuluMed model.
46
+ It includes configuration for the vision encoder and multimodal projector.
47
+ """
48
+
49
+ model_type = "hulumed_qwen2"
50
+ sub_configs = {"vision_encoder_config": HulumedVisionEncoderConfig}
51
+
52
+ def __init__(
53
+ self,
54
+ vision_encoder: Optional[str] = None,
55
+ vision_encoder_config: Dict[str, Any] = {},
56
+ mm_projector_type: str = "mlp2x_gelu",
57
+ use_token_compression: bool = True,
58
+ image_token_index: int = -1,
59
+ **kwargs,
60
+ ):
61
+ """
62
+ Initialize HuluMed configuration.
63
+
64
+ Args:
65
+ vision_encoder (str, optional): Path or identifier of the vision encoder.
66
+ vision_encoder_config (dict, optional): Configuration for the vision encoder.
67
+ mm_projector_type (str): Type of multimodal projector. Default is "mlp2x_gelu".
68
+ use_token_compression (bool): Whether to use token compression for videos. Default is True.
69
+ image_token_index (int): Token index for image placeholders. Default is -1.
70
+ **kwargs: Additional arguments passed to Qwen2Config.
71
+ """
72
+ super().__init__(**kwargs)
73
+ self.model_type = "hulumed_qwen2"
74
+
75
+ self.vision_encoder = vision_encoder
76
+
77
+ if vision_encoder_config is not None and not isinstance(vision_encoder_config, PretrainedConfig):
78
+ vision_encoder_config = HulumedVisionEncoderConfig(**vision_encoder_config)
79
+
80
+ self.vision_encoder_config = vision_encoder_config
81
+ self.mm_projector_type = mm_projector_type
82
+ self.use_token_compression = use_token_compression
83
+ self.image_token_index = image_token_index
generation_config.json ADDED
@@ -0,0 +1,14 @@
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.05,
10
+ "temperature": 0.7,
11
+ "top_k": 20,
12
+ "top_p": 0.8,
13
+ "transformers_version": "4.51.2"
14
+ }
image_processing_hulumed.py ADDED
@@ -0,0 +1,485 @@
1
+ # Adopted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_vl/image_processing_qwen2_vl.py.
2
+ # Below is the original copyright:
3
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
6
+ # and OPT implementations in this library. It has been modified from its
7
+ # original forms to accommodate minor architectural differences compared
8
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
9
+ #
10
+ # Licensed under the Apache License, Version 2.0 (the "License");
11
+ # you may not use this file except in compliance with the License.
12
+ # You may obtain a copy of the License at
13
+ #
14
+ # http://www.apache.org/licenses/LICENSE-2.0
15
+ #
16
+ # Unless required by applicable law or agreed to in writing, software
17
+ # distributed under the License is distributed on an "AS IS" BASIS,
18
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19
+ # See the License for the specific language governing permissions and
20
+ # limitations under the License.
21
+ """Image processor class for HuluMed."""
22
+
23
+ import math
24
+ from typing import Dict, List, Optional, Union
25
+
26
+ import numpy as np
27
+
28
+ import torch
29
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
30
+ from transformers.image_utils import ImageInput
31
+ from transformers.image_transforms import (
32
+ convert_to_rgb,
33
+ resize,
34
+ to_channel_dimension_format,
35
+ )
36
+ from transformers.image_utils import (
37
+ OPENAI_CLIP_MEAN,
38
+ OPENAI_CLIP_STD,
39
+ ChannelDimension,
40
+ ImageInput,
41
+ PILImageResampling,
42
+ get_image_size,
43
+ infer_channel_dimension_format,
44
+ is_scaled_image,
45
+ is_valid_image,
46
+ make_list_of_images,
47
+ to_numpy_array,
48
+ )
49
+ try:
50
+ from transformers.video_utils import VideoInput
51
+ except ImportError:
52
+ from transformers.image_utils import VideoInput
53
+
54
+ from transformers.utils import TensorType, is_vision_available, logging
55
+
56
+
57
+ logger = logging.get_logger(__name__)
58
+
59
+
60
+ if is_vision_available():
61
+ from PIL import Image
62
+
63
+
64
+ def is_valid_video(video) -> bool:
65
+ if isinstance(video, (list, tuple)):
66
+ return all(is_valid_image(frame) for frame in video)
67
+ elif isinstance(video, np.ndarray):
68
+ return video.ndim == 4
69
+ elif isinstance(video, torch.Tensor):
70
+ return video.ndim == 4
71
+ return False
72
+
73
+
74
+ def make_batched_images(images) -> List[List[ImageInput]]:
75
+ """
76
+ Accepts images in list or nested list format, and makes a list of images for preprocessing.
77
+
78
+ Args:
79
+ images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
80
+ The input image.
81
+
82
+ Returns:
83
+ list: A list of images.
84
+ """
85
+ if isinstance(images, (list, tuple)):
86
+ # list of images/videos
87
+ if not all(is_valid_video(image) or is_valid_image(image) for image in images):
88
+ raise ValueError(f"Could not make batched images from {images}")
89
+ return images
90
+ elif is_valid_video(images) or is_valid_image(images):
91
+ # single image/video
92
+ return [images]
93
+
94
+ raise ValueError(f"Could not make batched images from {images}")
95
+
96
+
97
+ def simple_batched_resize(
98
+ images, factor: int = 28, min_tokens: int = 4 * 4, max_tokens: int = 16384, input_data_format: str = None
99
+ ):
100
+ min_pixels = min_tokens * factor * factor
101
+ max_pixels = max_tokens * factor * factor
102
+
103
+ num_images = 0
104
+ for image in images:
105
+ if is_valid_video(image):
106
+ num_images += len(image)
107
+ else:
108
+ num_images += 1
109
+
110
+ image_sizes = []
111
+ for image in images:
112
+ if is_valid_video(image):
113
+ image = image[0]
114
+ if isinstance(image, Image.Image):
115
+ height, width = image.size
116
+ else:
117
+ height, width = get_image_size(image, channel_dim=input_data_format)
118
+ image_sizes.append([height, width])
119
+
120
+ tmp_image_sizes = []
121
+ for height, width in image_sizes:
122
+ h_bar = round(height / factor) * factor
123
+ w_bar = round(width / factor) * factor
124
+ if h_bar * w_bar > (max_pixels // num_images):
125
+ beta = math.sqrt((height * width) / (max_pixels // num_images))
126
+ h_bar = math.floor(height / beta / factor) * factor
127
+ w_bar = math.floor(width / beta / factor) * factor
128
+ # per image min_pixels
129
+ if h_bar * w_bar < min_pixels:
130
+ beta = math.sqrt(min_pixels / (height * width))
131
+ h_bar = math.ceil(height * beta / factor) * factor
132
+ w_bar = math.ceil(width * beta / factor) * factor
133
+ tmp_image_sizes.append((h_bar, w_bar))
134
+ image_sizes = tmp_image_sizes
135
+ return image_sizes
136
+
137
+
138
+ def batched_resize(
139
+ images, factors: List[int], min_tokens: int = 4 * 4, max_tokens: int = 16384, input_data_format: str = None
140
+ ):
141
+ image_sizes = []
142
+ for image in images:
143
+ if is_valid_video(image):
144
+ num_frame = len(image)
145
+ image = image[0]
146
+ else:
147
+ num_frame = 1
148
+ if isinstance(image, Image.Image):
149
+ height, width = image.size
150
+ else:
151
+ height, width = get_image_size(image, channel_dim=input_data_format)
152
+ image_sizes.append([num_frame, height, width])
153
+
154
+ # global max_pixels
155
+ smart_scale_factors = 1.0
156
+ total_tokens = 0
157
+ for (num_frame, height, width), factor in zip(image_sizes, factors):
158
+ total_tokens += num_frame * math.ceil(height / factor) * math.ceil(width / factor)
159
+
160
+ # TODO: add min_pixels
161
+ if total_tokens > max_tokens:
162
+ beta = math.sqrt(total_tokens / max_tokens)
163
+ tmp_image_sizes = []
164
+ for (_, height, width), factor in zip(image_sizes, factors):
165
+ h_bar = math.floor(height / beta / factor) * factor
166
+ w_bar = math.floor(width / beta / factor) * factor
167
+ tmp_image_sizes.append((h_bar, w_bar))
168
+ image_sizes = tmp_image_sizes
169
+ else:
170
+ tmp_image_sizes = []
171
+ for (_, height, width), factor in zip(image_sizes, factors):
172
+ height = round(height / factor) * factor
173
+ width = round(width / factor) * factor
174
+ tmp_image_sizes.append((height, width))
175
+ image_sizes = tmp_image_sizes
176
+
177
+ return image_sizes
178
+
179
+
180
+ class HulumedImageProcessor(BaseImageProcessor):
181
+ r"""
182
+ Constructs a HuluMed image processor that dynamically resizes images based on the original images.
183
+
184
+ Args:
185
+ do_resize (`bool`, *optional*, defaults to `True`):
186
+ Whether to resize the image's (height, width) dimensions.
187
+ resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
188
+ Resampling filter to use when resizing the image.
189
+ do_rescale (`bool`, *optional*, defaults to `True`):
190
+ Whether to rescale the image by the specified scale `rescale_factor`.
191
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
192
+ Scale factor to use if rescaling the image.
193
+ do_normalize (`bool`, *optional*, defaults to `True`):
194
+ Whether to normalize the image.
195
+ image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
196
+ Mean to use if normalizing the image. This is a float or list of floats for each channel in the image.
197
+ image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
198
+ Standard deviation to use if normalizing the image. This is a float or list of floats for each channel in the image.
199
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
200
+ Whether to convert the image to RGB.
201
+ min_pixels (`int`, *optional*, defaults to `56 * 56`):
202
+ The min pixels of the image to resize the image.
203
+ max_pixels (`int`, *optional*, defaults to `28 * 28 * 1280`):
204
+ The max pixels of the image to resize the image.
205
+ patch_size (`int`, *optional*, defaults to 14):
206
+ The spatial patch size of the vision encoder.
207
+ merge_size (`int`, *optional*, defaults to `None`):
208
+ The default merge size for processing. If None, no default merge size is applied.
209
+ """
210
+
211
+ model_input_names = ["pixel_values", "grid_sizes", "merge_sizes"]
212
+
213
+ def __init__(
214
+ self,
215
+ do_resize: bool = True,
216
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
217
+ do_rescale: bool = True,
218
+ rescale_factor: Union[int, float] = 1 / 255,
219
+ do_normalize: bool = True,
220
+ image_mean: Optional[Union[float, List[float]]] = None,
221
+ image_std: Optional[Union[float, List[float]]] = None,
222
+ do_convert_rgb: bool = True,
223
+ min_tokens: int = 4 * 4,
224
+ max_tokens: int = 16384,
225
+ patch_size: int = 14,
226
+ merge_size: Optional[int] = None,
227
+ **kwargs,
228
+ ) -> None:
229
+ super().__init__(**kwargs)
230
+ self.do_resize = do_resize
231
+ self.resample = resample
232
+ self.do_rescale = do_rescale
233
+ self.rescale_factor = rescale_factor
234
+ self.do_normalize = do_normalize
235
+ self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
236
+ self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
237
+ self.min_tokens = min_tokens
238
+ self.max_tokens = max_tokens
239
+ self.patch_size = patch_size
240
+ self.do_convert_rgb = do_convert_rgb
241
+ self.merge_size = merge_size
242
+
243
+ def _preprocess(
244
+ self,
245
+ images: Union[ImageInput, VideoInput],
246
+ target_size: List[int],
247
+ merge_size: int = 1,
248
+ do_resize: bool = None,
249
+ resample: PILImageResampling = None,
250
+ do_rescale: bool = None,
251
+ rescale_factor: float = None,
252
+ do_normalize: bool = None,
253
+ image_mean: Optional[Union[float, List[float]]] = None,
254
+ image_std: Optional[Union[float, List[float]]] = None,
255
+ do_convert_rgb: bool = None,
256
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
257
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
258
+ ):
259
+ """
260
+ Preprocess an image or batch of images. Copy of the `preprocess` method from `CLIPImageProcessor`.
261
+
262
+ Args:
263
+ images (`ImageInput`):
264
+ Image or batch of images to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.
265
+ target_size (`List[int]`):
266
+ The target size to resize the image to. Should be a list of two integers: [target_height, target_width].
267
+ merge_size (`int`, *optional*, defaults to `1`):
268
+ The merge size after the vision encoder.
269
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
270
+ Whether to resize the image.
271
+ resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
272
+ Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enums.
273
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
274
+ Whether to rescale the image.
275
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
276
+ Scale factor to use if rescaling the image.
277
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
278
+ Whether to normalize the image.
279
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
280
+ Mean to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
281
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
282
+ Standard deviation to use if normalizing the image. Can be a float or a list of floats corresponding to the number of channels in the image.
283
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
284
+ Whether to convert the image to RGB.
285
+ data_format (`ChannelDimension`, *optional*, defaults to `ChannelDimension.FIRST`):
286
+ The channel dimension format for the output image. Can be one of:
287
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
288
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
289
+ - Unset: Use the channel dimension format of the input image.
290
+ input_data_format (`ChannelDimension` or `str`, *optional*):
291
+ The channel dimension format for the input image. Can be one of:
292
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
293
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
294
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
295
+ """
296
+ images = make_list_of_images(images)
297
+
298
+ if do_convert_rgb:
299
+ images = [convert_to_rgb(image) for image in images]
300
+
301
+ # All transformations expect numpy arrays.
302
+ images = [to_numpy_array(image) for image in images]
303
+
304
+ if is_scaled_image(images[0]) and do_rescale:
305
+ logger.warning_once(
306
+ "It looks like you are trying to rescale already rescaled images. If the input"
307
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
308
+ )
309
+ if input_data_format is None:
310
+ # We assume that all images have the same channel dimension format.
311
+ input_data_format = infer_channel_dimension_format(images[0])
312
+
313
+ height, width = get_image_size(images[0], channel_dim=input_data_format)
314
+ resized_height, resized_width = height, width
315
+ processed_images = []
316
+ for image in images:
317
+ if do_resize:
318
+ resized_height, resized_width = target_size
319
+ image = resize(
320
+ image, size=(resized_height, resized_width), resample=resample, input_data_format=input_data_format
321
+ )
322
+
323
+ if do_rescale:
324
+ image = self.rescale(image, scale=rescale_factor, input_data_format=input_data_format)
325
+
326
+ if do_normalize:
327
+ image = self.normalize(
328
+ image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
329
+ )
330
+
331
+ image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
332
+ processed_images.append(image)
333
+
334
+ patches = np.array(processed_images)
335
+ if data_format == ChannelDimension.LAST:
336
+ patches = patches.transpose(0, 3, 1, 2)
337
+ t = patches.shape[0]
338
+ channel = patches.shape[1]
339
+ grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
340
+ patches = patches.reshape(
341
+ t,
342
+ channel,
343
+ grid_h // merge_size,
344
+ merge_size,
345
+ self.patch_size,
346
+ grid_w // merge_size,
347
+ merge_size,
348
+ self.patch_size,
349
+ )
350
+ patches = patches.transpose(0, 2, 5, 3, 6, 1, 4, 7)
351
+ flatten_patches = patches.reshape(
352
+ t * grid_h * grid_w, channel * self.patch_size * self.patch_size
353
+ )
354
+
355
+ return flatten_patches, (t, grid_h, grid_w)
356
+
357
+ def preprocess(
358
+ self,
359
+ images: ImageInput,
360
+ do_resize: bool = None,
361
+ resample: PILImageResampling = None,
362
+ do_rescale: bool = None,
363
+ rescale_factor: float = None,
364
+ do_normalize: bool = None,
365
+ image_mean: Optional[Union[float, List[float]]] = None,
366
+ image_std: Optional[Union[float, List[float]]] = None,
367
+ do_convert_rgb: bool = None,
368
+ merge_size: Optional[Union[int, List[int]]] = None,
369
+ return_tensors: Optional[Union[str, TensorType]] = None,
370
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
371
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
372
+ ):
373
+ """
374
+ Args:
375
+ images (`ImageInput`):
376
+ Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
377
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
378
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
379
+ Whether to resize the image.
380
+ resample (`int`, *optional*, defaults to `self.resample`):
381
+ Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
382
+ has an effect if `do_resize` is set to `True`.
383
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
384
+ Whether to rescale the image.
385
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
386
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
387
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+     Whether to normalize the image.
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+     Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+     Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
+     `True`.
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+     Whether to convert the image to RGB.
+ merge_size (`int` or `List[int]`, *optional*, defaults to `self.merge_size`):
+     The merge size for processing. Can be a single value or a list of values (one per image).
+ return_tensors (`str` or `TensorType`, *optional*):
+     The type of tensors to return. Can be one of:
+     - Unset: Return a list of `np.ndarray`.
+     - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+     - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+     - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+     - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+     The channel dimension format for the output image. Can be one of:
+     - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+     - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+     - Unset: Use the channel dimension format of the input image.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+     The channel dimension format for the input image. If unset, the channel dimension format is inferred
+     from the input image. Can be one of:
+     - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+     - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+     - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+ """
+ do_resize = do_resize if do_resize is not None else self.do_resize
+ resample = resample if resample is not None else self.resample
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+ image_mean = image_mean if image_mean is not None else self.image_mean
+ image_std = image_std if image_std is not None else self.image_std
+ do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+
+ # Handle merge_size: use provided value, or fall back to instance default, or use 1
+ if merge_size is None:
+     merge_size = self.merge_size if self.merge_size is not None else 1
+
+ images = make_batched_images(images)
+
+ if isinstance(merge_size, (list, tuple)):
+     assert len(merge_size) == len(images), "Merge size must be the same length as images."
+     merge_sizes = merge_size
+ else:
+     merge_sizes = [merge_size for _ in images]
+ if all(merge_size == merge_sizes[0] for merge_size in merge_sizes):
+     target_sizes = simple_batched_resize(
+         images,
+         factor=self.patch_size * merge_sizes[0],
+         min_tokens=self.min_tokens,
+         max_tokens=self.max_tokens,
+         input_data_format=input_data_format,
+     )
+ else:
+     target_sizes = batched_resize(
+         images,
+         factors=[self.patch_size * merge_size for merge_size in merge_sizes],
+         min_tokens=self.min_tokens,
+         max_tokens=self.max_tokens,
+         input_data_format=input_data_format,
+     )
+
+ pixel_values, grid_sizes = [], []
+ for image, merge_size, target_size in zip(images, merge_sizes, target_sizes):
+     patches, grid_size = self._preprocess(
+         image,
+         target_size=target_size,
+         merge_size=merge_size,
+         do_resize=do_resize,
+         resample=resample,
+         do_rescale=do_rescale,
+         rescale_factor=rescale_factor,
+         do_normalize=do_normalize,
+         image_mean=image_mean,
+         image_std=image_std,
+         data_format=data_format,
+         do_convert_rgb=do_convert_rgb,
+         input_data_format=input_data_format,
+     )
+     pixel_values.append(patches)
+     grid_sizes.append(grid_size)
+
+ pixel_values = np.concatenate(pixel_values, axis=0)
+ grid_sizes = np.array(grid_sizes)
+ merge_sizes = np.array(merge_sizes)
+
+ data = {
+     "pixel_values": pixel_values,
+     "grid_sizes": grid_sizes,
+     "merge_sizes": merge_sizes,
+ }
+
+ return BatchFeature(data=data, tensor_type=return_tensors)
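Note on the block above: `preprocess` concatenates every image's patches into one flat `pixel_values` array and keeps one `grid_sizes` row per image, which is how variable-resolution images end up sharing a single batch. The snippet below is a minimal sketch of that bookkeeping, not code from this upload; the assumption that the product of a `grid_sizes` row equals that image's patch count is inferred from the concatenation above.

```python
# Hypothetical sketch (not from this repo): split the flat pixel_values array
# produced by `preprocess` back into per-image patch groups using grid_sizes.
# Assumption: each grid_sizes row multiplies out to that image's patch count.
import numpy as np

def split_pixel_values(pixel_values: np.ndarray, grid_sizes: np.ndarray) -> list[np.ndarray]:
    """Split the concatenated patch array back into one array per image."""
    counts = [int(np.prod(grid)) for grid in grid_sizes]  # patches contributed by each image
    offsets = np.cumsum(counts)[:-1]                      # split points between images
    return np.split(pixel_values, offsets, axis=0)

# Toy example with two fake images: a 2x3 and a 4x4 patch grid, 8-dim patch vectors.
grid_sizes = np.array([[2, 3], [4, 4]])
pixel_values = np.random.rand(2 * 3 + 4 * 4, 8).astype(np.float32)

chunks = split_pixel_values(pixel_values, grid_sizes)
print([c.shape for c in chunks])  # [(6, 8), (16, 8)]
```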
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:360c4f7335d9cf1edd72f3f4eb0f95dcccbadcfb2bf964ac2c60e7aae39a93b8
+ size 5343777696
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6af174d7d5af3d13b926f8112871892b68e6a44710b74c94b4fa799227219e6f
+ size 5263077248
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f8d08aa83687fc5365c09223d42a1cb4059977aecb4d373a62852688652806b9
+ size 4392737312
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:95316e809197f95a3543696ae49a8ab3894c9d7eae20d8c0168406a10edbb63e
+ size 1089994848
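The four `.safetensors` entries above are Git LFS pointer files (a version line, the blob's `oid sha256:`, and its byte `size`); the actual shards live in LFS storage. The `model.safetensors.index.json` added next maps every parameter name to the shard that stores it. As an illustration only (the directory layout and this use of the `safetensors` API are assumptions about a typical sharded checkpoint, not something verified against this upload), a single tensor can be resolved through that index like so:

```python
# Hypothetical sketch: look up one tensor in a sharded safetensors checkpoint
# via the weight_map in model.safetensors.index.json.
# Assumes the LFS shards have already been pulled into `checkpoint_dir`.
import json
from pathlib import Path

from safetensors import safe_open  # pip install safetensors

def load_tensor(checkpoint_dir: str, name: str):
    index_path = Path(checkpoint_dir) / "model.safetensors.index.json"
    index = json.loads(index_path.read_text())
    shard_file = index["weight_map"][name]  # e.g. "model-00001-of-00004.safetensors"
    with safe_open(str(Path(checkpoint_dir) / shard_file), framework="pt") as f:
        return f.get_tensor(name)

# Example (requires the files locally; path and tensor name are illustrative):
# w = load_tensor("./HuLuLLM", "model.embed_tokens.weight")
# print(w.shape)
```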
model.safetensors.index.json ADDED
@@ -0,0 +1,786 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 16089489888
4
+ },
5
+ "weight_map": {
6
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
7
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
8
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
18
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
19
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
20
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
26
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
27
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
28
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
29
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
30
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
31
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
32
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
33
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
34
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
35
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
36
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
37
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
38
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
39
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
40
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
41
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
42
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
43
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
44
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
45
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
46
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
47
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
48
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
49
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
50
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
51
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
52
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
53
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
54
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
55
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
56
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
57
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
58
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
59
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
60
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
61
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
62
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
63
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
64
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
65
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
66
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
67
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
68
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
69
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
70
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
71
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
72
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
73
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
74
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
75
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
76
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
77
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
78
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
79
+ "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
80
+ "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
81
+ "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
82
+ "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
83
+ "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
84
+ "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
85
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
86
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
87
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
88
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
89
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
90
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
91
+ "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
92
+ "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
93
+ "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
94
+ "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
95
+ "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
96
+ "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
97
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
98
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
99
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
100
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
101
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
102
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
103
+ "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
104
+ "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
105
+ "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
106
+ "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
107
+ "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
108
+ "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
109
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
110
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
111
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
112
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
113
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
114
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
115
+ "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
116
+ "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
117
+ "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
118
+ "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
119
+ "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
120
+ "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
121
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
122
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
123
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
124
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
125
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
126
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
127
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
128
+ "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
129
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
130
+ "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
131
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
132
+ "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
133
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
134
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
135
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
136
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
137
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
138
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
139
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
140
+ "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
141
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
142
+ "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
143
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
144
+ "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
145
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
146
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
147
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
148
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
149
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
150
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
151
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
152
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
153
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
154
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
155
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
156
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
157
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
158
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
159
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
160
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
161
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
162
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
163
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
164
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
165
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
166
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
167
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
168
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
169
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
170
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
171
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
172
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
173
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
174
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
175
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
176
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
177
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
178
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
179
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
180
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
181
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
182
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
183
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
184
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
185
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
186
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
187
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
188
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
189
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
190
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
191
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
192
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
193
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
194
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
195
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
196
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
197
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
198
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
199
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
200
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
201
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
202
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
203
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
204
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
205
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
206
+ "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
207
+ "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
208
+ "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
209
+ "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
210
+ "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
211
+ "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
212
+ "model.layers.17.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
213
+ "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
214
+ "model.layers.17.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
215
+ "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
216
+ "model.layers.17.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
217
+ "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
218
+ "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
219
+ "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
220
+ "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
221
+ "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
222
+ "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
223
+ "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
224
+ "model.layers.18.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
225
+ "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
226
+ "model.layers.18.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
227
+ "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
228
+ "model.layers.18.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
229
+ "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
230
+ "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
231
+ "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
232
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
233
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
234
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
235
+ "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
236
+ "model.layers.19.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
237
+ "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
238
+ "model.layers.19.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
239
+ "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
240
+ "model.layers.19.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
241
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
242
+ "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
243
+ "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
244
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
245
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
246
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
247
+ "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
248
+ "model.layers.20.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
249
+ "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
250
+ "model.layers.20.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
251
+ "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
252
+ "model.layers.20.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
253
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
254
+ "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
255
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
256
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
257
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
258
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
259
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
260
+ "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
261
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
262
+ "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
263
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
264
+ "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
265
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
266
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
267
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
268
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
269
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
270
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
271
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
272
+ "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
273
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
274
+ "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
275
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
276
+ "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
277
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
278
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
279
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
280
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
281
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
282
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
283
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
284
+ "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
285
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
286
+ "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
287
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
288
+ "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
289
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
290
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
291
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
292
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
293
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
294
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
295
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
296
+ "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
297
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
298
+ "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
299
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
300
+ "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
301
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
302
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
303
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
304
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
305
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
306
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
307
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
308
+ "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
309
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
310
+ "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
311
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
312
+ "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
313
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
314
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
315
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
316
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
317
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
318
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
319
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
320
+ "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
321
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
322
+ "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
323
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
324
+ "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
325
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
326
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
327
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
328
+ "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
329
+ "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
330
+ "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
331
+ "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
332
+ "model.layers.27.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
333
+ "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
334
+ "model.layers.27.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
335
+ "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
336
+ "model.layers.27.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
337
+ "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
338
+ "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
339
+ "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
340
+ "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
341
+ "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
342
+ "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
343
+ "model.norm.weight": "model-00003-of-00004.safetensors",
344
+ "model.vision_encoder.embeddings.patch_embedding.weight": "model-00003-of-00004.safetensors",
345
+ "model.vision_encoder.embeddings.patch_embedding.bias": "model-00003-of-00004.safetensors",
346
+ "model.vision_encoder.encoder.layers.0.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
347
+ "model.vision_encoder.encoder.layers.0.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
348
+ "model.vision_encoder.encoder.layers.0.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
349
+ "model.vision_encoder.encoder.layers.0.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
350
+ "model.vision_encoder.encoder.layers.0.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
351
+ "model.vision_encoder.encoder.layers.0.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
352
+ "model.vision_encoder.encoder.layers.0.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
353
+ "model.vision_encoder.encoder.layers.0.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
354
+ "model.vision_encoder.encoder.layers.0.layer_norm1.weight": "model-00003-of-00004.safetensors",
355
+ "model.vision_encoder.encoder.layers.0.layer_norm1.bias": "model-00003-of-00004.safetensors",
356
+ "model.vision_encoder.encoder.layers.0.mlp.fc1.weight": "model-00003-of-00004.safetensors",
357
+ "model.vision_encoder.encoder.layers.0.mlp.fc1.bias": "model-00003-of-00004.safetensors",
358
+ "model.vision_encoder.encoder.layers.0.mlp.fc2.weight": "model-00003-of-00004.safetensors",
359
+ "model.vision_encoder.encoder.layers.0.mlp.fc2.bias": "model-00003-of-00004.safetensors",
360
+ "model.vision_encoder.encoder.layers.0.layer_norm2.weight": "model-00003-of-00004.safetensors",
361
+ "model.vision_encoder.encoder.layers.0.layer_norm2.bias": "model-00003-of-00004.safetensors",
362
+ "model.vision_encoder.encoder.layers.1.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
363
+ "model.vision_encoder.encoder.layers.1.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
364
+ "model.vision_encoder.encoder.layers.1.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
365
+ "model.vision_encoder.encoder.layers.1.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
366
+ "model.vision_encoder.encoder.layers.1.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
367
+ "model.vision_encoder.encoder.layers.1.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
368
+ "model.vision_encoder.encoder.layers.1.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
369
+ "model.vision_encoder.encoder.layers.1.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
370
+ "model.vision_encoder.encoder.layers.1.layer_norm1.weight": "model-00003-of-00004.safetensors",
371
+ "model.vision_encoder.encoder.layers.1.layer_norm1.bias": "model-00003-of-00004.safetensors",
372
+ "model.vision_encoder.encoder.layers.1.mlp.fc1.weight": "model-00003-of-00004.safetensors",
373
+ "model.vision_encoder.encoder.layers.1.mlp.fc1.bias": "model-00003-of-00004.safetensors",
374
+ "model.vision_encoder.encoder.layers.1.mlp.fc2.weight": "model-00003-of-00004.safetensors",
375
+ "model.vision_encoder.encoder.layers.1.mlp.fc2.bias": "model-00003-of-00004.safetensors",
376
+ "model.vision_encoder.encoder.layers.1.layer_norm2.weight": "model-00003-of-00004.safetensors",
377
+ "model.vision_encoder.encoder.layers.1.layer_norm2.bias": "model-00003-of-00004.safetensors",
378
+ "model.vision_encoder.encoder.layers.2.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
379
+ "model.vision_encoder.encoder.layers.2.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
380
+ "model.vision_encoder.encoder.layers.2.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
381
+ "model.vision_encoder.encoder.layers.2.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
382
+ "model.vision_encoder.encoder.layers.2.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
383
+ "model.vision_encoder.encoder.layers.2.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
384
+ "model.vision_encoder.encoder.layers.2.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
385
+ "model.vision_encoder.encoder.layers.2.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
386
+ "model.vision_encoder.encoder.layers.2.layer_norm1.weight": "model-00003-of-00004.safetensors",
387
+ "model.vision_encoder.encoder.layers.2.layer_norm1.bias": "model-00003-of-00004.safetensors",
388
+ "model.vision_encoder.encoder.layers.2.mlp.fc1.weight": "model-00003-of-00004.safetensors",
389
+ "model.vision_encoder.encoder.layers.2.mlp.fc1.bias": "model-00003-of-00004.safetensors",
390
+ "model.vision_encoder.encoder.layers.2.mlp.fc2.weight": "model-00003-of-00004.safetensors",
391
+ "model.vision_encoder.encoder.layers.2.mlp.fc2.bias": "model-00003-of-00004.safetensors",
392
+ "model.vision_encoder.encoder.layers.2.layer_norm2.weight": "model-00003-of-00004.safetensors",
393
+ "model.vision_encoder.encoder.layers.2.layer_norm2.bias": "model-00003-of-00004.safetensors",
394
+ "model.vision_encoder.encoder.layers.3.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
395
+ "model.vision_encoder.encoder.layers.3.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
396
+ "model.vision_encoder.encoder.layers.3.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
397
+ "model.vision_encoder.encoder.layers.3.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
398
+ "model.vision_encoder.encoder.layers.3.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
399
+ "model.vision_encoder.encoder.layers.3.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
400
+ "model.vision_encoder.encoder.layers.3.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
401
+ "model.vision_encoder.encoder.layers.3.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
402
+ "model.vision_encoder.encoder.layers.3.layer_norm1.weight": "model-00003-of-00004.safetensors",
403
+ "model.vision_encoder.encoder.layers.3.layer_norm1.bias": "model-00003-of-00004.safetensors",
404
+ "model.vision_encoder.encoder.layers.3.mlp.fc1.weight": "model-00003-of-00004.safetensors",
405
+ "model.vision_encoder.encoder.layers.3.mlp.fc1.bias": "model-00003-of-00004.safetensors",
406
+ "model.vision_encoder.encoder.layers.3.mlp.fc2.weight": "model-00003-of-00004.safetensors",
407
+ "model.vision_encoder.encoder.layers.3.mlp.fc2.bias": "model-00003-of-00004.safetensors",
408
+ "model.vision_encoder.encoder.layers.3.layer_norm2.weight": "model-00003-of-00004.safetensors",
409
+ "model.vision_encoder.encoder.layers.3.layer_norm2.bias": "model-00003-of-00004.safetensors",
410
+ "model.vision_encoder.encoder.layers.4.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
411
+ "model.vision_encoder.encoder.layers.4.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
412
+ "model.vision_encoder.encoder.layers.4.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
413
+ "model.vision_encoder.encoder.layers.4.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
414
+ "model.vision_encoder.encoder.layers.4.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
415
+ "model.vision_encoder.encoder.layers.4.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
416
+ "model.vision_encoder.encoder.layers.4.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
417
+ "model.vision_encoder.encoder.layers.4.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
418
+ "model.vision_encoder.encoder.layers.4.layer_norm1.weight": "model-00003-of-00004.safetensors",
419
+ "model.vision_encoder.encoder.layers.4.layer_norm1.bias": "model-00003-of-00004.safetensors",
420
+ "model.vision_encoder.encoder.layers.4.mlp.fc1.weight": "model-00003-of-00004.safetensors",
421
+ "model.vision_encoder.encoder.layers.4.mlp.fc1.bias": "model-00003-of-00004.safetensors",
422
+ "model.vision_encoder.encoder.layers.4.mlp.fc2.weight": "model-00003-of-00004.safetensors",
423
+ "model.vision_encoder.encoder.layers.4.mlp.fc2.bias": "model-00003-of-00004.safetensors",
424
+ "model.vision_encoder.encoder.layers.4.layer_norm2.weight": "model-00003-of-00004.safetensors",
425
+ "model.vision_encoder.encoder.layers.4.layer_norm2.bias": "model-00003-of-00004.safetensors",
426
+ "model.vision_encoder.encoder.layers.5.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
427
+ "model.vision_encoder.encoder.layers.5.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
428
+ "model.vision_encoder.encoder.layers.5.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
429
+ "model.vision_encoder.encoder.layers.5.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
430
+ "model.vision_encoder.encoder.layers.5.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
431
+ "model.vision_encoder.encoder.layers.5.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
432
+ "model.vision_encoder.encoder.layers.5.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
433
+ "model.vision_encoder.encoder.layers.5.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
434
+ "model.vision_encoder.encoder.layers.5.layer_norm1.weight": "model-00003-of-00004.safetensors",
435
+ "model.vision_encoder.encoder.layers.5.layer_norm1.bias": "model-00003-of-00004.safetensors",
436
+ "model.vision_encoder.encoder.layers.5.mlp.fc1.weight": "model-00003-of-00004.safetensors",
437
+ "model.vision_encoder.encoder.layers.5.mlp.fc1.bias": "model-00003-of-00004.safetensors",
438
+ "model.vision_encoder.encoder.layers.5.mlp.fc2.weight": "model-00003-of-00004.safetensors",
439
+ "model.vision_encoder.encoder.layers.5.mlp.fc2.bias": "model-00003-of-00004.safetensors",
440
+ "model.vision_encoder.encoder.layers.5.layer_norm2.weight": "model-00003-of-00004.safetensors",
441
+ "model.vision_encoder.encoder.layers.5.layer_norm2.bias": "model-00003-of-00004.safetensors",
442
+ "model.vision_encoder.encoder.layers.6.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
443
+ "model.vision_encoder.encoder.layers.6.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
444
+ "model.vision_encoder.encoder.layers.6.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
445
+ "model.vision_encoder.encoder.layers.6.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
446
+ "model.vision_encoder.encoder.layers.6.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
447
+ "model.vision_encoder.encoder.layers.6.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
448
+ "model.vision_encoder.encoder.layers.6.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
449
+ "model.vision_encoder.encoder.layers.6.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
450
+ "model.vision_encoder.encoder.layers.6.layer_norm1.weight": "model-00003-of-00004.safetensors",
451
+ "model.vision_encoder.encoder.layers.6.layer_norm1.bias": "model-00003-of-00004.safetensors",
452
+ "model.vision_encoder.encoder.layers.6.mlp.fc1.weight": "model-00003-of-00004.safetensors",
453
+ "model.vision_encoder.encoder.layers.6.mlp.fc1.bias": "model-00003-of-00004.safetensors",
454
+ "model.vision_encoder.encoder.layers.6.mlp.fc2.weight": "model-00003-of-00004.safetensors",
455
+ "model.vision_encoder.encoder.layers.6.mlp.fc2.bias": "model-00003-of-00004.safetensors",
456
+ "model.vision_encoder.encoder.layers.6.layer_norm2.weight": "model-00003-of-00004.safetensors",
457
+ "model.vision_encoder.encoder.layers.6.layer_norm2.bias": "model-00003-of-00004.safetensors",
458
+ "model.vision_encoder.encoder.layers.7.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
459
+ "model.vision_encoder.encoder.layers.7.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
460
+ "model.vision_encoder.encoder.layers.7.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
461
+ "model.vision_encoder.encoder.layers.7.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
462
+ "model.vision_encoder.encoder.layers.7.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
463
+ "model.vision_encoder.encoder.layers.7.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
464
+ "model.vision_encoder.encoder.layers.7.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
465
+ "model.vision_encoder.encoder.layers.7.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
466
+ "model.vision_encoder.encoder.layers.7.layer_norm1.weight": "model-00003-of-00004.safetensors",
467
+ "model.vision_encoder.encoder.layers.7.layer_norm1.bias": "model-00003-of-00004.safetensors",
468
+ "model.vision_encoder.encoder.layers.7.mlp.fc1.weight": "model-00003-of-00004.safetensors",
469
+ "model.vision_encoder.encoder.layers.7.mlp.fc1.bias": "model-00003-of-00004.safetensors",
470
+ "model.vision_encoder.encoder.layers.7.mlp.fc2.weight": "model-00003-of-00004.safetensors",
471
+ "model.vision_encoder.encoder.layers.7.mlp.fc2.bias": "model-00003-of-00004.safetensors",
472
+ "model.vision_encoder.encoder.layers.7.layer_norm2.weight": "model-00003-of-00004.safetensors",
473
+ "model.vision_encoder.encoder.layers.7.layer_norm2.bias": "model-00003-of-00004.safetensors",
474
+ "model.vision_encoder.encoder.layers.8.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
475
+ "model.vision_encoder.encoder.layers.8.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
476
+ "model.vision_encoder.encoder.layers.8.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
477
+ "model.vision_encoder.encoder.layers.8.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
478
+ "model.vision_encoder.encoder.layers.8.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
479
+ "model.vision_encoder.encoder.layers.8.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
480
+ "model.vision_encoder.encoder.layers.8.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
481
+ "model.vision_encoder.encoder.layers.8.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
482
+ "model.vision_encoder.encoder.layers.8.layer_norm1.weight": "model-00003-of-00004.safetensors",
483
+ "model.vision_encoder.encoder.layers.8.layer_norm1.bias": "model-00003-of-00004.safetensors",
484
+ "model.vision_encoder.encoder.layers.8.mlp.fc1.weight": "model-00003-of-00004.safetensors",
485
+ "model.vision_encoder.encoder.layers.8.mlp.fc1.bias": "model-00003-of-00004.safetensors",
486
+ "model.vision_encoder.encoder.layers.8.mlp.fc2.weight": "model-00003-of-00004.safetensors",
487
+ "model.vision_encoder.encoder.layers.8.mlp.fc2.bias": "model-00003-of-00004.safetensors",
488
+ "model.vision_encoder.encoder.layers.8.layer_norm2.weight": "model-00003-of-00004.safetensors",
489
+ "model.vision_encoder.encoder.layers.8.layer_norm2.bias": "model-00003-of-00004.safetensors",
490
+ "model.vision_encoder.encoder.layers.9.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
491
+ "model.vision_encoder.encoder.layers.9.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
492
+ "model.vision_encoder.encoder.layers.9.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
493
+ "model.vision_encoder.encoder.layers.9.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
494
+ "model.vision_encoder.encoder.layers.9.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
495
+ "model.vision_encoder.encoder.layers.9.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
496
+ "model.vision_encoder.encoder.layers.9.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
497
+ "model.vision_encoder.encoder.layers.9.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
498
+ "model.vision_encoder.encoder.layers.9.layer_norm1.weight": "model-00003-of-00004.safetensors",
499
+ "model.vision_encoder.encoder.layers.9.layer_norm1.bias": "model-00003-of-00004.safetensors",
500
+ "model.vision_encoder.encoder.layers.9.mlp.fc1.weight": "model-00003-of-00004.safetensors",
501
+ "model.vision_encoder.encoder.layers.9.mlp.fc1.bias": "model-00003-of-00004.safetensors",
502
+ "model.vision_encoder.encoder.layers.9.mlp.fc2.weight": "model-00003-of-00004.safetensors",
503
+ "model.vision_encoder.encoder.layers.9.mlp.fc2.bias": "model-00003-of-00004.safetensors",
504
+ "model.vision_encoder.encoder.layers.9.layer_norm2.weight": "model-00003-of-00004.safetensors",
505
+ "model.vision_encoder.encoder.layers.9.layer_norm2.bias": "model-00003-of-00004.safetensors",
506
+ "model.vision_encoder.encoder.layers.10.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
507
+ "model.vision_encoder.encoder.layers.10.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
508
+ "model.vision_encoder.encoder.layers.10.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
509
+ "model.vision_encoder.encoder.layers.10.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
510
+ "model.vision_encoder.encoder.layers.10.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
511
+ "model.vision_encoder.encoder.layers.10.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
512
+ "model.vision_encoder.encoder.layers.10.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
513
+ "model.vision_encoder.encoder.layers.10.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
514
+ "model.vision_encoder.encoder.layers.10.layer_norm1.weight": "model-00003-of-00004.safetensors",
515
+ "model.vision_encoder.encoder.layers.10.layer_norm1.bias": "model-00003-of-00004.safetensors",
516
+ "model.vision_encoder.encoder.layers.10.mlp.fc1.weight": "model-00003-of-00004.safetensors",
517
+ "model.vision_encoder.encoder.layers.10.mlp.fc1.bias": "model-00003-of-00004.safetensors",
518
+ "model.vision_encoder.encoder.layers.10.mlp.fc2.weight": "model-00003-of-00004.safetensors",
519
+ "model.vision_encoder.encoder.layers.10.mlp.fc2.bias": "model-00003-of-00004.safetensors",
520
+ "model.vision_encoder.encoder.layers.10.layer_norm2.weight": "model-00003-of-00004.safetensors",
521
+ "model.vision_encoder.encoder.layers.10.layer_norm2.bias": "model-00003-of-00004.safetensors",
522
+ "model.vision_encoder.encoder.layers.11.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
523
+ "model.vision_encoder.encoder.layers.11.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
524
+ "model.vision_encoder.encoder.layers.11.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
525
+ "model.vision_encoder.encoder.layers.11.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
526
+ "model.vision_encoder.encoder.layers.11.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
527
+ "model.vision_encoder.encoder.layers.11.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
528
+ "model.vision_encoder.encoder.layers.11.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
529
+ "model.vision_encoder.encoder.layers.11.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
530
+ "model.vision_encoder.encoder.layers.11.layer_norm1.weight": "model-00003-of-00004.safetensors",
531
+ "model.vision_encoder.encoder.layers.11.layer_norm1.bias": "model-00003-of-00004.safetensors",
532
+ "model.vision_encoder.encoder.layers.11.mlp.fc1.weight": "model-00003-of-00004.safetensors",
533
+ "model.vision_encoder.encoder.layers.11.mlp.fc1.bias": "model-00003-of-00004.safetensors",
534
+ "model.vision_encoder.encoder.layers.11.mlp.fc2.weight": "model-00003-of-00004.safetensors",
535
+ "model.vision_encoder.encoder.layers.11.mlp.fc2.bias": "model-00003-of-00004.safetensors",
536
+ "model.vision_encoder.encoder.layers.11.layer_norm2.weight": "model-00003-of-00004.safetensors",
537
+ "model.vision_encoder.encoder.layers.11.layer_norm2.bias": "model-00003-of-00004.safetensors",
538
+ "model.vision_encoder.encoder.layers.12.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
539
+ "model.vision_encoder.encoder.layers.12.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
540
+ "model.vision_encoder.encoder.layers.12.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
541
+ "model.vision_encoder.encoder.layers.12.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
542
+ "model.vision_encoder.encoder.layers.12.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
543
+ "model.vision_encoder.encoder.layers.12.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
544
+ "model.vision_encoder.encoder.layers.12.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
545
+ "model.vision_encoder.encoder.layers.12.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
546
+ "model.vision_encoder.encoder.layers.12.layer_norm1.weight": "model-00003-of-00004.safetensors",
547
+ "model.vision_encoder.encoder.layers.12.layer_norm1.bias": "model-00003-of-00004.safetensors",
548
+ "model.vision_encoder.encoder.layers.12.mlp.fc1.weight": "model-00003-of-00004.safetensors",
549
+ "model.vision_encoder.encoder.layers.12.mlp.fc1.bias": "model-00003-of-00004.safetensors",
550
+ "model.vision_encoder.encoder.layers.12.mlp.fc2.weight": "model-00003-of-00004.safetensors",
551
+ "model.vision_encoder.encoder.layers.12.mlp.fc2.bias": "model-00003-of-00004.safetensors",
552
+ "model.vision_encoder.encoder.layers.12.layer_norm2.weight": "model-00003-of-00004.safetensors",
553
+ "model.vision_encoder.encoder.layers.12.layer_norm2.bias": "model-00003-of-00004.safetensors",
554
+ "model.vision_encoder.encoder.layers.13.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
555
+ "model.vision_encoder.encoder.layers.13.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
556
+ "model.vision_encoder.encoder.layers.13.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
557
+ "model.vision_encoder.encoder.layers.13.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
558
+ "model.vision_encoder.encoder.layers.13.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
559
+ "model.vision_encoder.encoder.layers.13.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
560
+ "model.vision_encoder.encoder.layers.13.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
561
+ "model.vision_encoder.encoder.layers.13.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
562
+ "model.vision_encoder.encoder.layers.13.layer_norm1.weight": "model-00003-of-00004.safetensors",
563
+ "model.vision_encoder.encoder.layers.13.layer_norm1.bias": "model-00003-of-00004.safetensors",
564
+ "model.vision_encoder.encoder.layers.13.mlp.fc1.weight": "model-00003-of-00004.safetensors",
565
+ "model.vision_encoder.encoder.layers.13.mlp.fc1.bias": "model-00003-of-00004.safetensors",
566
+ "model.vision_encoder.encoder.layers.13.mlp.fc2.weight": "model-00003-of-00004.safetensors",
567
+ "model.vision_encoder.encoder.layers.13.mlp.fc2.bias": "model-00003-of-00004.safetensors",
568
+ "model.vision_encoder.encoder.layers.13.layer_norm2.weight": "model-00003-of-00004.safetensors",
569
+ "model.vision_encoder.encoder.layers.13.layer_norm2.bias": "model-00003-of-00004.safetensors",
570
+ "model.vision_encoder.encoder.layers.14.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
571
+ "model.vision_encoder.encoder.layers.14.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
572
+ "model.vision_encoder.encoder.layers.14.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
573
+ "model.vision_encoder.encoder.layers.14.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
574
+ "model.vision_encoder.encoder.layers.14.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
575
+ "model.vision_encoder.encoder.layers.14.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
576
+ "model.vision_encoder.encoder.layers.14.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
577
+ "model.vision_encoder.encoder.layers.14.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
578
+ "model.vision_encoder.encoder.layers.14.layer_norm1.weight": "model-00003-of-00004.safetensors",
579
+ "model.vision_encoder.encoder.layers.14.layer_norm1.bias": "model-00003-of-00004.safetensors",
580
+ "model.vision_encoder.encoder.layers.14.mlp.fc1.weight": "model-00003-of-00004.safetensors",
581
+ "model.vision_encoder.encoder.layers.14.mlp.fc1.bias": "model-00003-of-00004.safetensors",
582
+ "model.vision_encoder.encoder.layers.14.mlp.fc2.weight": "model-00003-of-00004.safetensors",
583
+ "model.vision_encoder.encoder.layers.14.mlp.fc2.bias": "model-00003-of-00004.safetensors",
584
+ "model.vision_encoder.encoder.layers.14.layer_norm2.weight": "model-00003-of-00004.safetensors",
585
+ "model.vision_encoder.encoder.layers.14.layer_norm2.bias": "model-00003-of-00004.safetensors",
586
+ "model.vision_encoder.encoder.layers.15.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
587
+ "model.vision_encoder.encoder.layers.15.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
588
+ "model.vision_encoder.encoder.layers.15.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
589
+ "model.vision_encoder.encoder.layers.15.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
590
+ "model.vision_encoder.encoder.layers.15.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
591
+ "model.vision_encoder.encoder.layers.15.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
592
+ "model.vision_encoder.encoder.layers.15.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
593
+ "model.vision_encoder.encoder.layers.15.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
594
+ "model.vision_encoder.encoder.layers.15.layer_norm1.weight": "model-00003-of-00004.safetensors",
595
+ "model.vision_encoder.encoder.layers.15.layer_norm1.bias": "model-00003-of-00004.safetensors",
596
+ "model.vision_encoder.encoder.layers.15.mlp.fc1.weight": "model-00003-of-00004.safetensors",
597
+ "model.vision_encoder.encoder.layers.15.mlp.fc1.bias": "model-00003-of-00004.safetensors",
598
+ "model.vision_encoder.encoder.layers.15.mlp.fc2.weight": "model-00003-of-00004.safetensors",
599
+ "model.vision_encoder.encoder.layers.15.mlp.fc2.bias": "model-00003-of-00004.safetensors",
600
+ "model.vision_encoder.encoder.layers.15.layer_norm2.weight": "model-00003-of-00004.safetensors",
601
+ "model.vision_encoder.encoder.layers.15.layer_norm2.bias": "model-00003-of-00004.safetensors",
602
+ "model.vision_encoder.encoder.layers.16.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
603
+ "model.vision_encoder.encoder.layers.16.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
604
+ "model.vision_encoder.encoder.layers.16.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
605
+ "model.vision_encoder.encoder.layers.16.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
606
+ "model.vision_encoder.encoder.layers.16.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
607
+ "model.vision_encoder.encoder.layers.16.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
608
+ "model.vision_encoder.encoder.layers.16.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
609
+ "model.vision_encoder.encoder.layers.16.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
610
+ "model.vision_encoder.encoder.layers.16.layer_norm1.weight": "model-00003-of-00004.safetensors",
611
+ "model.vision_encoder.encoder.layers.16.layer_norm1.bias": "model-00003-of-00004.safetensors",
612
+ "model.vision_encoder.encoder.layers.16.mlp.fc1.weight": "model-00003-of-00004.safetensors",
613
+ "model.vision_encoder.encoder.layers.16.mlp.fc1.bias": "model-00003-of-00004.safetensors",
614
+ "model.vision_encoder.encoder.layers.16.mlp.fc2.weight": "model-00003-of-00004.safetensors",
615
+ "model.vision_encoder.encoder.layers.16.mlp.fc2.bias": "model-00003-of-00004.safetensors",
616
+ "model.vision_encoder.encoder.layers.16.layer_norm2.weight": "model-00003-of-00004.safetensors",
617
+ "model.vision_encoder.encoder.layers.16.layer_norm2.bias": "model-00003-of-00004.safetensors",
618
+ "model.vision_encoder.encoder.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
619
+ "model.vision_encoder.encoder.layers.17.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
620
+ "model.vision_encoder.encoder.layers.17.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
621
+ "model.vision_encoder.encoder.layers.17.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
622
+ "model.vision_encoder.encoder.layers.17.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
623
+ "model.vision_encoder.encoder.layers.17.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
624
+ "model.vision_encoder.encoder.layers.17.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
625
+ "model.vision_encoder.encoder.layers.17.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
626
+ "model.vision_encoder.encoder.layers.17.layer_norm1.weight": "model-00003-of-00004.safetensors",
627
+ "model.vision_encoder.encoder.layers.17.layer_norm1.bias": "model-00003-of-00004.safetensors",
628
+ "model.vision_encoder.encoder.layers.17.mlp.fc1.weight": "model-00003-of-00004.safetensors",
629
+ "model.vision_encoder.encoder.layers.17.mlp.fc1.bias": "model-00003-of-00004.safetensors",
630
+ "model.vision_encoder.encoder.layers.17.mlp.fc2.weight": "model-00003-of-00004.safetensors",
631
+ "model.vision_encoder.encoder.layers.17.mlp.fc2.bias": "model-00003-of-00004.safetensors",
632
+ "model.vision_encoder.encoder.layers.17.layer_norm2.weight": "model-00003-of-00004.safetensors",
633
+ "model.vision_encoder.encoder.layers.17.layer_norm2.bias": "model-00003-of-00004.safetensors",
634
+ "model.vision_encoder.encoder.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
635
+ "model.vision_encoder.encoder.layers.18.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
636
+ "model.vision_encoder.encoder.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
637
+ "model.vision_encoder.encoder.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
638
+ "model.vision_encoder.encoder.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
639
+ "model.vision_encoder.encoder.layers.18.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
640
+ "model.vision_encoder.encoder.layers.18.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
641
+ "model.vision_encoder.encoder.layers.18.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
642
+ "model.vision_encoder.encoder.layers.18.layer_norm1.weight": "model-00003-of-00004.safetensors",
643
+ "model.vision_encoder.encoder.layers.18.layer_norm1.bias": "model-00003-of-00004.safetensors",
644
+ "model.vision_encoder.encoder.layers.18.mlp.fc1.weight": "model-00003-of-00004.safetensors",
645
+ "model.vision_encoder.encoder.layers.18.mlp.fc1.bias": "model-00003-of-00004.safetensors",
646
+ "model.vision_encoder.encoder.layers.18.mlp.fc2.weight": "model-00003-of-00004.safetensors",
647
+ "model.vision_encoder.encoder.layers.18.mlp.fc2.bias": "model-00003-of-00004.safetensors",
648
+ "model.vision_encoder.encoder.layers.18.layer_norm2.weight": "model-00003-of-00004.safetensors",
649
+ "model.vision_encoder.encoder.layers.18.layer_norm2.bias": "model-00003-of-00004.safetensors",
650
+ "model.vision_encoder.encoder.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
651
+ "model.vision_encoder.encoder.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
652
+ "model.vision_encoder.encoder.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
653
+ "model.vision_encoder.encoder.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
654
+ "model.vision_encoder.encoder.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
655
+ "model.vision_encoder.encoder.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
656
+ "model.vision_encoder.encoder.layers.19.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
657
+ "model.vision_encoder.encoder.layers.19.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
658
+ "model.vision_encoder.encoder.layers.19.layer_norm1.weight": "model-00003-of-00004.safetensors",
659
+ "model.vision_encoder.encoder.layers.19.layer_norm1.bias": "model-00003-of-00004.safetensors",
660
+ "model.vision_encoder.encoder.layers.19.mlp.fc1.weight": "model-00003-of-00004.safetensors",
661
+ "model.vision_encoder.encoder.layers.19.mlp.fc1.bias": "model-00003-of-00004.safetensors",
662
+ "model.vision_encoder.encoder.layers.19.mlp.fc2.weight": "model-00003-of-00004.safetensors",
663
+ "model.vision_encoder.encoder.layers.19.mlp.fc2.bias": "model-00003-of-00004.safetensors",
664
+ "model.vision_encoder.encoder.layers.19.layer_norm2.weight": "model-00003-of-00004.safetensors",
665
+ "model.vision_encoder.encoder.layers.19.layer_norm2.bias": "model-00003-of-00004.safetensors",
666
+ "model.vision_encoder.encoder.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
667
+ "model.vision_encoder.encoder.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
668
+ "model.vision_encoder.encoder.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
669
+ "model.vision_encoder.encoder.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
670
+ "model.vision_encoder.encoder.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
671
+ "model.vision_encoder.encoder.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
672
+ "model.vision_encoder.encoder.layers.20.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
673
+ "model.vision_encoder.encoder.layers.20.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
674
+ "model.vision_encoder.encoder.layers.20.layer_norm1.weight": "model-00003-of-00004.safetensors",
675
+ "model.vision_encoder.encoder.layers.20.layer_norm1.bias": "model-00003-of-00004.safetensors",
676
+ "model.vision_encoder.encoder.layers.20.mlp.fc1.weight": "model-00003-of-00004.safetensors",
677
+ "model.vision_encoder.encoder.layers.20.mlp.fc1.bias": "model-00003-of-00004.safetensors",
678
+ "model.vision_encoder.encoder.layers.20.mlp.fc2.weight": "model-00003-of-00004.safetensors",
679
+ "model.vision_encoder.encoder.layers.20.mlp.fc2.bias": "model-00003-of-00004.safetensors",
680
+ "model.vision_encoder.encoder.layers.20.layer_norm2.weight": "model-00003-of-00004.safetensors",
681
+ "model.vision_encoder.encoder.layers.20.layer_norm2.bias": "model-00003-of-00004.safetensors",
682
+ "model.vision_encoder.encoder.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
683
+ "model.vision_encoder.encoder.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
684
+ "model.vision_encoder.encoder.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
685
+ "model.vision_encoder.encoder.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
686
+ "model.vision_encoder.encoder.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
687
+ "model.vision_encoder.encoder.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
688
+ "model.vision_encoder.encoder.layers.21.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
689
+ "model.vision_encoder.encoder.layers.21.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
690
+ "model.vision_encoder.encoder.layers.21.layer_norm1.weight": "model-00003-of-00004.safetensors",
691
+ "model.vision_encoder.encoder.layers.21.layer_norm1.bias": "model-00003-of-00004.safetensors",
692
+ "model.vision_encoder.encoder.layers.21.mlp.fc1.weight": "model-00003-of-00004.safetensors",
693
+ "model.vision_encoder.encoder.layers.21.mlp.fc1.bias": "model-00003-of-00004.safetensors",
694
+ "model.vision_encoder.encoder.layers.21.mlp.fc2.weight": "model-00003-of-00004.safetensors",
695
+ "model.vision_encoder.encoder.layers.21.mlp.fc2.bias": "model-00003-of-00004.safetensors",
696
+ "model.vision_encoder.encoder.layers.21.layer_norm2.weight": "model-00003-of-00004.safetensors",
697
+ "model.vision_encoder.encoder.layers.21.layer_norm2.bias": "model-00003-of-00004.safetensors",
698
+ "model.vision_encoder.encoder.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
699
+ "model.vision_encoder.encoder.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
700
+ "model.vision_encoder.encoder.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
701
+ "model.vision_encoder.encoder.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
702
+ "model.vision_encoder.encoder.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
703
+ "model.vision_encoder.encoder.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
704
+ "model.vision_encoder.encoder.layers.22.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
705
+ "model.vision_encoder.encoder.layers.22.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
706
+ "model.vision_encoder.encoder.layers.22.layer_norm1.weight": "model-00003-of-00004.safetensors",
707
+ "model.vision_encoder.encoder.layers.22.layer_norm1.bias": "model-00003-of-00004.safetensors",
708
+ "model.vision_encoder.encoder.layers.22.mlp.fc1.weight": "model-00003-of-00004.safetensors",
709
+ "model.vision_encoder.encoder.layers.22.mlp.fc1.bias": "model-00003-of-00004.safetensors",
710
+ "model.vision_encoder.encoder.layers.22.mlp.fc2.weight": "model-00003-of-00004.safetensors",
711
+ "model.vision_encoder.encoder.layers.22.mlp.fc2.bias": "model-00003-of-00004.safetensors",
712
+ "model.vision_encoder.encoder.layers.22.layer_norm2.weight": "model-00003-of-00004.safetensors",
713
+ "model.vision_encoder.encoder.layers.22.layer_norm2.bias": "model-00003-of-00004.safetensors",
714
+ "model.vision_encoder.encoder.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
715
+ "model.vision_encoder.encoder.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
716
+ "model.vision_encoder.encoder.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
717
+ "model.vision_encoder.encoder.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
718
+ "model.vision_encoder.encoder.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
719
+ "model.vision_encoder.encoder.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
720
+ "model.vision_encoder.encoder.layers.23.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
721
+ "model.vision_encoder.encoder.layers.23.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
722
+ "model.vision_encoder.encoder.layers.23.layer_norm1.weight": "model-00003-of-00004.safetensors",
723
+ "model.vision_encoder.encoder.layers.23.layer_norm1.bias": "model-00003-of-00004.safetensors",
724
+ "model.vision_encoder.encoder.layers.23.mlp.fc1.weight": "model-00003-of-00004.safetensors",
725
+ "model.vision_encoder.encoder.layers.23.mlp.fc1.bias": "model-00003-of-00004.safetensors",
726
+ "model.vision_encoder.encoder.layers.23.mlp.fc2.weight": "model-00003-of-00004.safetensors",
727
+ "model.vision_encoder.encoder.layers.23.mlp.fc2.bias": "model-00003-of-00004.safetensors",
728
+ "model.vision_encoder.encoder.layers.23.layer_norm2.weight": "model-00003-of-00004.safetensors",
729
+ "model.vision_encoder.encoder.layers.23.layer_norm2.bias": "model-00003-of-00004.safetensors",
730
+ "model.vision_encoder.encoder.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
731
+ "model.vision_encoder.encoder.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
732
+ "model.vision_encoder.encoder.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
733
+ "model.vision_encoder.encoder.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
734
+ "model.vision_encoder.encoder.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
735
+ "model.vision_encoder.encoder.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
736
+ "model.vision_encoder.encoder.layers.24.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
737
+ "model.vision_encoder.encoder.layers.24.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
738
+ "model.vision_encoder.encoder.layers.24.layer_norm1.weight": "model-00003-of-00004.safetensors",
739
+ "model.vision_encoder.encoder.layers.24.layer_norm1.bias": "model-00003-of-00004.safetensors",
740
+ "model.vision_encoder.encoder.layers.24.mlp.fc1.weight": "model-00003-of-00004.safetensors",
741
+ "model.vision_encoder.encoder.layers.24.mlp.fc1.bias": "model-00003-of-00004.safetensors",
742
+ "model.vision_encoder.encoder.layers.24.mlp.fc2.weight": "model-00003-of-00004.safetensors",
743
+ "model.vision_encoder.encoder.layers.24.mlp.fc2.bias": "model-00003-of-00004.safetensors",
744
+ "model.vision_encoder.encoder.layers.24.layer_norm2.weight": "model-00003-of-00004.safetensors",
745
+ "model.vision_encoder.encoder.layers.24.layer_norm2.bias": "model-00003-of-00004.safetensors",
746
+ "model.vision_encoder.encoder.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
747
+ "model.vision_encoder.encoder.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
748
+ "model.vision_encoder.encoder.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
749
+ "model.vision_encoder.encoder.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
750
+ "model.vision_encoder.encoder.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
751
+ "model.vision_encoder.encoder.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
752
+ "model.vision_encoder.encoder.layers.25.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
753
+ "model.vision_encoder.encoder.layers.25.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
754
+ "model.vision_encoder.encoder.layers.25.layer_norm1.weight": "model-00003-of-00004.safetensors",
755
+ "model.vision_encoder.encoder.layers.25.layer_norm1.bias": "model-00003-of-00004.safetensors",
756
+ "model.vision_encoder.encoder.layers.25.mlp.fc1.weight": "model-00003-of-00004.safetensors",
757
+ "model.vision_encoder.encoder.layers.25.mlp.fc1.bias": "model-00003-of-00004.safetensors",
758
+ "model.vision_encoder.encoder.layers.25.mlp.fc2.weight": "model-00003-of-00004.safetensors",
759
+ "model.vision_encoder.encoder.layers.25.mlp.fc2.bias": "model-00003-of-00004.safetensors",
760
+ "model.vision_encoder.encoder.layers.25.layer_norm2.weight": "model-00003-of-00004.safetensors",
761
+ "model.vision_encoder.encoder.layers.25.layer_norm2.bias": "model-00003-of-00004.safetensors",
762
+ "model.vision_encoder.encoder.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
763
+ "model.vision_encoder.encoder.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
764
+ "model.vision_encoder.encoder.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
765
+ "model.vision_encoder.encoder.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
766
+ "model.vision_encoder.encoder.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
767
+ "model.vision_encoder.encoder.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
768
+ "model.vision_encoder.encoder.layers.26.self_attn.out_proj.weight": "model-00003-of-00004.safetensors",
769
+ "model.vision_encoder.encoder.layers.26.self_attn.out_proj.bias": "model-00003-of-00004.safetensors",
770
+ "model.vision_encoder.encoder.layers.26.layer_norm1.weight": "model-00003-of-00004.safetensors",
771
+ "model.vision_encoder.encoder.layers.26.layer_norm1.bias": "model-00003-of-00004.safetensors",
772
+ "model.vision_encoder.encoder.layers.26.mlp.fc1.weight": "model-00003-of-00004.safetensors",
773
+ "model.vision_encoder.encoder.layers.26.mlp.fc1.bias": "model-00003-of-00004.safetensors",
774
+ "model.vision_encoder.encoder.layers.26.mlp.fc2.weight": "model-00003-of-00004.safetensors",
775
+ "model.vision_encoder.encoder.layers.26.mlp.fc2.bias": "model-00003-of-00004.safetensors",
776
+ "model.vision_encoder.encoder.layers.26.layer_norm2.weight": "model-00003-of-00004.safetensors",
777
+ "model.vision_encoder.encoder.layers.26.layer_norm2.bias": "model-00003-of-00004.safetensors",
778
+ "model.vision_encoder.post_layernorm.weight": "model-00003-of-00004.safetensors",
779
+ "model.vision_encoder.post_layernorm.bias": "model-00003-of-00004.safetensors",
780
+ "model.mm_projector.readout.0.weight": "model-00003-of-00004.safetensors",
781
+ "model.mm_projector.readout.0.bias": "model-00003-of-00004.safetensors",
782
+ "model.mm_projector.readout.2.weight": "model-00003-of-00004.safetensors",
783
+ "model.mm_projector.readout.2.bias": "model-00003-of-00004.safetensors",
784
+ "lm_head.weight": "model-00004-of-00004.safetensors"
785
+ }
786
+ }
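
For reference, the weight map above can be used to locate a tensor's shard without materializing the whole checkpoint. Below is a minimal sketch, not part of the upload: the local path is hypothetical, the chosen key is one that appears in the map above, and it assumes the repository files have already been downloaded (e.g. with `huggingface_hub.snapshot_download`).

```python
import json
from safetensors import safe_open

repo_dir = "./HuLuLLM"  # hypothetical local path to the downloaded snapshot

# Read the shard index shown above.
with open(f"{repo_dir}/model.safetensors.index.json") as fp:
    index = json.load(fp)

key = "model.vision_encoder.post_layernorm.weight"   # listed in the weight_map above
shard = index["weight_map"][key]                     # e.g. "model-00003-of-00004.safetensors"

# Load only that tensor from its shard.
with safe_open(f"{repo_dir}/{shard}", framework="pt", device="cpu") as f:
    tensor = f.get_tensor(key)

print(key, tuple(tensor.shape), "->", shard)
```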
modeling_hulumed_encoder.py ADDED
@@ -0,0 +1,534 @@
1
+ # Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py.
2
+ # Below is the original copyright:
3
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
4
+ #
5
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
6
+ # and OPT implementations in this library. It has been modified from its
7
+ # original forms to accommodate minor architectural differences compared
8
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
9
+ #
10
+ # Licensed under the Apache License, Version 2.0 (the "License");
11
+ # you may not use this file except in compliance with the License.
12
+ # You may obtain a copy of the License at
13
+ #
14
+ # http://www.apache.org/licenses/LICENSE-2.0
15
+ #
16
+ # Unless required by applicable law or agreed to in writing, software
17
+ # distributed under the License is distributed on an "AS IS" BASIS,
18
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19
+ # See the License for the specific language governing permissions and
20
+ # limitations under the License.
21
+ """PyTorch HuluMed vision encoder model."""
22
+
23
+ import importlib.util
24
+ import os.path as osp
25
+ import math
26
+ import warnings
27
+
28
+ import torch
29
+ import torch.nn as nn
30
+ import torch.nn.functional as F
31
+ import torch.utils.checkpoint
32
+ from torch.nn.init import _calculate_fan_in_and_fan_out
33
+
34
+ from transformers.activations import ACT2FN
35
+ from transformers.modeling_utils import PreTrainedModel
36
+ from transformers.utils import is_flash_attn_2_available
37
+
38
+ if is_flash_attn_2_available():
39
+ from flash_attn import flash_attn_varlen_func
40
+ else:
41
+ flash_attn_varlen_func = None
42
+
43
+ try:
44
+ from .configuration_hulumed_encoder import HulumedVisionEncoderConfig
45
+ except ImportError:
46
+ spec = importlib.util.spec_from_file_location(
47
+ "configuration_hulumed_encoder",
48
+ osp.join(osp.dirname(__file__), "configuration_hulumed_encoder.py"),
49
+ )
50
+ configuration_hulumed_encoder = importlib.util.module_from_spec(spec)
51
+ spec.loader.exec_module(configuration_hulumed_encoder)
52
+ HulumedVisionEncoderConfig = getattr(
53
+ configuration_hulumed_encoder,
54
+ "HulumedVisionEncoderConfig",
55
+ )
56
+
57
+
58
+ def _trunc_normal_(tensor, mean, std, a, b):
59
+ # Cut & paste from PyTorch official master until it's in a few official releases - RW
60
+ # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
61
+ def norm_cdf(x):
62
+ # Computes standard normal cumulative distribution function
63
+ return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
64
+
65
+ if (mean < a - 2 * std) or (mean > b + 2 * std):
66
+ warnings.warn(
67
+ "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
68
+ "The distribution of values may be incorrect.",
69
+ stacklevel=2,
70
+ )
71
+
72
+ # Values are generated by using a truncated uniform distribution and
73
+ # then using the inverse CDF for the normal distribution.
74
+ # Get upper and lower cdf values
75
+ l = norm_cdf((a - mean) / std)
76
+ u = norm_cdf((b - mean) / std)
77
+
78
+ # Uniformly fill tensor with values from [l, u], then translate to
79
+ # [2l-1, 2u-1].
80
+ tensor.uniform_(2 * l - 1, 2 * u - 1)
81
+
82
+ # Use inverse cdf transform for normal distribution to get truncated
83
+ # standard normal
84
+ tensor.erfinv_()
85
+
86
+ # Transform to proper mean, std
87
+ tensor.mul_(std * math.sqrt(2.0))
88
+ tensor.add_(mean)
89
+
90
+ # Clamp to ensure it's in the proper range
91
+ tensor.clamp_(min=a, max=b)
92
+
93
+
94
+ def trunc_normal_tf_(
95
+ tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
96
+ ) -> torch.Tensor:
97
+ """Fills the input Tensor with values drawn from a truncated
98
+ normal distribution. The values are effectively drawn from the
99
+ normal distribution :math:`\\mathcal{N}(\\text{mean}, \\text{std}^2)`
100
+ with values outside :math:`[a, b]` redrawn until they are within
101
+ the bounds. The method used for generating the random values works
102
+ best when :math:`a \\leq \\text{mean} \\leq b`.
103
+
104
+ NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
105
+ bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
106
+ and the result is subsequently scaled and shifted by the mean and std args.
107
+
108
+ Args:
109
+ tensor: an n-dimensional `torch.Tensor`
110
+ mean: the mean of the normal distribution
111
+ std: the standard deviation of the normal distribution
112
+ a: the minimum cutoff value
113
+ b: the maximum cutoff value
114
+ """
115
+ with torch.no_grad():
116
+ _trunc_normal_(tensor, 0, 1.0, a, b)
117
+ tensor.mul_(std).add_(mean)
118
+
119
+
120
+ def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
121
+ fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
122
+ if mode == "fan_in":
123
+ denom = fan_in
124
+ elif mode == "fan_out":
125
+ denom = fan_out
126
+ elif mode == "fan_avg":
127
+ denom = (fan_in + fan_out) / 2
128
+
129
+ variance = scale / denom
130
+
131
+ if distribution == "truncated_normal":
132
+ # constant is stddev of standard normal truncated to (-2, 2)
133
+ trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
134
+ elif distribution == "normal":
135
+ with torch.no_grad():
136
+ tensor.normal_(std=math.sqrt(variance))
137
+ elif distribution == "uniform":
138
+ bound = math.sqrt(3 * variance)
139
+ with torch.no_grad():
140
+ tensor.uniform_(-bound, bound)
141
+ else:
142
+ raise ValueError(f"invalid distribution {distribution}")
143
+
144
+
145
+ def lecun_normal_(tensor):
146
+ variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
147
+
148
+
149
+ def default_flax_embed_init(tensor):
150
+ variance_scaling_(tensor, mode="fan_in", distribution="normal")
151
+
152
+
153
+ # Copied from transformers.models.llama.modeling_llama.rotate_half
154
+ def rotate_half(x):
155
+ """Rotates half the hidden dims of the input."""
156
+ x1 = x[..., : x.shape[-1] // 2]
157
+ x2 = x[..., x.shape[-1] // 2 :]
158
+ return torch.cat((-x2, x1), dim=-1)
159
+
160
+
161
+ def apply_rotary_pos_emb_vision(tensor: torch.Tensor, freqs: torch.Tensor) -> torch.Tensor:
162
+ orig_dtype = tensor.dtype
163
+ tensor = tensor.float()
164
+ cos = freqs.cos()
165
+ sin = freqs.sin()
166
+ cos = cos.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
167
+ sin = sin.unsqueeze(1).repeat(1, 1, 2).unsqueeze(0).float()
168
+ output = (tensor * cos) + (rotate_half(tensor) * sin)
169
+ output = output.to(orig_dtype)
170
+ return output
171
+
172
+
173
+ class VisionRotaryEmbedding(nn.Module):
174
+
175
+ def __init__(self, dim: int, theta: float = 10000.0) -> None:
176
+ super().__init__()
177
+ inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
178
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
179
+
180
+ def forward(self, seqlen: int) -> torch.Tensor:
181
+ seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
182
+ freqs = torch.outer(seq, self.inv_freq)
183
+ return freqs
184
+
185
+
186
+ class HulumedVisionEmbeddings(nn.Module):
187
+
188
+ def __init__(self, config: HulumedVisionEncoderConfig):
189
+ super().__init__()
190
+ self.config = config
191
+ self.embed_dim = config.hidden_size
192
+ self.patch_size = config.patch_size
193
+
194
+ self.patch_embedding = nn.Conv2d(
195
+ in_channels=config.num_channels,
196
+ out_channels=self.embed_dim,
197
+ kernel_size=self.patch_size,
198
+ stride=self.patch_size,
199
+ padding="valid",
200
+ )
201
+
202
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
203
+ hidden_states = hidden_states.view(
204
+ -1, self.config.num_channels, self.patch_size, self.patch_size
205
+ )
206
+ patch_embeds = self.patch_embedding(hidden_states) # shape = [*, width, grid, grid]
207
+ # embeddings = patch_embeds.flatten(2).transpose(1, 2)
208
+ embeddings = patch_embeds.view(-1, self.embed_dim)
209
+
210
+ return embeddings
211
+
212
+
213
+ class VisionAttention(nn.Module):
214
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
215
+
216
+ # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
217
+ def __init__(self, config):
218
+ super().__init__()
219
+ self.config = config
220
+ self.embed_dim = config.hidden_size
221
+ self.num_heads = config.num_attention_heads
222
+ self.head_dim = self.embed_dim // self.num_heads
223
+ if self.head_dim * self.num_heads != self.embed_dim:
224
+ raise ValueError(
225
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
226
+ f" {self.num_heads})."
227
+ )
228
+ self.scale = self.head_dim**-0.5
229
+ self.dropout = config.attention_dropout
230
+
231
+ self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
232
+ self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
233
+ self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
234
+ self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
235
+
236
+ def forward(
237
+ self,
238
+ hidden_states: torch.Tensor,
239
+ cu_seqlens: torch.Tensor,
240
+ rotary_pos_emb: torch.Tensor = None,
241
+ ) -> torch.Tensor:
242
+ """Input shape: Time x Channel"""
243
+
244
+ q_len, _ = hidden_states.size()
245
+
246
+ query_states = self.q_proj(hidden_states)
247
+ key_states = self.k_proj(hidden_states)
248
+ value_states = self.v_proj(hidden_states)
249
+
250
+ query_states = query_states.view(q_len, self.num_heads, self.head_dim)
251
+ key_states = key_states.view(q_len, self.num_heads, self.head_dim)
252
+ value_states = value_states.view(q_len, self.num_heads, self.head_dim)
253
+
254
+ query_states = apply_rotary_pos_emb_vision(query_states.unsqueeze(0), rotary_pos_emb).squeeze(0)
255
+ key_states = apply_rotary_pos_emb_vision(key_states.unsqueeze(0), rotary_pos_emb).squeeze(0)
256
+
257
+ attention_mask = torch.full([1, q_len, q_len], torch.finfo(query_states.dtype).min, device=query_states.device, dtype=query_states.dtype)  # additive mask: -inf outside each sequence block
258
+ for i in range(1, len(cu_seqlens)):
259
+ attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = 0
260
+
261
+ query_states = query_states.transpose(0, 1)
262
+ key_states = key_states.transpose(0, 1)
263
+ value_states = value_states.transpose(0, 1)
264
+
265
+ attn_weights = torch.matmul(query_states, key_states.transpose(1, 2)) / math.sqrt(self.head_dim)
266
+ attn_weights = attn_weights + attention_mask
267
+
268
+ # upcast attention to fp32
269
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
270
+ attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
271
+ attn_output = torch.matmul(attn_weights, value_states)
272
+
273
+ attn_output = attn_output.transpose(0, 1)
274
+ attn_output = attn_output.reshape(q_len, -1)
275
+ attn_output = self.out_proj(attn_output)
276
+
277
+ return attn_output
278
+
279
+
280
+ class VisionFlashAttention2(VisionAttention):
281
+
282
+ def __init__(self, *args, **kwargs):
283
+ super().__init__(*args, **kwargs)
284
+
285
+ # Adapted from transformers.models.llama.modeling_llama.LlamaFlashAttention2.forward
286
+ def forward(
287
+ self,
288
+ hidden_states: torch.Tensor,
289
+ cu_seqlens: torch.Tensor,
290
+ rotary_pos_emb: torch.Tensor = None,
291
+ ) -> torch.Tensor:
292
+ q_len, _ = hidden_states.size()
293
+
294
+ query_states = self.q_proj(hidden_states)
295
+ key_states = self.k_proj(hidden_states)
296
+ value_states = self.v_proj(hidden_states)
297
+
298
+ # Flash attention requires the input to have the shape
299
+ # batch_size x seq_length x head_dim x hidden_dim
300
+ # therefore we just need to keep the original shape
301
+ query_states = query_states.view(q_len, self.num_heads, self.head_dim)
302
+ key_states = key_states.view(q_len, self.num_heads, self.head_dim)
303
+ value_states = value_states.view(q_len, self.num_heads, self.head_dim)
304
+ query_states = apply_rotary_pos_emb_vision(query_states.unsqueeze(0), rotary_pos_emb).squeeze(0)
305
+ key_states = apply_rotary_pos_emb_vision(key_states.unsqueeze(0), rotary_pos_emb).squeeze(0)
306
+
307
+ max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
308
+ attn_output = flash_attn_varlen_func(query_states, key_states, value_states, cu_seqlens, cu_seqlens, max_seqlen, max_seqlen).reshape(
309
+ q_len, -1
310
+ )
311
+ attn_output = self.out_proj(attn_output)
312
+
313
+ return attn_output
314
+
315
+
316
+ class VisionSdpaAttention(VisionAttention):
317
+
318
+ def forward(
319
+ self,
320
+ hidden_states: torch.Tensor,
321
+ cu_seqlens: torch.Tensor,
322
+ rotary_pos_emb: torch.Tensor = None,
323
+ ) -> torch.Tensor:
324
+ seq_length = hidden_states.shape[0]
325
+ query_states = self.q_proj(hidden_states)
326
+ key_states = self.k_proj(hidden_states)
327
+ value_states = self.v_proj(hidden_states)
328
+
329
+ query_states = query_states.view(seq_length, self.num_heads, self.head_dim)
330
+ key_states = key_states.view(seq_length, self.num_heads, self.head_dim)
331
+ value_states = value_states.view(seq_length, self.num_heads, self.head_dim)
332
+
333
+ query_states = apply_rotary_pos_emb_vision(query_states.unsqueeze(0), rotary_pos_emb).squeeze(0)
334
+ key_states = apply_rotary_pos_emb_vision(key_states.unsqueeze(0), rotary_pos_emb).squeeze(0)
335
+
336
+ attention_mask = torch.zeros([1, seq_length, seq_length], device=query_states.device, dtype=torch.bool)
337
+ for i in range(1, len(cu_seqlens)):
338
+ attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = True
339
+
340
+ query_states = query_states.transpose(0, 1)
341
+ key_states = key_states.transpose(0, 1)
342
+ value_states = value_states.transpose(0, 1)
343
+ attn_output = F.scaled_dot_product_attention(query_states, key_states, value_states, attention_mask, dropout_p=0.0)
344
+ attn_output = attn_output.transpose(0, 1)
345
+ attn_output = attn_output.reshape(seq_length, -1)
346
+ attn_output = self.out_proj(attn_output)
347
+ return attn_output
348
+
349
+
350
+ VISION_ATTENTION_CLASSES = {
351
+ "eager": VisionAttention,
352
+ "flash_attention_2": VisionFlashAttention2,
353
+ "sdpa": VisionSdpaAttention,
354
+ }
355
+
356
+
357
+ # Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Hulumed
358
+ class HulumedVisionMLP(nn.Module):
359
+
360
+ def __init__(self, config):
361
+ super().__init__()
362
+ self.config = config
363
+ self.activation_fn = ACT2FN[config.hidden_act]
364
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
365
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
366
+
367
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
368
+ hidden_states = self.fc1(hidden_states)
369
+ hidden_states = self.activation_fn(hidden_states)
370
+ hidden_states = self.fc2(hidden_states)
371
+ return hidden_states
372
+
373
+
374
+ class HulumedVisionEncoderLayer(nn.Module):
375
+
376
+ def __init__(self, config: HulumedVisionEncoderConfig):
377
+ super().__init__()
378
+ self.embed_dim = config.hidden_size
379
+ self.self_attn = VISION_ATTENTION_CLASSES[config._attn_implementation](config=config)
380
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
381
+ self.mlp = HulumedVisionMLP(config)
382
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
383
+
384
+ # Ignore copy
385
+ def forward(self, hidden_states, cu_seqlens, rotary_pos_emb) -> torch.Tensor:
386
+ hidden_states = hidden_states + self.self_attn(
387
+ self.layer_norm1(hidden_states), cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb
388
+ )
389
+ hidden_states = hidden_states + self.mlp(self.layer_norm2(hidden_states))
390
+ return hidden_states
391
+
392
+
393
+ class HulumedVisionTransformerEncoder(nn.Module):
394
+
395
+ def __init__(self, config: HulumedVisionEncoderConfig):
396
+ super().__init__()
397
+ self.config = config
398
+ head_dim = config.hidden_size // config.num_attention_heads
399
+ self.rotary_pos_emb = VisionRotaryEmbedding(head_dim // 2)
400
+ self.layers = nn.ModuleList([HulumedVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
401
+ self.gradient_checkpointing = False
402
+
403
+ def rot_pos_emb(self, grid_sizes, merge_sizes):
404
+ pos_ids = []
405
+ for (t, h, w), merge_size in zip(grid_sizes, merge_sizes):
406
+ hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
407
+ hpos_ids = hpos_ids.reshape(
408
+ h // merge_size,
409
+ merge_size,
410
+ w // merge_size,
411
+ merge_size,
412
+ )
413
+ hpos_ids = hpos_ids.permute(0, 2, 1, 3)
414
+ hpos_ids = hpos_ids.flatten()
415
+
416
+ wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
417
+ wpos_ids = wpos_ids.reshape(
418
+ h // merge_size,
419
+ merge_size,
420
+ w // merge_size,
421
+ merge_size,
422
+ )
423
+ wpos_ids = wpos_ids.permute(0, 2, 1, 3)
424
+ wpos_ids = wpos_ids.flatten()
425
+ pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
426
+
427
+ pos_ids = torch.cat(pos_ids, dim=0)
428
+ max_grid_size = grid_sizes[:, 1:].max()
429
+ rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
430
+ rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
431
+
432
+ return rotary_pos_emb
433
+
434
+ def forward(self, hidden_states, grid_sizes, merge_sizes) -> torch.Tensor:
435
+ rotary_pos_emb = self.rot_pos_emb(grid_sizes, merge_sizes)
436
+
437
+ cu_seqlens = torch.repeat_interleave(grid_sizes[:, 1] * grid_sizes[:, 2], grid_sizes[:, 0]).cumsum(dim=0, dtype=torch.int32)
438
+ cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
439
+
440
+ for blk in self.layers:
441
+ if self.gradient_checkpointing and self.training:
442
+ hidden_states = self._gradient_checkpointing_func(
443
+ blk.__call__,
444
+ hidden_states,
445
+ cu_seqlens,
446
+ rotary_pos_emb
447
+ )
448
+ else:
449
+ hidden_states = blk(hidden_states, cu_seqlens=cu_seqlens, rotary_pos_emb=rotary_pos_emb)
450
+
451
+ return hidden_states
452
+
453
+
454
+ class HulumedVisionEncoderModel(PreTrainedModel):
455
+
456
+ config_class = HulumedVisionEncoderConfig
457
+ base_model_prefix = "hulumed"
458
+ main_input_name = "pixel_values"
459
+ supports_gradient_checkpointing = True
460
+ _no_split_modules = [
461
+ "HulumedVisionEncoderLayer",
462
+ "HulumedVisionEmbeddings",
463
+ ]
464
+ _supports_flash_attn_2 = True
465
+ _supports_sdpa = True
466
+
467
+ def __init__(self, config: HulumedVisionEncoderConfig):
468
+ super().__init__(config=config)
469
+ embed_dim = config.hidden_size
470
+
471
+ self.embeddings = HulumedVisionEmbeddings(config)
472
+ self.encoder = HulumedVisionTransformerEncoder(config)
473
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
474
+
475
+ self.post_init()
476
+
477
+ def forward(self, pixel_values, grid_sizes, merge_sizes=None) -> torch.Tensor:
478
+ hidden_states = self.embeddings(pixel_values)
479
+ hidden_states = self.encoder(hidden_states, grid_sizes, merge_sizes)
480
+ hidden_states = self.post_layernorm(hidden_states)
481
+
482
+ hidden_states_chunks = hidden_states.split(grid_sizes.prod(dim=1).tolist(), dim=0)
483
+ outputs = []
484
+
485
+ for hidden_states, grid_size, merge_size in zip(hidden_states_chunks, grid_sizes, merge_sizes):
486
+ # NOTE: previous implementation, which supports downsampling with any factor
487
+ c = hidden_states.shape[-1]
488
+ hidden_states = hidden_states.view(
489
+ grid_size[0], grid_size[1] // merge_size, grid_size[2] // merge_size, merge_size, merge_size, c
490
+ ).permute(0, 1, 3, 2, 4, 5)
491
+ hidden_states = hidden_states.reshape(
492
+ grid_size[0], grid_size[1], grid_size[2], c
493
+ ).permute(0, 3, 1, 2)
494
+ hidden_states = torch.nn.functional.interpolate(
495
+ hidden_states,
496
+ size=(grid_size[1] // merge_size, grid_size[2] // merge_size),
497
+ mode='bilinear'
498
+ )
499
+ hidden_states = hidden_states.permute(0, 2, 3, 1).view(-1, c)
500
+
501
+ # NOTE: simplified implementation, which only supports downsampling with integer factor
502
+ # NOTE: this implementation is mathematically equivalent to the previous one when merge_size is 1 or 2, but floating-point rounding may yield slightly different results
503
+ # hidden_states = hidden_states.view(-1, merge_size * merge_size, hidden_states.size(-1))
504
+ # hidden_states = hidden_states.mean(dim=1)
505
+
506
+ outputs.append(hidden_states)
507
+
508
+ return torch.cat(outputs, dim=0)
509
+
510
+ def _init_weights(self, module):
511
+ """Initialize the weights"""
512
+ if isinstance(module, nn.Embedding):
513
+ default_flax_embed_init(module.weight)
514
+ elif isinstance(module, VisionAttention):
515
+ nn.init.xavier_uniform_(module.q_proj.weight)
516
+ nn.init.xavier_uniform_(module.k_proj.weight)
517
+ nn.init.xavier_uniform_(module.v_proj.weight)
518
+ nn.init.xavier_uniform_(module.out_proj.weight)
519
+ nn.init.zeros_(module.q_proj.bias)
520
+ nn.init.zeros_(module.k_proj.bias)
521
+ nn.init.zeros_(module.v_proj.bias)
522
+ nn.init.zeros_(module.out_proj.bias)
523
+ elif isinstance(module, HulumedVisionMLP):
524
+ nn.init.xavier_uniform_(module.fc1.weight)
525
+ nn.init.xavier_uniform_(module.fc2.weight)
526
+ nn.init.normal_(module.fc1.bias, std=1e-6)
527
+ nn.init.normal_(module.fc2.bias, std=1e-6)
528
+ elif isinstance(module, (nn.Linear, nn.Conv2d)):
529
+ lecun_normal_(module.weight)
530
+ if module.bias is not None:
531
+ nn.init.zeros_(module.bias)
532
+ elif isinstance(module, nn.LayerNorm):
533
+ module.bias.data.zero_()
534
+ module.weight.data.fill_(1.0)
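
The encoder above consumes flattened patches together with explicit `grid_sizes` and `merge_sizes`, and its forward emits one token per `merge_size x merge_size` patch group. The sketch below only illustrates that tensor bookkeeping with made-up sizes; in practice the values come from the processor and `HulumedVisionEncoderConfig`, and nothing here invokes the model itself.

```python
import torch

# Illustrative sizes only; real values come from the processor / config.
patch_size, num_channels = 14, 3
t, h, w = 1, 16, 16          # one grid_sizes row: frames x patch rows x patch cols
merge_size = 2               # one merge_sizes entry: spatial pooling factor per side

num_patches = t * h * w
# HulumedVisionEmbeddings reshapes this to (-1, C, patch, patch) internally,
# so each row holds one flattened patch.
pixel_values = torch.randn(num_patches, num_channels * patch_size * patch_size)
grid_sizes = torch.tensor([[t, h, w]])
merge_sizes = torch.tensor([merge_size])

# The encoder's forward returns one token per merge_size x merge_size patch group:
num_output_tokens = t * (h // merge_size) * (w // merge_size)
print(num_patches, "input patches ->", num_output_tokens, "visual tokens")
```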
modeling_hulumed_qwen2.py ADDED
@@ -0,0 +1,525 @@
1
+ # Adapted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """PyTorch HuluMed model."""
16
+
17
+ import importlib.util
18
+ import os.path as osp
19
+ import re
20
+ from abc import ABC, abstractmethod
21
+ from typing import List, Optional, Tuple, Union
22
+
23
+ import torch
24
+ import torch.nn as nn
25
+ import torch.utils.checkpoint
26
+ from transformers import AutoModel, Qwen2ForCausalLM, Qwen2Model
27
+ from transformers.generation.utils import GenerateOutput
28
+ from transformers.modeling_outputs import CausalLMOutputWithPast
29
+
30
+ CONTROLLER_HEART_BEAT_EXPIRATION = 30
31
+ WORKER_HEART_BEAT_INTERVAL = 15
32
+
33
+ LOGDIR = "."
34
+
35
+ # Model Constants
36
+ IGNORE_INDEX = -100
37
+
38
+ # Image arguments
39
+ IMAGE_TOKEN_INDEX = -200
40
+ DEFAULT_IMAGE_TOKEN = "<image>"
41
+ DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
42
+ DEFAULT_IM_START_TOKEN = "<im_start>"
43
+ DEFAULT_IM_END_TOKEN = "<im_end>"
44
+ IMAGE_PLACEHOLDER = "<image-placeholder>"
45
+
46
+ # Video arguments
47
+ VIDEO_TOKEN_INDEX = -201
48
+ DEFAULT_VIDEO_TOKEN = "<video>"
49
+ NUM_FRAMES = 128
50
+ MAX_FRAMES = 768
51
+ NUM_FRAMES_PER_SECOND = 1
52
+
53
+ # Audio arguments
54
+ AUDIO_TOKEN_INDEX = -202
55
+ DEFAULT_AUDIO_TOKEN = "<audio>"
56
+
57
+ # Stream arguments
58
+ STREAM_START_TOKEN = "<|stream_start|>"
59
+ STREAM_END_TOKEN = "<|stream_end|>"
60
+ STREAM_MAX_FRAMES = 400
61
+
62
+ MODAL_INDEX_MAP = {
63
+ "<image>": -200,
64
+ "<video>": -201,
65
+ "<audio>": -202,
66
+ }
67
+
68
+ subimage_token_num = 196
69
+ try:
70
+ from .configuration_hulumed_qwen2 import HulumedQwen2Config
71
+ except ModuleNotFoundError:
72
+ spec = importlib.util.spec_from_file_location(
73
+ "configuration_hulumed_qwen2",
74
+ osp.join(osp.dirname(__file__), "configuration_hulumed_qwen2.py"),
75
+ )
76
+ configuration_hulumed_qwen2 = importlib.util.module_from_spec(spec)
77
+ spec.loader.exec_module(configuration_hulumed_qwen2)
78
+ HulumedQwen2Config = getattr(
79
+ configuration_hulumed_qwen2,
80
+ "HulumedQwen2Config",
81
+ )
82
+
83
+
84
+ def build_mlp(depth, hidden_size, output_hidden_size):
85
+ """Build MLP layers for projection."""
86
+ modules = [nn.Linear(hidden_size, output_hidden_size)]
87
+ for _ in range(1, depth):
88
+ modules.append(nn.GELU())
89
+ modules.append(nn.Linear(output_hidden_size, output_hidden_size))
90
+ return nn.Sequential(*modules)
91
+
92
+
93
+ def build_vision_projector(config, delay_load=False, **kwargs):
94
+ """Build vision projector based on config."""
95
+ projector_type = getattr(config, 'mm_projector_type', 'linear')
96
+
97
+ if projector_type == "linear":
98
+ return nn.Linear(config.vision_encoder_config.hidden_size, config.hidden_size)
99
+ elif projector_type.startswith("mlp"):
100
+ return MlpGeluProjector(config, projector_type)
101
+ else:
102
+ raise ValueError(f'Unknown projector type: {projector_type}')
103
+
104
+
105
+ class MlpGeluProjector(nn.Module):
106
+ """MLP projector with GELU activation."""
107
+
108
+ def __init__(self, config, projector_type):
109
+ super().__init__()
110
+
111
+ mlp_gelu_match = re.match(r"^mlp(\d+)x_gelu$", projector_type)
112
+ if mlp_gelu_match is None:
113
+ raise ValueError(f"Invalid projector type format: {projector_type}")
114
+ mlp_depth = int(mlp_gelu_match.group(1))
115
+
116
+ self.readout = build_mlp(
117
+ mlp_depth,
118
+ config.vision_encoder_config.hidden_size,
119
+ config.hidden_size
120
+ )
121
+
122
+ def forward(self, x):
123
+ return self.readout(x)
124
+
125
+
126
+ class HulumedMetaModel:
127
+ """Meta model for HuluMed that handles vision encoder initialization."""
128
+
129
+ def __init__(self, config):
130
+ super(HulumedMetaModel, self).__init__(config)
131
+ print('config.vision_encoder',config.vision_encoder)
132
+ if config.vision_encoder is not None:
133
+ # Load from pretrained path
134
+ print('Load from pretrained path')
135
+ self.vision_encoder = AutoModel.from_pretrained(
136
+ config.vision_encoder,
137
+ attn_implementation=self.config._attn_implementation,
138
+ torch_dtype=self.dtype,
139
+ )
140
+ self.config.vision_encoder_config = self.vision_encoder.config
141
+ self.config.vision_encoder = None
142
+ elif config.vision_encoder_config is not None:
143
+ # Build from config
144
+ print('Build from config')
145
+ self.vision_encoder = AutoModel.from_config(
146
+ self.config.vision_encoder_config,
147
+ attn_implementation=self.config._attn_implementation,
148
+ torch_dtype=self.dtype,
149
+ )
150
+ else:
151
+ raise ValueError("Vision encoder is not provided in config")
152
+
153
+ self.mm_projector = build_vision_projector(config)
154
+
155
+ def get_vision_encoder(self):
156
+ return self.vision_encoder
157
+
158
+ def get_mm_projector(self):
159
+ return self.mm_projector
160
+
161
+
162
+ class HulumedQwen2Model(HulumedMetaModel, Qwen2Model):
163
+ """HuluMed Qwen2 Model."""
164
+
165
+ config_class = HulumedQwen2Config
166
+
167
+ def __init__(self, config: HulumedQwen2Config):
168
+ super(HulumedQwen2Model, self).__init__(config)
169
+
170
+
171
+ class HulumedMetaForCausalLM(ABC):
172
+ """Meta class for HuluMed Causal LM with multimodal support."""
173
+
174
+ @abstractmethod
175
+ def get_model(self):
176
+ pass
177
+
178
+ def get_vision_encoder(self):
179
+ return self.get_model().get_vision_encoder()
180
+
181
+ def get_mm_projector(self):
182
+ return self.get_model().get_mm_projector()
183
+
184
+ def encode_images(
185
+ self,
186
+ pixel_values: torch.FloatTensor,
187
+ grid_sizes: torch.LongTensor,
188
+ merge_sizes: torch.LongTensor,
189
+ ) -> torch.FloatTensor:
190
+ """Encode images using vision encoder and projector."""
191
+ mm_features = self.get_model().get_vision_encoder()(
192
+ pixel_values=pixel_values,
193
+ grid_sizes=grid_sizes,
194
+ merge_sizes=merge_sizes,
195
+ )
196
+ mm_features = self.get_model().mm_projector(mm_features)
197
+ return mm_features
198
+
199
+ def _get_valid_visual_tokens(
200
+ self,
201
+ mm_features: torch.FloatTensor,
202
+ batched_num_patches: torch.LongTensor,
203
+ modals: List[str],
204
+ ):
205
+ """Filter out text-only samples and keep only valid visual tokens."""
206
+ valid_masks = []
207
+ for num_patches, modal in zip(batched_num_patches, modals):
208
+ valid_mask = torch.full(
209
+ (num_patches,),
210
+ modal != "text",
211
+ dtype=torch.bool,
212
+ device=mm_features.device
213
+ )
214
+ valid_masks.append(valid_mask)
215
+ mm_features = mm_features[torch.cat(valid_masks)]
216
+ return mm_features
217
+
218
+ def _maybe_truncate_visual_tokens(
219
+ self,
220
+ mm_features: torch.FloatTensor,
221
+ compression_mask: torch.BoolTensor,
222
+ batched_num_patches: torch.LongTensor,
223
+ modals: List[str],
224
+ input_ids: torch.LongTensor,
225
+ position_ids: Optional[torch.LongTensor] = None,
226
+ ):
227
+ """Truncate visual tokens if necessary based on position_ids."""
228
+ if position_ids is None or mm_features.shape[0] == input_ids.eq(self.config.image_token_index).sum():
229
+ return mm_features, compression_mask
230
+
231
+ truncation_mask = []
232
+ for num_patches, modal in zip(batched_num_patches, modals):
233
+ if modal == "text":
234
+ truncation_mask.append(torch.ones((0,), dtype=torch.bool, device=input_ids.device))
235
+ else:
236
+ truncation_mask.append(torch.ones((num_patches,), dtype=torch.bool, device=input_ids.device))
237
+
238
+ seq_end_indices = torch.nonzero(position_ids == 0)[:, 0]
239
+ seq_end_indices = seq_end_indices[seq_end_indices > 0].tolist() + [len(input_ids)]
240
+ seq_start_indices = [0] + seq_end_indices[:-1]
241
+ num_visual_tokens = [
242
+ input_ids[start:end].eq(self.config.image_token_index).sum()
243
+ for start, end in zip(seq_start_indices, seq_end_indices)
244
+ ]
245
+
246
+ for n, mask in zip(num_visual_tokens, truncation_mask):
247
+ if len(mask) > 0:
248
+ mask[n:] = False
249
+ truncation_mask = torch.cat(truncation_mask)
250
+
251
+ return mm_features[truncation_mask], compression_mask[truncation_mask]
252
+
253
+ def _get_compression_mask(
254
+ self,
255
+ pixel_values: torch.FloatTensor,
256
+ batched_num_patches: torch.LongTensor,
257
+ grid_sizes: torch.LongTensor,
258
+ merge_sizes: torch.LongTensor,
259
+ modals: List[str],
260
+ threshold: float = 0.1,
261
+ min_tokens: int = 1,
262
+ ) -> torch.BoolTensor:
263
+ """Get compression mask for video tokens based on frame differences."""
264
+ batched_images = pixel_values.split(grid_sizes.prod(dim=1).tolist(), dim=0)
265
+ compression_masks = []
266
+
267
+ for images, num_patches, grid_size, merge_size, modal in zip(
268
+ batched_images, batched_num_patches, grid_sizes, merge_sizes, modals
269
+ ):
270
+ t, h, w = grid_size
271
+ if modal == "image" or (modal == "video" and t == 1):
272
+ compression_masks.append(torch.ones((num_patches,), dtype=torch.bool, device=images.device))
273
+
274
+ elif modal == "video":
275
+ # Video token compression based on pixel differences
276
+ images = images.view(t, (h // merge_size) * (w // merge_size), -1)
277
+
278
+ pixel_diff = images[1:] - images[:-1]
279
+ pixel_diff = torch.abs(pixel_diff).mean(dim=-1) * 255
280
+ pixel_diff = torch.cat([torch.full_like(pixel_diff[0:1], threshold + 1), pixel_diff], dim=0)
281
+ mask = (pixel_diff / 255.0) > threshold
282
+ padding_ids = torch.nonzero(mask.sum(dim=1) < min_tokens)[:, 0]
283
+ mask[padding_ids, :min_tokens] = 1
284
+ compression_masks.append(mask.flatten())
285
+
286
+ else:
287
+ # Pseudo image case
288
+ compression_masks.append(torch.ones((0,), dtype=torch.bool, device=images.device))
289
+
290
+ return torch.cat(compression_masks)
291
+
292
+ def _compress_visual_tokens(
293
+ self,
294
+ compression_mask: torch.BoolTensor,
295
+ mm_features: torch.FloatTensor,
296
+ input_ids: torch.LongTensor,
297
+ attention_mask: Optional[torch.Tensor] = None,
298
+ position_ids: Optional[torch.LongTensor] = None,
299
+ labels: Optional[torch.LongTensor] = None,
300
+ ):
301
+ """Compress visual tokens based on compression mask."""
302
+ mm_features = mm_features[compression_mask]
303
+ image_selected = (input_ids == self.config.image_token_index)
304
+
305
+ text_masks = torch.logical_not(image_selected)
306
+ text_masks[image_selected] = compression_mask
307
+ input_ids = input_ids[text_masks]
308
+
309
+ if attention_mask is not None:
310
+ attention_mask = attention_mask[text_masks]
311
+ if labels is not None:
312
+ labels = labels[text_masks]
313
+ if position_ids is not None:
314
+ position_ids = position_ids[text_masks]
315
+ pos_start = [0] + torch.nonzero(position_ids == 0)[:, 0].tolist()
316
+ pos_end = pos_start[1:] + [len(input_ids)]
317
+ position_ids = torch.cat([
318
+ torch.arange(end - start, device=input_ids.device)
319
+ for start, end in zip(pos_start, pos_end)
320
+ ])
321
+
322
+ return mm_features, input_ids, attention_mask, position_ids, labels
323
+
324
+ def prepare_inputs_labels_for_multimodal(
325
+ self,
326
+ input_ids: torch.LongTensor = None,
327
+ attention_mask: Optional[torch.Tensor] = None,
328
+ position_ids: Optional[torch.LongTensor] = None,
329
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
330
+ labels: Optional[torch.LongTensor] = None,
331
+ pixel_values: Optional[torch.FloatTensor] = None,
332
+ grid_sizes: Optional[torch.LongTensor] = None,
333
+ merge_sizes: Optional[torch.LongTensor] = None,
334
+ modals: Optional[List[str]] = None,
335
+ ):
336
+ """Prepare inputs and labels for multimodal training/inference."""
337
+ vision_encoder = self.get_vision_encoder()
338
+
339
+ # Text-only situation
340
+ if vision_encoder is None or pixel_values is None or input_ids.shape[1] == 1:
341
+ return input_ids, attention_mask, position_ids, past_key_values, None, labels
342
+
343
+ # 1. Flatten text inputs
344
+ B, N = input_ids.shape
345
+ input_ids = input_ids.view(B * N)
346
+ if attention_mask is not None:
347
+ attention_mask = attention_mask.view(B * N)
348
+ if position_ids is not None:
349
+ position_ids = position_ids.view(B * N)
350
+ if labels is not None:
351
+ labels = labels.view(B * N)
352
+
353
+ # 2. Embed visual tokens
354
+ batched_num_patches = grid_sizes.prod(dim=1).div(merge_sizes ** 2).long()
355
+ mm_features = self.encode_images(pixel_values, grid_sizes, merge_sizes).to(input_ids.device)
356
+ mm_features = self._get_valid_visual_tokens(mm_features, batched_num_patches, modals)
357
+
358
+ compression_mask = self._get_compression_mask(
359
+ pixel_values, batched_num_patches, grid_sizes, merge_sizes, modals
360
+ )
361
+ mm_features, compression_mask = self._maybe_truncate_visual_tokens(
362
+ mm_features, compression_mask, batched_num_patches, modals, input_ids, position_ids
363
+ )
364
+
365
+ # 3. Compress visual tokens if enabled
366
+ if self.config.use_token_compression:
367
+ assert B == 1, "Token compression is only supported for batch_size=1"
368
+ mm_features, input_ids, attention_mask, position_ids, labels = self._compress_visual_tokens(
369
+ compression_mask, mm_features, input_ids, attention_mask, position_ids, labels
370
+ )
371
+
372
+ # 4. Embed text tokens
373
+ inputs_embeds = self.get_model().embed_tokens(input_ids).clone()
374
+
375
+ # 5. Replace multimodal tokens with features
376
+ image_selected = (input_ids == self.config.image_token_index)
377
+ inputs_embeds[image_selected] = inputs_embeds[image_selected] * 0.0 + mm_features
378
+
379
+ # 6. Reshape back to batched format
380
+ C = inputs_embeds.shape[-1]
381
+ inputs_embeds = inputs_embeds.reshape(B, -1, C)
382
+ if attention_mask is not None:
383
+ attention_mask = attention_mask.view(B, -1)
384
+ if labels is not None:
385
+ labels = labels.view(B, -1)
386
+ if position_ids is not None:
387
+ position_ids = position_ids.view(B, -1)
388
+
389
+ return None, attention_mask, position_ids, past_key_values, inputs_embeds, labels
390
+
391
+
392
+ class HulumedQwen2ForCausalLM(Qwen2ForCausalLM, HulumedMetaForCausalLM):
393
+ """HuluMed Qwen2 model for causal language modeling with multimodal support."""
394
+
395
+ config_class = HulumedQwen2Config
396
+
397
+ def __init__(self, config, **kwargs):
398
+ super(Qwen2ForCausalLM, self).__init__(config)
399
+ self.model = HulumedQwen2Model(config)
400
+ self.vocab_size = config.vocab_size
401
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
402
+
403
+ # Initialize weights and apply final processing
404
+ self.post_init()
405
+
406
+ def get_model(self):
407
+ return self.model
408
+
409
+ def forward(
410
+ self,
411
+ input_ids: torch.LongTensor = None,
412
+ attention_mask: Optional[torch.Tensor] = None,
413
+ position_ids: Optional[torch.LongTensor] = None,
414
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
415
+ inputs_embeds: Optional[torch.FloatTensor] = None,
416
+ labels: Optional[torch.LongTensor] = None,
417
+ use_cache: Optional[bool] = None,
418
+ output_attentions: Optional[bool] = None,
419
+ output_hidden_states: Optional[bool] = None,
420
+ return_dict: Optional[bool] = None,
421
+ cache_position: Optional[torch.LongTensor] = None,
422
+ num_logits_to_keep: int = 0,
423
+ # Multimodal inputs
424
+ pixel_values: Optional[torch.FloatTensor] = None,
425
+ grid_sizes: Optional[torch.LongTensor] = None,
426
+ merge_sizes: Optional[torch.LongTensor] = None,
427
+ modals: Optional[List[str]] = None,
428
+ **loss_kwargs,
429
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
430
+ """Forward pass with multimodal support."""
431
+ if inputs_embeds is None:
432
+ (
433
+ input_ids,
434
+ attention_mask,
435
+ position_ids,
436
+ past_key_values,
437
+ inputs_embeds,
438
+ labels,
439
+ ) = self.prepare_inputs_labels_for_multimodal(
440
+ input_ids=input_ids,
441
+ attention_mask=attention_mask,
442
+ position_ids=position_ids,
443
+ past_key_values=past_key_values,
444
+ labels=labels,
445
+ pixel_values=pixel_values,
446
+ grid_sizes=grid_sizes,
447
+ merge_sizes=merge_sizes,
448
+ modals=modals,
449
+ )
450
+
451
+ return super().forward(
452
+ input_ids=input_ids,
453
+ attention_mask=attention_mask,
454
+ position_ids=position_ids,
455
+ past_key_values=past_key_values,
456
+ inputs_embeds=inputs_embeds,
457
+ labels=labels,
458
+ use_cache=use_cache,
459
+ output_attentions=output_attentions,
460
+ output_hidden_states=output_hidden_states,
461
+ return_dict=return_dict,
462
+ cache_position=cache_position,
463
+ num_logits_to_keep=num_logits_to_keep,
464
+ **loss_kwargs,
465
+ )
466
+
467
+ @torch.no_grad()
468
+ def generate(
469
+ self,
470
+ # Multimodal inputs
471
+ pixel_values: Optional[torch.FloatTensor] = None,
472
+ grid_sizes: Optional[torch.LongTensor] = None,
473
+ merge_sizes: Optional[torch.LongTensor] = None,
474
+ modals: Optional[List[str]] = None,
475
+ **kwargs,
476
+ ) -> Union[GenerateOutput, torch.LongTensor]:
477
+ """Generate with multimodal support."""
478
+ input_ids = kwargs.pop("input_ids", None)
479
+ attention_mask = kwargs.pop("attention_mask", None)
480
+ position_ids = kwargs.pop("position_ids", None)
481
+ past_key_values = kwargs.pop("past_key_values", None)
482
+
483
+ if "inputs_embeds" in kwargs:
484
+ raise NotImplementedError("`inputs_embeds` is not supported")
485
+
486
+ if pixel_values is not None:
487
+ (
488
+ input_ids,
489
+ attention_mask,
490
+ position_ids,
491
+ past_key_values,
492
+ inputs_embeds,
493
+ labels,
494
+ ) = self.prepare_inputs_labels_for_multimodal(
495
+ input_ids=input_ids,
496
+ attention_mask=attention_mask,
497
+ position_ids=position_ids,
498
+ past_key_values=past_key_values,
499
+ labels=None,
500
+ pixel_values=pixel_values,
501
+ grid_sizes=grid_sizes,
502
+ merge_sizes=merge_sizes,
503
+ modals=modals,
504
+ )
505
+ else:
506
+ inputs_embeds = self.get_model().embed_tokens(input_ids)
507
+
508
+ return super().generate(
509
+ position_ids=position_ids,
510
+ attention_mask=attention_mask,
511
+ inputs_embeds=inputs_embeds,
512
+ **kwargs
513
+ )
514
+
515
+ def prepare_inputs_for_generation(
516
+ self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs
517
+ ):
518
+ """Prepare inputs for generation."""
519
+ images = kwargs.pop("images", None)
520
+ _inputs = super().prepare_inputs_for_generation(
521
+ input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
522
+ )
523
+ if images is not None:
524
+ _inputs['images'] = images
525
+ return _inputs
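
Taken together, generate() only needs what HulumedProcessor produces: input_ids, attention_mask, pixel_values, grid_sizes, merge_sizes and modals. The following is a minimal inference sketch rather than repository code — the repo id "Flare77/HuLuLLM" and the image path are placeholders, and it assumes the checkpoint's config.json wires the classes up through auto_map so that trust_remote_code=True can resolve them.

import torch
from transformers import AutoModelForCausalLM, AutoProcessor

repo_id = "Flare77/HuLuLLM"  # placeholder repo id
processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    repo_id, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="auto"
)

conversation = [
    {"role": "user", "content": [
        {"type": "image", "image": {"image_path": "chest_xray.png"}},  # hypothetical local file
        {"type": "text", "text": "Describe the findings in this image."},
    ]},
]

inputs = processor(conversation=conversation, add_generation_prompt=True, return_tensors="pt")
inputs = {k: v.to(model.device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
if "pixel_values" in inputs:
    inputs["pixel_values"] = inputs["pixel_values"].to(model.dtype)  # match the vision tower's dtype

# generate() pops input_ids/attention_mask from kwargs, runs
# prepare_inputs_labels_for_multimodal, and delegates to Qwen2's generate with inputs_embeds.
output_ids = model.generate(**inputs, max_new_tokens=256, do_sample=False)
print(processor.batch_decode(output_ids)[0])

Note that prepare_inputs_labels_for_multimodal asserts batch_size == 1 when use_token_compression is set, so batched multimodal generation is not expected to work with compression enabled.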
preprocessor_config.json ADDED
@@ -0,0 +1,27 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "image_processing_hulumed.HulumedImageProcessor",
4
+ "AutoProcessor": "processing_hulumed.HulumedProcessor"
5
+ },
6
+ "do_convert_rgb": true,
7
+ "do_normalize": true,
8
+ "do_rescale": true,
9
+ "do_resize": true,
10
+ "image_mean": [
11
+ 0.5,
12
+ 0.5,
13
+ 0.5
14
+ ],
15
+ "image_processor_type": "HulumedImageProcessor",
16
+ "image_std": [
17
+ 0.5,
18
+ 0.5,
19
+ 0.5
20
+ ],
21
+ "max_tokens": 16384,
22
+ "min_tokens": 16,
23
+ "patch_size": 14,
24
+ "processor_class": "HulumedProcessor",
25
+ "resample": 3,
26
+ "rescale_factor": 0.00392156862745098
27
+ }
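
For reference, these values describe the usual rescale-then-normalize pipeline on 14x14 patches: x -> (x * rescale_factor - image_mean) / image_std per channel. A quick sanity check of the arithmetic (not part of the repository):

import numpy as np

rescale_factor, mean, std = 0.00392156862745098, 0.5, 0.5  # values from preprocessor_config.json
pixels = np.array([0.0, 128.0, 255.0])
print((pixels * rescale_factor - mean) / std)  # approx. [-1.0, 0.004, 1.0]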
processing_hulumed.py ADDED
@@ -0,0 +1,873 @@
1
+ """Processor class for HuluMed with 3D support."""
2
+
3
+ import copy
4
+ import importlib.util
5
+ import os
6
+ import os.path as osp
7
+ import warnings
8
+ from collections import defaultdict
9
+ from typing import Any, List, Union, Dict, Optional, Tuple, TypedDict
10
+
11
+ import cv2
12
+ import ffmpeg
13
+ import imageio
14
+ import json
15
+ import numpy as np
16
+ import torch
17
+ import transformers
18
+ from decord import VideoReader, cpu
19
+ from PIL import Image
20
+ from transformers.feature_extraction_utils import BatchFeature
21
+ from transformers.image_utils import ImageInput
22
+ from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
23
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
24
+
25
+ try:
26
+ import nibabel as nib
27
+ NIBABEL_AVAILABLE = True
28
+ except ImportError:
29
+ NIBABEL_AVAILABLE = False
30
+ warnings.warn("nibabel is not installed. 3D medical imaging support will be limited. Install with: pip install nibabel")
31
+
32
+ try:
33
+ from . import image_processing_hulumed
34
+ from .image_processing_hulumed import (
35
+ is_valid_image, is_valid_video,
36
+ )
37
+ except ModuleNotFoundError:
38
+ spec = importlib.util.spec_from_file_location(
39
+ "image_processing_hulumed",
40
+ osp.join(osp.dirname(__file__), "image_processing_hulumed.py"),
41
+ )
42
+ image_processing_hulumed = importlib.util.module_from_spec(spec)
43
+ spec.loader.exec_module(image_processing_hulumed)
44
+ is_valid_image = getattr(image_processing_hulumed, "is_valid_image")
45
+ is_valid_video = getattr(image_processing_hulumed, "is_valid_video")
46
+
47
+ DEFAULT_IMAGE_TOKEN = "<image>"
48
+ IGNORE_INDEX = -100
49
+
50
+ Conversation = List[Dict[str, Any]]
51
+ SingleImage = Union[Image.Image, np.ndarray, torch.Tensor]
52
+ SingleVideo = Union[List[SingleImage], np.ndarray, torch.Tensor]
53
+ BatchedImage = List[Union[SingleImage, SingleVideo]]
54
+ BatchedNamedImage = List[Tuple[str, Union[SingleImage, SingleVideo]]]
55
+
56
+
57
+ def _custom_import(class_name: str):
58
+ try:
59
+ attribute_class = getattr(transformers, class_name)
60
+ except AttributeError:
61
+ attribute_class = getattr(image_processing_hulumed, class_name)
62
+ return attribute_class
63
+
64
+
65
+ def is_named_image(image) -> bool:
66
+ return isinstance(image, (list, tuple)) and \
67
+ len(image) == 2 and \
68
+ isinstance(image[0], str) and \
69
+ image[0] in ["image", "video", "3d"] and \
70
+ (is_valid_image(image[1]) or is_valid_video(image[1]))
71
+
72
+
73
+ def make_batched_images(images) -> Tuple[List[str], List[ImageInput]]:
74
+ if isinstance(images, (list, tuple)) and all(is_named_image(image) for image in images):
75
+ modals = [image[0] if image[0] != "3d" else "video" for image in images]
76
+ data = [image[1] for image in images]
77
+ return modals, data
78
+ elif isinstance(images, (list, tuple)) and all(is_valid_image(image) or is_valid_video(image) for image in images):
79
+ batch = []
80
+ for image in images:
81
+ if is_valid_video(image):
82
+ batch.append(("video", image))
83
+ elif is_valid_image(image):
84
+ batch.append(("image", image))
85
+ else:
86
+ raise ValueError(f"Could not make batched images from {images}")
87
+ return [x[0] for x in batch], [x[1] for x in batch]
88
+ elif is_named_image(images):
89
+ modal = images[0] if images[0] != "3d" else "video"
90
+ return [modal], [images[1]]
91
+ elif is_valid_video(images):
92
+ return ["video"], [images]
93
+ elif is_valid_image(images):
94
+ return ["image"], [images]
95
+
96
+ raise ValueError(f"Could not make batched images from {images}")
97
+
98
+
99
+ def frame_sample(duration, mode='uniform', num_frames=None, vid_fps=None, fps=None):
100
+ if mode == 'uniform':
101
+ assert num_frames is not None, "Number of frames must be provided for uniform sampling."
102
+ if duration <= num_frames:
103
+ return np.arange(duration).astype(int)
104
+ return np.linspace(0, duration-1, num_frames, dtype=int)
105
+ elif mode == 'fps':
106
+ assert vid_fps is not None, "FPS must be provided for FPS sampling."
107
+ assert fps is not None, "FPS must be provided for FPS sampling."
108
+ segment_len = min(vid_fps // fps, duration)
109
+ return np.arange(segment_len // 2, duration, segment_len, dtype=int)
110
+ else:
111
+ raise ValueError(f'Unsupported frame sampling mode: {mode}')
112
+
113
+
114
+ def load_video_from_ids(video_path, s=None, e=None, fps=None, max_frames=128, temporal_factor=1):
115
+ if s is not None and e is not None:
116
+ s = s if s >= 0. else 0.
117
+ e = e if e >= 0. else 0.
118
+ if s > e:
119
+ s, e = e, s
120
+ elif s == e:
121
+ e = s + 1
122
+
123
+ if os.path.isdir(video_path):
124
+ frame_files = sorted(os.listdir(video_path))
125
+ vid_fps = 3
126
+ num_frames_of_video = len(frame_files)
127
+ elif video_path.endswith('.gif'):
128
+ gif_reader = imageio.get_reader(video_path)
129
+ vid_fps = 25
130
+ num_frames_of_video = len(gif_reader)
131
+ else:
132
+ vreader = VideoReader(video_path, ctx=cpu(0), num_threads=2)
133
+ vid_fps = vreader.get_avg_fps()
134
+ num_frames_of_video = len(vreader)
135
+
136
+ f_start = 0 if s is None else max(int(s * vid_fps) - 1, 0)
137
+ f_end = num_frames_of_video - 1 if e is None else min(int(e * vid_fps) - 1, num_frames_of_video - 1)
138
+ frame_indices = list(range(f_start, f_end + 1))
139
+
140
+ duration = len(frame_indices)
141
+ if fps is not None and duration / vid_fps < max_frames:
142
+ sampled_frame_indices = [frame_indices[i] for i in frame_sample(duration, mode='fps', vid_fps=vid_fps, fps=fps)]
143
+ else:
144
+ sampled_frame_indices = [frame_indices[i] for i in frame_sample(duration, mode='uniform', num_frames=max_frames)]
145
+
146
+ if os.path.isdir(video_path):
147
+ frames = np.array([cv2.cvtColor(cv2.imread(os.path.join(video_path, frame_files[frame_idx])), cv2.COLOR_BGR2RGB) for frame_idx in sampled_frame_indices])
148
+ elif video_path.endswith('.gif'):
149
+ frames = np.array([cv2.cvtColor(frame, cv2.COLOR_RGBA2RGB) for idx, frame in enumerate(gif_reader) if idx in sampled_frame_indices])
150
+ else:
151
+ frames = vreader.get_batch(sampled_frame_indices).asnumpy()
152
+
153
+ frames = frames.transpose(0, 3, 1, 2)
154
+ timestamps = [x / vid_fps for x in sampled_frame_indices]
155
+
156
+ if temporal_factor > 1:
157
+ pad_length = temporal_factor - len(frames) % temporal_factor
158
+ frames = np.concatenate([frames, frames[-1:].repeat(pad_length, axis=0)])
159
+ for _ in range(pad_length): timestamps.append(timestamps[-1] + 1 / fps)
160
+
161
+ frames = [frame for frame in frames]
162
+
163
+ return frames, timestamps
164
+
165
+
166
+ class ChatTemplateKwargs(TypedDict, total=False):
167
+ chat_template: Optional[str]
168
+ add_system_prompt: Optional[bool]
169
+ add_generation_prompt: Optional[bool]
170
+
171
+
172
+ class HulumedProcessorKwargs(ProcessingKwargs, ChatTemplateKwargs, total=False):
173
+ chat_template_kwargs: ChatTemplateKwargs = {
174
+ **ChatTemplateKwargs.__annotations__,
175
+ }
176
+
177
+ _defaults = {
178
+ "text_kwargs": {
179
+ "padding": False,
180
+ },
181
+ "images_kwargs": {
182
+
183
+ },
184
+ "chat_template_kwargs": {
185
+ "chat_template": None,
186
+ "add_system_prompt": False,
187
+ "add_generation_prompt": False,
188
+ },
189
+ }
190
+
191
+
192
+ class HulumedProcessor(ProcessorMixin):
193
+ attributes = ["image_processor", "tokenizer"]
194
+ image_processor_class = "HulumedImageProcessor"
195
+ tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
196
+ valid_kwargs = ["chat_template", "image_merge_size", "video_merge_size", "fps", "max_frames"]
197
+
198
+ def __init__(
199
+ self,
200
+ image_processor=None,
201
+ tokenizer=None,
202
+ chat_template: str = None,
203
+ image_merge_size: int = 1,
204
+ video_merge_size: int = 2,
205
+ fps: Optional[int] = 1,
206
+ max_frames: Optional[int] = 128,
207
+ ):
208
+ self.image_processor = image_processor
209
+ self.tokenizer = tokenizer
210
+ if chat_template is None:
211
+ chat_template = self.tokenizer.chat_template
212
+ self.chat_template = chat_template
213
+
214
+ self.image_merge_size = image_merge_size
215
+ self.video_merge_size = video_merge_size
216
+ self.fps = fps
217
+ self.max_frames = max_frames
218
+
219
+ self.generation_prompt = self._infer_generation_prompt()
220
+ self.generation_prompt_ids = self.tokenizer.encode(self.generation_prompt, return_tensors="pt")
221
+ self.generation_prompt_length = len(self.generation_prompt_ids[0])
222
+ self.image_token_id = self.tokenizer.convert_tokens_to_ids(DEFAULT_IMAGE_TOKEN)
223
+ self.eos_token_id = self.tokenizer.eos_token_id
224
+
225
+ @classmethod
226
+ def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
227
+ args = []
228
+ for attribute_name in cls.attributes:
229
+ class_name = getattr(cls, f"{attribute_name}_class")
230
+ if isinstance(class_name, tuple):
231
+ classes = tuple(_custom_import(n) if n is not None else None for n in class_name)
232
+ use_fast = kwargs.get("use_fast", True)
233
+ if use_fast and classes[1] is not None:
234
+ attribute_class = classes[1]
235
+ else:
236
+ attribute_class = classes[0]
237
+ else:
238
+ attribute_class = _custom_import(class_name)
239
+
240
+ args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs))
241
+ return args
242
+
243
+ def get_generation_prompt(self):
244
+ return self.generation_prompt
245
+
246
+ def get_generation_prompt_ids(self):
247
+ return self.generation_prompt_ids
248
+
249
+ def _infer_generation_prompt(self):
250
+ pseudo_message = [{"role": "user", "content": ""}]
251
+ instruction = self.apply_chat_template(pseudo_message, tokenize=False, add_generation_prompt=True)
252
+ conversation = self.apply_chat_template(pseudo_message, tokenize=False, add_generation_prompt=False)
253
+ return instruction.replace(conversation, "")
254
+
255
+ def _get_downsampled_grid_sizes(self, image_inputs: Dict[str, Any]):
256
+ grid_sizes = []
257
+ for grid_size, merge_size in zip(image_inputs.get("grid_sizes", []), image_inputs.get("merge_sizes", [])):
258
+ if not torch.all(grid_size[1:] % merge_size == 0):
259
+ warnings.warn(f"Grid size {grid_size} is not divisible by merge size {merge_size}. This may cause unexpected errors.")
260
+ if grid_size[0] == 1:
261
+ grid_sizes.append(grid_size[1:] / merge_size)
262
+ elif grid_size[0] > 1:
263
+ grid_sizes.extend([grid_size[1:] / merge_size] * grid_size[0])
264
+ return grid_sizes
265
+
266
+ def _get_visual_seq_len(self, grid_size: torch.Tensor):
267
+ num_tokens = int(grid_size.prod().item())
268
+ return num_tokens
269
+
270
+ def load_images(self, image_path: Union[str, List[str], Image.Image, List[Image.Image]]):
271
+ if isinstance(image_path, str) and os.path.isfile(image_path):
272
+ images = [Image.open(image_path).convert('RGB')]
273
+ elif isinstance(image_path, str) and os.path.isdir(image_path):
274
+ images = [Image.open(os.path.join(image_path, f)).convert('RGB') for f in sorted(os.listdir(image_path))]
275
+ elif isinstance(image_path, list) and isinstance(image_path[0], str):
276
+ images = [Image.open(f).convert('RGB') for f in image_path]
277
+ elif isinstance(image_path, list) and isinstance(image_path[0], Image.Image):
278
+ images = [np.array(x) for x in image_path]
279
+ elif isinstance(image_path, Image.Image):
280
+ images = [np.array(image_path)]
281
+ else:
282
+ raise ValueError(f"Unsupported image path type: {type(image_path)}")
283
+ return images
284
+
285
+ def load_nii(
286
+ self,
287
+ nii_path: str,
288
+ num_slices: Optional[int] = None,
289
+ axis: int = 2,
290
+ window_center: Optional[float] = None,
291
+ window_width: Optional[float] = None,
292
+ normalize: bool = True,
293
+ ):
294
+ if not NIBABEL_AVAILABLE:
295
+ raise ImportError("nibabel is required for NIfTI support. Install with: pip install nibabel")
296
+
297
+ if not os.path.exists(nii_path):
298
+ raise FileNotFoundError(f"NIfTI file not found: {nii_path}")
299
+
300
+ nii_img = nib.load(nii_path)
301
+ volume = nii_img.get_fdata()
302
+
303
+ if axis == 0:
304
+ slices = [volume[i, :, :] for i in range(volume.shape[0])]
305
+ elif axis == 1:
306
+ slices = [volume[:, i, :] for i in range(volume.shape[1])]
307
+ elif axis == 2:
308
+ slices = [volume[:, :, i] for i in range(volume.shape[2])]
309
+ else:
310
+ raise ValueError(f"Invalid axis: {axis}. Must be 0, 1, or 2.")
311
+
312
+ if num_slices is not None and num_slices < len(slices):
313
+ indices = np.linspace(0, len(slices) - 1, num_slices, dtype=int)
314
+ slices = [slices[i] for i in indices]
315
+
316
+ processed_slices = []
317
+ for slice_2d in slices:
318
+ if window_center is not None and window_width is not None:
319
+ lower = window_center - window_width / 2
320
+ upper = window_center + window_width / 2
321
+ slice_2d = np.clip(slice_2d, lower, upper)
322
+
323
+ if normalize:
324
+ slice_min = slice_2d.min()
325
+ slice_max = slice_2d.max()
326
+ if slice_max > slice_min:
327
+ slice_2d = (slice_2d - slice_min) / (slice_max - slice_min) * 255.0
328
+ else:
329
+ slice_2d = np.zeros_like(slice_2d)
330
+
331
+ slice_2d = slice_2d.astype(np.uint8)
332
+ slice_rgb = np.stack([slice_2d] * 3, axis=0)
333
+
334
+ processed_slices.append(slice_rgb)
335
+
336
+ return processed_slices
337
+
338
+ def load_video(
339
+ self,
340
+ video_path: str,
341
+ start_time: Optional[float] = None,
342
+ end_time: Optional[float] = None,
343
+ fps: Optional[float] = None,
344
+ max_frames: Optional[float] = None,
345
+ size: Optional[int] = None,
346
+ size_divisible: int = 1,
347
+ precise_time: bool = False,
348
+ verbose: bool = False,
349
+ temporal_factor: int = 1
350
+ ):
351
+ fps = self.fps if fps is None else fps
352
+ max_frames = self.max_frames if max_frames is None else max_frames
353
+
354
+ if start_time is not None and end_time is not None and end_time - start_time < 1:
355
+ return load_video_from_ids(video_path, start_time, end_time, fps=fps, max_frames=max_frames)
356
+ if os.path.isdir(video_path):
357
+ return load_video_from_ids(video_path, start_time, end_time, fps=fps, max_frames=max_frames)
358
+ if video_path.endswith('.gif'):
359
+ return load_video_from_ids(video_path, start_time, end_time, fps=fps, max_frames=max_frames)
360
+
361
+ probe = ffmpeg.probe(video_path)
362
+ duration = float(probe['format']['duration'])
363
+ video_stream = next((stream for stream in probe['streams'] if stream['codec_type'] == 'video'), None)
364
+ w, h = int(video_stream['width']), int(video_stream['height'])
365
+
366
+ kwargs, input_kwargs, output_kwargs = {}, {}, {}
367
+ do_trim = start_time is not None or end_time is not None
368
+ if start_time is not None:
369
+ new_start_time = max(float(video_stream['start_time']), start_time)
370
+ duration -= new_start_time - start_time
371
+ start_time = new_start_time
372
+ else:
373
+ start_time = float(video_stream['start_time'])
374
+ if end_time is not None:
375
+ duration = min(duration, end_time - start_time)
376
+ if do_trim:
377
+ kwargs = {'ss': start_time, 't': duration}
378
+ if precise_time:
379
+ output_kwargs.update(kwargs)
380
+ else:
381
+ input_kwargs.update(kwargs)
382
+
383
+ if size is not None:
384
+ scale_factor = size / min(w, h)
385
+ new_w, new_h = round(w * scale_factor), round(h * scale_factor)
386
+ else:
387
+ new_w, new_h = w, h
388
+ new_w = new_w // size_divisible * size_divisible
389
+ new_h = new_h // size_divisible * size_divisible
390
+
391
+ stream = ffmpeg.input(video_path, **input_kwargs)
392
+ if fps is not None:
393
+ stream = ffmpeg.filter(stream, "fps", fps=fps, round="down")
394
+ if new_w != w or new_h != h:
395
+ stream = ffmpeg.filter(stream, 'scale', new_w, new_h)
396
+ stream = ffmpeg.output(stream, "pipe:", format="rawvideo", pix_fmt="rgb24", **output_kwargs)
397
+ out, _ = ffmpeg.run(stream, capture_stdout=True, quiet=not verbose)
398
+
399
+ frames = np.frombuffer(out, np.uint8).reshape([-1, new_h, new_w, 3]).transpose([0, 3, 1, 2])
400
+
401
+ if fps is not None:
402
+ timestamps = np.arange(start_time, start_time + duration + 1 / fps, 1 / fps)[:len(frames)]
403
+ else:
404
+ timestamps = np.linspace(start_time, start_time + duration, len(frames))
405
+
406
+ if max_frames is not None and len(frames) > max_frames:
407
+ indices = np.linspace(0, len(frames) - 1, max_frames, dtype=int)
408
+ frames = frames[indices]
409
+ timestamps = timestamps[indices]
410
+
411
+ if temporal_factor > 1:
412
+ pad_length = temporal_factor - len(frames) % temporal_factor
413
+ frames = np.concatenate([frames, frames[-1:].repeat(pad_length, axis=0)])
414
+ timestamps = np.concatenate([timestamps, timestamps[-1:].repeat(pad_length) + np.arange(1, pad_length + 1) / fps])
415
+
416
+ frames = [frame for frame in frames]
417
+ timestamps = [timestamp for timestamp in timestamps]
418
+
419
+ return frames, timestamps
420
+
421
+ def _load_multimodal_data(self, conversation: Conversation):
422
+ multimodal_info = defaultdict(list)
423
+ new_conversation = []
424
+ for message in conversation:
425
+ new_message = {"role": message["role"]}
426
+ if not isinstance(message["content"], (list, tuple)):
427
+ new_message["content"] = message["content"]
428
+ new_conversation.append(new_message)
429
+ continue
430
+
431
+ new_contents = []
432
+ for content in message["content"]:
433
+ if not isinstance(content, dict):
434
+ new_contents.append(content)
435
+ continue
436
+ assert "type" in content, "Content must have 'type' field."
437
+
438
+ if content["type"] in ["image", "video", "3d"] and content["type"] in content and isinstance(content[content["type"]], dict):
439
+ load_args = content[content["type"]]
440
+ data_id = json.dumps({k: v for k, v in load_args.items() if k not in ["start_time", "end_time"]})
441
+ new_content = copy.deepcopy(content)
442
+ multimodal_info[data_id].append(new_content)
443
+ new_contents.append(new_content)
444
+ else:
445
+ new_contents.append(content)
446
+
447
+ new_message["content"] = new_contents
448
+ new_conversation.append(new_message)
449
+
450
+ for data_id, contents in multimodal_info.items():
451
+ data_type = contents[0]["type"]
452
+
453
+ if data_type == "image":
454
+ image = self.load_images(contents[0][data_type]["image_path"])[0]
455
+ for content in contents:
456
+ content["image"] = [image.copy()]
457
+
458
+ elif data_type == "3d":
459
+ load_args = contents[0]["3d"]
460
+ nii_path = load_args["image_path"]
461
+ num_slices = load_args.get("nii_num_slices", None)
462
+ axis = load_args.get("nii_axis", 2)
463
+ window_center = load_args.get("window_center", None)
464
+ window_width = load_args.get("window_width", None)
465
+
466
+ slices = self.load_nii(
467
+ nii_path=nii_path,
468
+ num_slices=num_slices,
469
+ axis=axis,
470
+ window_center=window_center,
471
+ window_width=window_width,
472
+ )
473
+
474
+ for content in contents:
475
+ content["type"] = "video"
476
+ content["video"] = slices
477
+ content["num_frames"] = len(slices)
478
+ content.pop("3d", None)
479
+
480
+ elif data_type == "video":
481
+ start_times = [content["video"].get("start_time", 0.) for content in contents]
482
+ end_times = [content["video"].get("end_time", float("inf")) for content in contents]
483
+
484
+ load_args = contents[0][data_type]
485
+ start_time, end_time = min(start_times), max(end_times)
486
+ if start_time > 0:
487
+ load_args["start_time"] = start_time
488
+ if end_time < float("inf"):
489
+ load_args["end_time"] = end_time
490
+ images, timestamps = self.load_video(**load_args)
491
+
492
+ for content, start_time, end_time in zip(contents, start_times, end_times):
493
+ cur_images, cur_timestamps = [], []
494
+ for image, timestamp in zip(images, timestamps):
495
+ if start_time <= timestamp <= end_time:
496
+ cur_images.append(image.copy())
497
+ cur_timestamps.append(timestamp)
498
+
499
+ content[data_type] = cur_images
500
+ content["num_frames"] = len(cur_images)
501
+ content["timestamps"] = cur_timestamps
502
+
503
+ return new_conversation
504
+
505
+ def _gather_multimodal_data(self, conversation: Conversation):
506
+ images = []
507
+ for message in conversation:
508
+ if not isinstance(message["content"], (list, tuple)):
509
+ continue
510
+ for content in message["content"]:
511
+ if not isinstance(content, dict):
512
+ continue
513
+ if content["type"] == "video":
514
+ video = content["video"]
515
+ assert is_valid_video(video), f"Invalid video data: {video}."
516
+ images.append(("video", video))
517
+ elif content["type"] == "image":
518
+ image = content["image"]
519
+ images.append(("image", image))
520
+ images = images if len(images) > 0 else None
521
+ return images
522
+
523
+ def _process_conversation_with_label(
524
+ self,
525
+ conversation: Conversation,
526
+ image_inputs: Dict[str, Any],
527
+ **kwargs,
528
+ ):
529
+ assert kwargs.pop("return_tensors", "pt") == "pt", "Only PyTorch tensors are supported when return_labels=True."
530
+ assert "add_generation_prompt" not in kwargs, "'add_generation_prompt' argument is not supported when return_labels=True."
531
+
532
+ output_kwargs = self._merge_kwargs(
533
+ HulumedProcessorKwargs,
534
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
535
+ **kwargs,
536
+ )
537
+ output_kwargs["chat_template_kwargs"].pop("add_generation_prompt")
538
+
539
+ grid_sizes = self._get_downsampled_grid_sizes(image_inputs)
540
+ text_inputs = {"input_ids": [], "labels": []}
541
+ sample_types_list = []
542
+ image_idx = 0
543
+
544
+ for message_idx, message in enumerate(conversation):
545
+ prompt = self.apply_chat_template(
546
+ [message],
547
+ tokenize=False,
548
+ add_generation_prompt=False,
549
+ **output_kwargs["chat_template_kwargs"],
550
+ )
551
+ prompt_chunks = prompt.split(DEFAULT_IMAGE_TOKEN)
552
+ prompt = []
553
+ for chunk_idx in range(len(prompt_chunks) - 1):
554
+ prompt.append(prompt_chunks[chunk_idx])
555
+ num_tokens = self._get_visual_seq_len(grid_sizes[image_idx])
556
+ prompt.append(DEFAULT_IMAGE_TOKEN * num_tokens)
557
+ image_idx += 1
558
+ prompt.append(prompt_chunks[-1])
559
+ prompt = "".join(prompt)
560
+
561
+ input_ids = self.tokenizer.encode(prompt, return_tensors="pt", **output_kwargs["text_kwargs"])[0]
562
+ text_inputs["input_ids"].append(input_ids)
563
+
564
+ targets = torch.full_like(input_ids, IGNORE_INDEX)
565
+ sample_types = torch.full_like(input_ids, IGNORE_INDEX)
566
+ if message["role"] == "assistant":
567
+ targets[self.generation_prompt_length:-1] = input_ids[self.generation_prompt_length:-1].clone()
568
+ elif message["role"] == "stream":
569
+ diff = torch.diff((input_ids == self.image_token_id).float())
570
+ image_end_indices = torch.nonzero(diff < 0)[:, 0]
571
+ targets[image_end_indices + 1] = input_ids[image_end_indices + 1]
572
+ sample_types = targets.clone()
573
+ sample_types[torch.logical_and(sample_types > 0, sample_types != self.eos_token_id)] = 0
574
+ targets[-2] = input_ids[-2]
575
+
576
+ if message_idx > 0 and conversation[message_idx - 1]["role"] == "stream":
577
+ targets[0] = input_ids[0]
578
+ sample_types[0] = input_ids[0]
579
+
580
+ text_inputs["labels"].append(targets)
581
+ sample_types_list.append(sample_types)
582
+
583
+ text_inputs = {k: torch.cat(v) for k, v in text_inputs.items()}
584
+ sample_types = torch.cat(sample_types_list)
585
+ types, counts = torch.unique(sample_types[sample_types > -1], return_counts=True)
586
+
587
+ if len(types) > 0:
588
+ target_num_samples = counts.amin()
589
+ for type_id, type_count in zip(types, counts):
590
+ if type_count > target_num_samples:
591
+ indices = torch.nonzero(sample_types == type_id)[:, 0]
592
+ random_selector = torch.randperm(indices.size(0))[:-target_num_samples]
593
+ text_inputs["labels"][indices[random_selector]] = IGNORE_INDEX
594
+
595
+ assert len(grid_sizes) == image_idx, "Number of images does not match the number of image tokens in the text."
596
+
597
+ return text_inputs
598
+
599
+ def _process_conversation_without_label(
600
+ self,
601
+ conversation: Conversation,
602
+ image_inputs: Dict[str, Any],
603
+ **kwargs,
604
+ ):
605
+ output_kwargs = self._merge_kwargs(
606
+ HulumedProcessorKwargs,
607
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
608
+ **kwargs,
609
+ )
610
+ prompt = self.apply_chat_template(
611
+ conversation,
612
+ tokenize=False,
613
+ **output_kwargs["chat_template_kwargs"],
614
+ )
615
+ return self.process_text(prompt, image_inputs, **output_kwargs["text_kwargs"])
616
+
617
+ def _process_conversation(
618
+ self,
619
+ conversation: Conversation,
620
+ images: Optional[Union[BatchedImage, BatchedNamedImage]] = None,
621
+ return_labels: bool = False,
622
+ **kwargs: Unpack[HulumedProcessorKwargs],
623
+ ) -> BatchFeature:
624
+ assert isinstance(conversation, list), "Conversation must be a list of messages."
625
+
626
+ if images is None:
627
+ conversation = self._load_multimodal_data(conversation)
628
+ images = self._gather_multimodal_data(conversation)
629
+
630
+ if not images:
631
+ images = None
632
+ elif isinstance(images, (list, tuple)):
633
+ images = [img for img in images if img and (not isinstance(img, (list, tuple)) or len(img) > 0)]
634
+ if not images:
635
+ images = None
636
+ output_kwargs = self._merge_kwargs(
637
+ HulumedProcessorKwargs,
638
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
639
+ **kwargs,
640
+ )
641
+
642
+ if images is not None:
643
+ if "merge_size" not in output_kwargs["images_kwargs"]:
644
+ has_video_or_3d = any(
645
+ content.get("type") in ["video", "3d"] or "video" in content or "3d" in content
646
+ for message in conversation
647
+ if isinstance(message.get("content"), list)
648
+ for content in message["content"]
649
+ if isinstance(content, dict)
650
+ )
651
+
652
+ output_kwargs["images_kwargs"]["merge_size"] = 2 if has_video_or_3d else 1
653
+
654
+ image_inputs = self.process_images(images, **output_kwargs["images_kwargs"])
655
+ else:
656
+ image_inputs = {}
657
+
658
+ if return_labels:
659
+ text_inputs = self._process_conversation_with_label(conversation, image_inputs, **kwargs)
660
+ else:
661
+ text_inputs = self._process_conversation_without_label(conversation, image_inputs, **kwargs)
662
+
663
+ return BatchFeature(data={**text_inputs, **image_inputs})
664
+
665
+ def _process_plain(
666
+ self,
667
+ text: Union[TextInput, PreTokenizedInput] = None,
668
+ images: Optional[Union[BatchedImage, BatchedNamedImage]] = None,
669
+ return_labels: bool = False,
670
+ **kwargs: Unpack[HulumedProcessorKwargs],
671
+ ) -> BatchFeature:
672
+ if text is None:
673
+ raise ValueError("You must provide 'text' or 'conversation'.")
674
+ if return_labels:
675
+ raise ValueError("return_labels is not supported for plain text processing.")
676
+
677
+ output_kwargs = self._merge_kwargs(
678
+ HulumedProcessorKwargs,
679
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
680
+ **kwargs,
681
+ )
682
+
683
+ if images is not None:
684
+ image_inputs = self.process_images(images, **output_kwargs["images_kwargs"])
685
+ else:
686
+ image_inputs = {}
687
+
688
+ text_inputs = self.process_text(text, image_inputs, **output_kwargs["text_kwargs"])
689
+
690
+ return BatchFeature(data={**text_inputs, **image_inputs})
691
+
692
+ def process_images(self, images: Union[BatchedImage, BatchedNamedImage], **kwargs):
693
+ modals, images = make_batched_images(images)
694
+
695
+ if "merge_size" not in kwargs:
696
+ kwargs["merge_size"] = [
697
+ self.video_merge_size if modal == "video" else self.image_merge_size
698
+ for modal in modals
699
+ ]
700
+
701
+ image_inputs = self.image_processor(images=images, **kwargs)
702
+ image_inputs["modals"] = modals
703
+ return image_inputs
704
+
705
+ def process_text(
706
+ self,
707
+ text: TextInput,
708
+ image_inputs: Dict[str, Any],
709
+ **kwargs,
710
+ ):
711
+ grid_sizes = self._get_downsampled_grid_sizes(image_inputs)
712
+
713
+ kwargs.pop("padding", None)
714
+ kwargs.pop("padding_side", None)
715
+
716
+ if len(grid_sizes) > 0:
717
+ image_idx = 0
718
+ while DEFAULT_IMAGE_TOKEN in text:
719
+ num_tokens = self._get_visual_seq_len(grid_sizes[image_idx])
720
+ text = text.replace(DEFAULT_IMAGE_TOKEN, "<placeholder>" * num_tokens, 1)
721
+ image_idx += 1
722
+ text = text.replace("<placeholder>", DEFAULT_IMAGE_TOKEN)
723
+
724
+ assert len(grid_sizes) == image_idx, "Number of images does not match the number of image tokens in the text."
725
+
726
+ text_inputs = self.tokenizer(text, **kwargs)
727
+ return text_inputs
728
+
729
+ def __call__(
730
+ self,
731
+ text: Optional[TextInput] = None,
732
+ conversation: Optional[Conversation] = None,
733
+ images: Optional[Union[BatchedImage, BatchedNamedImage]] = None,
734
+ return_labels: bool = False,
735
+ **kwargs: Unpack[HulumedProcessorKwargs],
736
+ ) -> BatchFeature:
737
+ if conversation is not None:
738
+ if text is not None:
739
+ raise ValueError("You cannot provide both 'conversation' and 'text'.")
740
+ return self._process_conversation(conversation, images, return_labels, **kwargs)
741
+ return self._process_plain(text, images, return_labels, **kwargs)
742
+
743
+ def batch_decode(self, *args, skip_special_tokens=True, use_think=False, **kwargs):
744
+ outputs = self.tokenizer.batch_decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)
745
+
746
+ if not use_think:
747
+ outputs = [self._remove_think_tags(output) for output in outputs]
748
+
749
+ return outputs
750
+
751
+ def decode(self, *args, skip_special_tokens=True, use_think=False, **kwargs):
752
+ output = self.tokenizer.decode(*args, skip_special_tokens=skip_special_tokens, **kwargs)
753
+
754
+ if not use_think:
755
+ output = self._remove_think_tags(output)
756
+
757
+ return output
758
+
759
+ def _remove_think_tags(self, text: str) -> str:
760
+ import re
761
+ pattern = r'<think>.*?</think>'
762
+ cleaned = re.sub(pattern, '', text, flags=re.DOTALL)
763
+ cleaned = re.sub(r'\n\s*\n', '\n\n', cleaned)
764
+ cleaned = cleaned.strip()
765
+ return cleaned
766
+
767
+ def apply_chat_template(
768
+ self,
769
+ conversation: Conversation,
770
+ chat_template: Optional[str] = None,
771
+ tokenize: bool = False,
772
+ add_system_prompt: bool = False,
773
+ add_generation_prompt: bool = False,
774
+ image_token: Optional[str] = DEFAULT_IMAGE_TOKEN,
775
+ **kwargs,
776
+ ) -> str:
777
+ if chat_template is None:
778
+ if self.chat_template is not None:
779
+ chat_template = self.chat_template
780
+ else:
781
+ raise ValueError(
782
+ "No chat template is set for this processor. Please either set the `chat_template` attribute, "
783
+ "or provide a chat template as an argument."
784
+ )
785
+ return self.tokenizer.apply_chat_template(
786
+ conversation,
787
+ chat_template=chat_template,
788
+ tokenize=tokenize,
789
+ add_system_prompt=add_system_prompt,
790
+ add_generation_prompt=add_generation_prompt,
791
+ image_token=image_token,
792
+ **kwargs
793
+ )
794
+
795
+ @property
796
+ def model_input_names(self):
797
+ tokenizer_input_names = self.tokenizer.model_input_names
798
+ image_processor_input_names = self.image_processor.model_input_names
799
+ return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + ["modals"]
800
+
801
+ def _merge_kwargs(
802
+ self,
803
+ ModelProcessorKwargs: ProcessingKwargs,
804
+ tokenizer_init_kwargs: Optional[Dict] = None,
805
+ **kwargs,
806
+ ) -> Dict[str, Dict]:
807
+ output_kwargs = {
808
+ "text_kwargs": {},
809
+ "images_kwargs": {},
810
+ "audio_kwargs": {},
811
+ "videos_kwargs": {},
812
+ "chat_template_kwargs": {},
813
+ "common_kwargs": {},
814
+ }
815
+
816
+ default_kwargs = {
817
+ "text_kwargs": {},
818
+ "images_kwargs": {},
819
+ "audio_kwargs": {},
820
+ "videos_kwargs": {},
821
+ "chat_template_kwargs": {},
822
+ "common_kwargs": {},
823
+ }
824
+
825
+ used_keys = set()
826
+
827
+ for modality in default_kwargs:
828
+ default_kwargs[modality] = ModelProcessorKwargs._defaults.get(modality, {}).copy()
829
+ for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__.keys():
830
+ if modality_key in tokenizer_init_kwargs:
831
+ value = (
832
+ getattr(self.tokenizer, modality_key)
833
+ if hasattr(self.tokenizer, modality_key)
834
+ else tokenizer_init_kwargs[modality_key]
835
+ )
836
+ default_kwargs[modality][modality_key] = value
837
+
838
+ output_kwargs.update(default_kwargs)
839
+
840
+ non_modality_kwargs = set(kwargs) - set(output_kwargs)
841
+ for modality in output_kwargs:
842
+ for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__.keys():
843
+ if modality in kwargs:
844
+ kwarg_value = kwargs[modality].pop(modality_key, "__empty__")
845
+ if kwarg_value != "__empty__" and modality_key in non_modality_kwargs:
846
+ raise ValueError(
847
+ f"Keyword argument {modality_key} was passed twice: "
848
+ f"in a dictionary for {modality} and as a **kwarg."
849
+ )
850
+ elif modality_key in kwargs:
851
+ kwarg_value = kwargs.get(modality_key, "__empty__")
852
+ else:
853
+ kwarg_value = "__empty__"
854
+ if kwarg_value != "__empty__":
855
+ output_kwargs[modality][modality_key] = kwarg_value
856
+ used_keys.add(modality_key)
857
+
858
+ if any(key in default_kwargs for key in kwargs):
859
+ for modality, subdict in kwargs.items():
860
+ if modality in default_kwargs:
861
+ for subkey, subvalue in subdict.items():
862
+ if subkey not in used_keys:
863
+ output_kwargs[modality][subkey] = subvalue
864
+ used_keys.add(subkey)
865
+ else:
866
+ for key in kwargs:
867
+ if key not in used_keys:
868
+ output_kwargs["common_kwargs"][key] = kwargs[key]
869
+
870
+ for modality in output_kwargs:
871
+ output_kwargs[modality].update(output_kwargs["common_kwargs"])
872
+
873
+ return output_kwargs
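
Because _load_multimodal_data converts a "3d" entry into a "video" built from windowed, uniformly sampled slices, a NIfTI volume can be passed directly inside the conversation. A sketch with the processor loaded as in the earlier example; the file path and windowing values below are illustrative, not from the repository:

conversation = [
    {"role": "user", "content": [
        {"type": "3d", "3d": {
            "image_path": "ct_chest.nii.gz",   # hypothetical local NIfTI file
            "nii_num_slices": 32,              # uniformly sampled along nii_axis
            "nii_axis": 2,                     # axial slices
            "window_center": 40.0,             # illustrative soft-tissue window
            "window_width": 400.0,
        }},
        {"type": "text", "text": "Is there evidence of pleural effusion?"},
    ]},
]
inputs = processor(conversation=conversation, add_generation_prompt=True, return_tensors="pt")
# The slices are treated as video frames, so the video merge size (2) and the
# per-frame <image> expansion in the chat template apply to the volume.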
processor_config.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_hulumed.HulumedProcessor"
4
+ },
5
+ "fps": 1,
6
+ "image_merge_size": 1,
7
+ "max_frames": 128,
8
+ "processor_class": "HulumedProcessor",
9
+ "video_merge_size": 2
10
+ }
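
The fps and max_frames defaults interact as in load_video: frames are first sampled at fps, then uniformly capped at max_frames. Illustrative arithmetic, not repository code:

fps, max_frames = 1, 128           # defaults from processor_config.json
video_seconds = 600                # e.g. a 10-minute video
sampled = int(video_seconds * fps) # 600 frames at 1 fps
kept = min(sampled, max_frames)    # uniformly subsampled down to 128 frames
print(sampled, kept)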
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
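
A quick way to confirm that these special tokens resolve to the ids declared in tokenizer_config.json (a sketch; assumes the processor loaded earlier):

tok = processor.tokenizer
print(tok.convert_tokens_to_ids("<|im_end|>"))     # 151645, the eos token
print(tok.convert_tokens_to_ids("<|endoftext|>"))  # 151643, the pad token
print(tok.convert_tokens_to_ids("<image>"))        # 151665, the image placeholder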
tokenizer_config.json ADDED
@@ -0,0 +1,233 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<image>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "151666": {
190
+ "content": "<|stream_start|>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "151667": {
198
+ "content": "<|stream_end|>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": true
204
+ }
205
+ },
206
+ "additional_special_tokens": [
207
+ "<|im_start|>",
208
+ "<|im_end|>",
209
+ "<|object_ref_start|>",
210
+ "<|object_ref_end|>",
211
+ "<|box_start|>",
212
+ "<|box_end|>",
213
+ "<|quad_start|>",
214
+ "<|quad_end|>",
215
+ "<|vision_start|>",
216
+ "<|vision_end|>",
217
+ "<|vision_pad|>",
218
+ "<|image_pad|>",
219
+ "<|video_pad|>"
220
+ ],
221
+ "bos_token": null,
222
+ "chat_template": "\n{%- set identifier = 'im' %}\n{% for message in messages %}\n {% if message['role'] == 'stream' %}\n {% set identifier = 'stream' %}\n {% else %}\n {% set identifier = 'im' %}\n {% endif %}\n {{- '<|' + identifier + '_start|>' + message['role'] + '\n' -}}\n {% if message['content'] is string %}\n {{- message['content'] + '<|' + identifier + '_end|>\n' -}}\n {% else %}\n {% for content in message['content'] %}\n {% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}\n {% if 'time' in content %}\n {{- 'Time ' + content['time'] | round(1) | string + 's: ' -}}\n {% endif %}\n\n {{- '<image>\n' -}}\n\n {% elif content['type'] == 'video' or 'video' in content or 'video_url' in content %}\n {% for i in range(content['num_frames']) %}\n {% if 'timestamps' in content %}\n {{- 'Time ' + content['timestamps'][i] | round(1) | string + 's:' -}}\n {% endif %}\n {% if i < content['num_frames'] - 1 %}\n\n {{- '<image>,' -}}\n\n {% else %}\n\n {{- '<image>\n' -}}\n\n {% endif %}\n {% endfor %}\n {% elif content['type'] == 'text' or 'text' in content %}\n {{- content['text'] -}}\n {% endif %}\n {% endfor %}\n {{- '<|' + identifier + '_end|>\n' -}}\n {% endif %}\n{% endfor %}\n{% if add_generation_prompt %}\n {{- '<|im_start|>assistant\n' -}}\n{% endif %}\n",
223
+ "clean_up_tokenization_spaces": false,
224
+ "eos_token": "<|im_end|>",
225
+ "errors": "replace",
226
+ "extra_special_tokens": {},
227
+ "model_max_length": 16384,
228
+ "pad_token": "<|endoftext|>",
229
+ "padding_side": "right",
230
+ "split_special_tokens": false,
231
+ "tokenizer_class": "Qwen2Tokenizer",
232
+ "unk_token": null
233
+ }
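
The chat_template above is what HulumedProcessor.apply_chat_template renders before tokenization: one <image> placeholder per frame, an optional "Time ...s:" prefix when timestamps are present, and an <|im_start|>assistant header when add_generation_prompt is set. A small sketch to inspect the rendered prompt (message content is illustrative; no actual frames are needed just to render the template):

messages = [
    {"role": "user", "content": [
        {"type": "video", "num_frames": 2, "timestamps": [0.0, 1.0]},
        {"type": "text", "text": "What changes between these two frames?"},
    ]},
]
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)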
vocab.json ADDED
The diff for this file is too large to render. See raw diff