Image-Text-to-Text
Transformers
Safetensors
youtu_vl
text-generation
conversational
custom_code
Yinsongliu committed on
Commit
c13c3aa
·
1 Parent(s): fdcd432

Upload model with LFS assets

.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.png filter=lfs diff=lfs merge=lfs -text
37
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
LICENSE.txt ADDED
@@ -0,0 +1,200 @@
1
+ Tencent is pleased to support the community by making Youtu-Parsing available.
2
+
3
+ Copyright (C) 2026 Tencent. All rights reserved. Youtu-Parsing IS NOT INTENDED FOR USE WITHIN THE EUROPEAN UNION.
4
+
5
+ Youtu-Parsing is licensed under the License Terms of Youtu-Parsing except for the third-party components listed below, which is licensed under different terms. Youtu-Parsing does not impose any additional limitations beyond what is outlined in the respective licenses of these third-party components. Users must comply with all terms and conditions of original licenses of these third-party components and must ensure that the usage of the third party components adheres to all relevant laws and regulations.
6
+
7
+ For avoidance of doubts, Youtu-Parsing refers to the inference enabling code, parameters and weights made publicly available by Tencent in accordance with the License Terms of Youtu-Parsing in this repository.
8
+
9
+ Terms of the License Terms of Youtu-Parsing:
10
+ --------------------------------------------------------------------
11
+
12
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
13
+ 0. Additional Territorial Limitation
14
+
15
+ *Youtu-Parsing IS NOT INTENDED FOR USE WITHIN THE EUROPEAN UNION.*
16
+ IN THE EVENT OF ANY CONFLICT, THIS CLAUSE SHALL PREVAIL.
17
+
18
+ 1. Definitions.
19
+
20
+ "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
21
+
22
+ "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
23
+
24
+ "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
25
+
26
+ "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
27
+
28
+ "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
31
+
32
+ "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
33
+
34
+ "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
35
+
36
+ "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
37
+
38
+ "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
39
+
40
+ 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
41
+
42
+ 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
43
+
44
+ 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
45
+
46
+ You must give any other recipients of the Work or Derivative Works a copy of this License; and
47
+
48
+ You must cause any modified files to carry prominent notices stating that You changed the files; and
49
+
50
+ You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
51
+
52
+ If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
53
+
54
+ You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
55
+
56
+ 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
57
+
58
+ 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
59
+
60
+ 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
61
+
62
+ 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
63
+
64
+ 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
65
+
66
+ END OF TERMS AND CONDITIONS
67
+
68
+
69
+ The Code of this project is built on and with the aid of the following open source projects. Credits are given to these projects.
70
+
71
+ The below software in this distribution may have been modified by Tencent ("Tencent Modifications"). All Tencent Modifications
72
+ are Copyright(C)Tencent.
73
+
74
+ Open Source Software Licensed under the Apache-2.0:
75
+ --------------------------------------------------------------------
76
+ 1. transformers
77
+ Copyright 2018- The Hugging Face team.
78
+
79
+ 2. Opencv
80
+ Copyright (c) 2025 opencv original author and authors
81
+
82
+ 3. vLLM
83
+ Copyright (c) 2025 vllm original author and authors
84
+ Terms of the Apache-2.0:
85
+ --------------------------------------------------------------------
86
+ Apache License
87
+ Version 2.0, January 2004
88
+ http://www.apache.org/licenses/
89
+
90
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
91
+
92
+ 1. Definitions.
93
+
94
+ "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document.
95
+
96
+ "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License.
97
+
98
+ "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity.
99
+
100
+ "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License.
101
+
102
+ "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files.
103
+
104
+ "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types.
105
+
106
+ "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below).
107
+
108
+ "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof.
109
+
110
+ "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution."
111
+
112
+ "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work.
113
+
114
+ 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form.
115
+
116
+ 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed.
117
+
118
+ 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions:
119
+
120
+ You must give any other recipients of the Work or Derivative Works a copy of this License; and
121
+ You must cause any modified files to carry prominent notices stating that You changed the files; and
122
+ You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and
123
+ If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License.
124
+ You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License.
125
+
126
+ 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions.
127
+
128
+ 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file.
129
+
130
+ 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License.
131
+
132
+ 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages.
133
+
134
+ 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability.
135
+
136
+ END OF TERMS AND CONDITIONS
137
+
138
+
139
+
140
+
141
+
142
+ Open Source Software Licensed under the BSD-3-Clause:
143
+ --------------------------------------------------------------------
144
+ 1. numpy
145
+ Copyright (c) 2005-2024, NumPy Developers.
146
+
147
+ 2. scipy
148
+ Copyright (c) 2001-2002 Enthought, Inc. 2003-2024, SciPy Developers.
149
+
150
+ 3. torch
151
+ Copyright (c) 2016- Facebook, Inc (Adam Paszke); Copyright (c) 2014- Facebook, Inc (Soumith Chintala); Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert); Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu); Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu); Copyright (c) 2011-2013 NYU (Clement Farabet); Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston); Copyright (c) 2006 Idiap Research Institute (Samy Bengio); Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz); Copyright (c) 2016-present, Facebook Inc. All rights reserved.; Copyright (c) 2016 Facebook Inc.; Copyright (c) 2015 Google Inc.; Copyright (c) 2015 Yangqing Jia; Copyright 2019-2020 Kakao Brain; Copyright (c) 2022 Cruise LLC.; Copyright (c) 2024 Tri Dao.; Copyright (c) 2021, 2023-2024 Arm Limited and/or its affiliates; Copyright(c) 2013, 2014, 2015, the respective contributors; Copyright(c) 2015, 2016 the respective contributors
152
+
153
+ 4. torchvision
154
+ Copyright (c) Soumith Chintala 2016,
155
+ Terms of the BSD-3-Clause:
156
+ --------------------------------------------------------------------
157
+ BSD 3-Clause License
158
+
159
+ Redistribution and use in source and binary forms, with or without
160
+ modification, are permitted provided that the following conditions are met:
161
+
162
+ 1. Redistributions of source code must retain the above copyright notice, this
163
+ list of conditions and the following disclaimer.
164
+
165
+ 2. Redistributions in binary form must reproduce the above copyright notice,
166
+ this list of conditions and the following disclaimer in the documentation
167
+ and/or other materials provided with the distribution.
168
+
169
+ 3. Neither the name of the copyright holder nor the names of its
170
+ contributors may be used to endorse or promote products derived from
171
+ this software without specific prior written permission.
172
+
173
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
174
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
175
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
176
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
177
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
178
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
179
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
180
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
181
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
182
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
183
+
184
+
185
+
186
+
187
+
188
+ Open Source Software Licensed under the MIT-CMU:
189
+ --------------------------------------------------------------------
190
+ 1. Pillow
191
+ Copyright © 1997-2011 by Secret Labs AB; Copyright © 1995-2011 by Fredrik Lundh and contributors; Copyright © 2010 by Jeffrey A. Clark and contributors
192
+ Terms of the MIT-CMU:
193
+ --------------------------------------------------------------------
194
+ By obtaining, using, and/or copying this software and/or its associated documentation, you agree that you have read, understood, and will comply with the following terms and conditions:
195
+ Permission to use, copy, modify, and distribute this software and its associated documentation for any purpose and without fee is hereby granted, provided that the above copyright notice appears in all copies, and that both that copyright notice and this permission notice appear in supporting documentation, and that the name of the copyright holder not be used in advertising or publicity pertaining to distribution of the software without specific, written prior permission.
196
+
197
+ THE COPYRIGHT HOLDER DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE COPYRIGHT HOLDER BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM THE LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
198
+
199
+ ==================================================
200
+ End of the Attribution Notice of this project.
README.md ADDED
@@ -0,0 +1,183 @@
1
+ ---
2
+ license: other
3
+ license_name: youtu-parsing
4
+ license_link: https://huggingface.co/tencent/Youtu-Parsing/blob/main/LICENSE.txt
5
+ pipeline_tag: image-text-to-text
6
+ base_model:
7
+ - tencent/Youtu-LLM-2B
8
+ base_model_relation: finetune
9
+ ---
10
+ <div align="center">
11
+
12
+ # <img src="assets/youtu-parsing-logo.png" alt="Youtu-Parsing Logo" height="100px">
13
+
14
+ [📃 License](https://huggingface.co/tencent/Youtu-Parsing/blob/main/LICENSE.txt) • [💻 Code](https://github.com/TencentCloudADP/youtu-parsing) • [📊 Benchmarks](#benchmarks) • [🚀 Getting Started](#quickstart)
15
+
16
+ </div>
17
+
18
+ <div align="center">
19
+ <img src="./assets/static_v40.png" width="800"/>
20
+ </div>
21
+
22
+
23
+ ## 🎯 Introduction
24
+
25
+ **Youtu-Parsing** is a specialized document parsing model built on the open-source Youtu-LLM 2B foundation model. By extending the base model with a prompt-guided framework and a NaViT-style dynamic visual encoder, Youtu-Parsing delivers enhanced parsing of diverse document elements, including text, tables, formulas, and charts. The model incorporates an efficient parallel decoding mechanism that significantly accelerates inference, making it practical for real-world document analysis applications. We share Youtu-Parsing with the community to facilitate research and development in document understanding.
26
+
27
+
28
+ ## ✨ Key Features
29
+ ### 📄 **Document Structure Preservation**
30
+ - **Text Localization**: Accurately detects and localizes text regions with pixel-level precision, ensuring no content is missed or misplaced across diverse document layouts.
31
+ - **Reading Order Restoration**: Intelligently reconstructs the logical reading sequence of document content, maintaining proper flow across columns, sections, and pages for coherent understanding.
32
+
33
+ ### 📊 **Advanced Content Recognition**
34
+ - **Text Recognition**: Provides precise text recognition across diverse scenarios.
35
+ - **Formula Recognition**: Automatically converts mathematical expressions to LaTeX format.
36
+ - **Table Recognition**: Automatically detects tables and converts them to HTML format.
37
+ - **Chart Recognition**: Converts charts to Markdown tables, and converts mind maps and flowcharts to Mermaid format.
38
+
39
+ ### ⚡ **High-Performance Inference**
40
+ - **Token Parallelism**: Enables simultaneous inference of multiple tokens for accelerated processing, achieving a 5-11x speedup.
41
+ - **Query Parallelism**: Batches multiple queries together to maximize the benefit of Token Parallelism, providing an additional 2x speedup on top of the Token Parallelism acceleration.
42
+
43
+ <div align="center">
44
+ <img src="./assets/parallel_decoder.png" width="800"/>
45
+ </div>
46
+
47
+ <a id="benchmarks"></a>
48
+
49
+ ## 📊 Performance
50
+ ### 1. OmniDocBench v1.5
51
+
52
+ <div align="center">
53
+ <img src="./assets/OminiDocBench_v1.5.png" width="800"/>
54
+ </div>
55
+
56
+ ### 2. olmOCR
57
+ <div align="center">
58
+ <img src="./assets/olmOCR.png" width="800"/>
59
+ </div>
60
+
61
+
62
+ <a id="quickstart"></a>
63
+
64
+ ## 🚀 Quick Start
65
+ ### Install packages
66
+ ```bash
67
+ conda create -n youtu_parsing python=3.10
68
+ conda activate youtu_parsing
69
+ pip install git+https://github.com/TencentCloudADP/youtu-parsing.git#subdirectory=youtu_hf_parser
70
+
71
+ # Install flash-attn (FlashAttention-2)
72
+ # For CUDA 12.x + PyTorch 2.6 + Python 3.10 + Linux x86_64:
73
+ pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
74
+
75
+ # Alternative: Install from PyPI
76
+ pip install flash-attn==2.7.0
77
+ ```
78
+
79
+ ### Usage with transformers
80
+ ```python
81
+ from youtu_hf_parser import YoutuOCRParserHF
82
+
83
+ # Initialize the parser
84
+ parser = YoutuOCRParserHF(
85
+ model_path=model_path,
86
+ enable_angle_correct=True, # Set to False to disable angle correction
87
+ angle_correct_model_path=angle_correct_model_path
88
+ )
89
+
90
+ # Parse an image
91
+ parser.parse_file(input_path=image_path, output_dir=output_dir)
92
+ ```
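In the snippet above, `model_path`, `angle_correct_model_path`, `image_path`, and `output_dir` are placeholders for paths you supply. Because `config.json` also registers the custom classes via `auto_map`, the underlying model and processor can be loaded directly with `transformers` as well; the following is a minimal sketch (not the official entry point), assuming `trust_remote_code=True` and the repo id `tencent/Youtu-Parsing`:

```python
# Hedged sketch: load the raw vision-language model and processor directly.
# The YoutuOCRParserHF wrapper above remains the recommended interface.
from transformers import AutoModelForCausalLM, AutoProcessor

model_id = "tencent/Youtu-Parsing"  # or a local checkpoint directory
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    torch_dtype="bfloat16",  # weights are shipped in bfloat16 (see config.json)
)
```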
93
+
94
+ ## 🎨 Visualization
95
+ ### Text Recognition
96
+ <div align="center">
97
+ <img src="./assets/handwriting.png" width="800"/>
98
+ </div>
99
+
100
+ <div align="center">
101
+ <img src="./assets/art.png" width="800"/>
102
+ </div>
103
+
104
+ <div align="center">
105
+ <img src="./assets/printed.png" width="800"/>
106
+ </div>
107
+
108
+ ### Formula Recognition
109
+ <div align="center">
110
+ <img src="./assets/formula.png" width="800"/>
111
+ </div>
112
+
113
+ ### Table Recognition
114
+ <div align="center">
115
+ <img src="./assets/wired.png" width="800"/>
116
+ </div>
117
+
118
+ <div align="center">
119
+ <img src="./assets/wireless.png" width="800"/>
120
+ </div>
121
+
122
+ ### Chart Recognition
123
+ <div align="center">
124
+ <img src="./assets/chart.png" width="800"/>
125
+ </div>
126
+
127
+ <div align="center">
128
+ <img src="./assets/chart2.png" width="800"/>
129
+ </div>
130
+
131
+ ### Seal Recognition
132
+ <div align="center">
133
+ <img src="./assets/seal.png" width="800"/>
134
+ </div>
135
+
136
+ ### Hierarchical Structure Analysis
137
+ <div align="center">
138
+ <img src="./assets/hierarchical.png" width="800"/>
139
+ </div>
140
+
141
+ <div align="center">
142
+ <img src="./assets/hierarchical2.png" width="800"/>
143
+ </div>
144
+
145
+ ## 🤝 Acknowledgements
146
+ We would like to thank [Youtu-LLM](https://github.com/TencentCloudADP/youtu-tip/tree/master/youtu-llm), [OmniDocBench](https://github.com/opendatalab/OmniDocBench), [olmOCR](https://github.com/allenai/olmocr), [dots.ocr](https://github.com/rednote-hilab/dots.ocr), [MinerU](https://github.com/opendatalab/MinerU), [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR), [PSENet](https://github.com/whai362/PSENet) for providing model weights, benchmarks and valuable code. We also appreciate everyone's contribution to this open-source project!
147
+
148
+
149
+
150
+ ## 📚 Citation
151
+
152
+ If you find our work useful in your research, please consider citing the following papers:
153
+ ```
154
+ @article{youtu-parsing,
155
+ title={Youtu-Parsing: Perception, Structuring and Recognition via High-Parallelism Decoding},
156
+ author={Tencent Youtu Lab},
157
+ year={2026},
158
+ eprint={},
159
+ archivePrefix={},
160
+ primaryClass={},
161
+ url={},
162
+ }
163
+
164
+ @article{youtu-vl,
165
+ title={Youtu-VL: Unleashing Visual Potential via Unified Vision-Language Supervision},
166
+ author={Tencent Youtu Lab},
167
+ year={2026},
168
+ eprint={},
169
+ archivePrefix={},
170
+ primaryClass={},
171
+ url={},
172
+ }
173
+
174
+ @article{youtu-llm,
175
+ title={Youtu-LLM: Unlocking the Native Agentic Potential for Lightweight Large Language Models},
176
+ author={Tencent Youtu Lab},
177
+ year={2025},
178
+ eprint={2512.24618},
179
+ archivePrefix={arXiv},
180
+ primaryClass={cs.CL},
181
+ url={https://arxiv.org/abs/2512.24618},
182
+ }
183
+ ```
__init__.py ADDED
@@ -0,0 +1,31 @@
1
+ # Copyright 2025 The Youtu Team and The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import TYPE_CHECKING
15
+ from transformers.utils import _LazyModule
16
+ from transformers.utils.import_utils import define_import_structure
17
+
18
+ if TYPE_CHECKING:
19
+ from .configuration_youtu_vl import *
20
+ from .modeling_youtu_vl import *
21
+ from .processing_youtu_vl import *
22
+ from .configuration_siglip2 import *
23
+ from .image_processing_siglip2_fast import *
24
+ from .modeling_siglip2 import *
25
+ else:
26
+ import sys
27
+
28
+ _file = globals()["__file__"]
29
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
30
+
31
+
assets/OminiDocBench_v1.5.png ADDED

Git LFS Details

  • SHA256: c98f06c02028df4120e77f53cf6b8f3bd30deb6006432f047f5567c88dbe471c
  • Pointer size: 131 Bytes
  • Size of remote file: 506 kB
assets/art.png ADDED

Git LFS Details

  • SHA256: 0215c94c4b0130f7ef39ae4f69f17a1ea1ac9ed53a6eb76ffc89a56e6c76f5c7
  • Pointer size: 131 Bytes
  • Size of remote file: 784 kB
assets/chart.png ADDED

Git LFS Details

  • SHA256: 6f80a5635f58bedd2c19896b796e0efe81c15b0a053831671edc230d8333aa1f
  • Pointer size: 131 Bytes
  • Size of remote file: 588 kB
assets/chart2.png ADDED

Git LFS Details

  • SHA256: 71412df8dfcfd7a8a6f28cbcbcbd516716a5ee9ed44f64a7c209e70d6fc2838d
  • Pointer size: 131 Bytes
  • Size of remote file: 812 kB
assets/formula.png ADDED

Git LFS Details

  • SHA256: 0a61834add1b26c6c59b808e101c6f91dbee873a184a74a6cb6177016c670c8f
  • Pointer size: 132 Bytes
  • Size of remote file: 1.68 MB
assets/handwriting.png ADDED

Git LFS Details

  • SHA256: b727347801c2885c5b3eee0a86de7a7edfb0bc739754350f464622c930150908
  • Pointer size: 132 Bytes
  • Size of remote file: 1.18 MB
assets/hierarchical.png ADDED

Git LFS Details

  • SHA256: c007c2e869fdb9228663a03b38a80eb539bcae42647660aed6794fd6a4c5a142
  • Pointer size: 132 Bytes
  • Size of remote file: 1.52 MB
assets/hierarchical2.png ADDED

Git LFS Details

  • SHA256: 6c94e8cef1e74977d5b441cb1259b08e5fad12f04800dff75400cd2126812364
  • Pointer size: 132 Bytes
  • Size of remote file: 2.23 MB
assets/olmOCR.png ADDED

Git LFS Details

  • SHA256: 4dd11ce77da4b653b063115c1e7a1821e29a8c78dd57da8dd4898403a153173b
  • Pointer size: 131 Bytes
  • Size of remote file: 391 kB
assets/parallel_decoder.png ADDED

Git LFS Details

  • SHA256: 2f8ff7eac559b195c6f6e21aed23938c28612ded6892c04e357c2a65a87d4f76
  • Pointer size: 131 Bytes
  • Size of remote file: 303 kB
assets/printed.png ADDED

Git LFS Details

  • SHA256: 41716e85a2e29a6844e10c4b46d1ff56d773fa9aa46fe66480b40e7e99e44f23
  • Pointer size: 132 Bytes
  • Size of remote file: 2.23 MB
assets/seal.png ADDED

Git LFS Details

  • SHA256: 5f23fbb7226227a8592c5eb0dc355db372f619ad0bfc5439494648518a6f64a2
  • Pointer size: 131 Bytes
  • Size of remote file: 492 kB
assets/static_v40.png ADDED

Git LFS Details

  • SHA256: 6b115114ec8d011c84da118b202f76586919fdeb8a7c0283c31e6d7c3369b9b4
  • Pointer size: 131 Bytes
  • Size of remote file: 612 kB
assets/wired.png ADDED

Git LFS Details

  • SHA256: 02cf6f58694e62bfd57e2dd00ccc4d359d1536f055e392f98d29707ca80b32dd
  • Pointer size: 132 Bytes
  • Size of remote file: 3.15 MB
assets/wireless.png ADDED

Git LFS Details

  • SHA256: 3de3a6a34eeec6a278d6eaa97d97c4fa7546c39a3d1995de62374038817ad5b7
  • Pointer size: 131 Bytes
  • Size of remote file: 848 kB
assets/youtu-parsing-logo.png ADDED

Git LFS Details

  • SHA256: caa339d0dc0c3f144e14a48708502d587ac581c83709fbac776c34279b81f21d
  • Pointer size: 131 Bytes
  • Size of remote file: 446 kB
chat_template.json ADDED
@@ -0,0 +1,3 @@
1
+ {
2
+ "chat_template": "{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|begin_of_text|>system\nYou are a helpful assistant.<|end_of_text|>\n{% endif %}<|begin_of_text|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|end_of_text|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|end_of_text|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|begin_of_text|>assistant\n{% endif %}"
3
+ }
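The template above defines the conversational format: each turn is wrapped in `<|begin_of_text|> ... <|end_of_text|>`, a default system prompt is prepended when none is given, and every image placeholder expands to `<|vision_start|><|image_pad|><|vision_end|>`. A minimal sketch of rendering it with the processor's `apply_chat_template` (assuming the processor is loaded from `tencent/Youtu-Parsing` with `trust_remote_code=True`; the message fields mirror the keys the template checks):

```python
# Hedged sketch: render the chat template for one image + one text turn.
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("tencent/Youtu-Parsing", trust_remote_code=True)

messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "page.png"},            # -> <|vision_start|><|image_pad|><|vision_end|>
            {"type": "text", "text": "Parse this document."},
        ],
    }
]

prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# prompt begins with the default system turn and ends with "<|begin_of_text|>assistant\n"
```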
config.json ADDED
@@ -0,0 +1,70 @@
1
+ {
2
+ "architectures": [
3
+ "YoutuVLForConditionalGeneration"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_youtu_vl.YoutuVLConfig",
9
+ "AutoModelForCausalLM": "modeling_youtu_vl.YoutuVLForConditionalGeneration",
10
+ "AutoProcessor": "processing_youtu_vl.YoutuVLProcessor",
11
+ "AutoImageProcessor": "image_processing_siglip2_fast.Siglip2ImageProcessorFast"
12
+ },
13
+ "bos_token_id": 128000,
14
+ "embedding_initializer_range": 0.02795084971874737,
15
+ "eos_token_id": 128001,
16
+ "head_dim": 64,
17
+ "hidden_act": "silu",
18
+ "hidden_size": 2048,
19
+ "image_token_id": 128264,
20
+ "initializer_range": 0.013975424859373685,
21
+ "intermediate_size": 6144,
22
+ "kv_lora_rank": 512,
23
+ "max_position_embeddings": 20480,
24
+ "mlp_bias": false,
25
+ "model_type": "youtu_vl",
26
+ "num_attention_heads": 16,
27
+ "num_hidden_layers": 32,
28
+ "num_key_value_heads": 16,
29
+ "pad_token_id": 128001,
30
+ "q_lora_rank": 1536,
31
+ "qk_head_dim": 192,
32
+ "qk_nope_head_dim": 128,
33
+ "qk_rope_head_dim": 64,
34
+ "rms_norm_eps": 1e-06,
35
+ "rope_interleave": true,
36
+ "rope_theta": 100000,
37
+ "tie_word_embeddings": true,
38
+ "torch_dtype": "bfloat16",
39
+ "transformers_version": "4.56.0",
40
+ "use_cache": false,
41
+ "v_head_dim": 128,
42
+ "video_token_id": 128265,
43
+ "vision_config": {
44
+ "attention_dropout": 0.0,
45
+ "hidden_act": "gelu_pytorch_tanh",
46
+ "hidden_size": 1152,
47
+ "intermediate_size": 4304,
48
+ "layer_norm_eps": 1e-06,
49
+ "model_type": "siglip2_vision_model",
50
+ "num_attention_heads": 16,
51
+ "num_channels": 3,
52
+ "num_hidden_layers": 27,
53
+ "num_patches": 4096,
54
+ "out_hidden_size": 2048,
55
+ "patch_size": 16,
56
+ "tokens_per_second": 2,
57
+ "torch_dtype": "bfloat16",
58
+ "vision_use_head": false,
59
+ "window_size": 256,
60
+ "fullatt_block_indexes": [
61
+ 7,
62
+ 15,
63
+ 23,
64
+ 26
65
+ ]
66
+ },
67
+ "vision_end_token_id": 128263,
68
+ "vision_start_token_id": 128262,
69
+ "vocab_size": 182646
70
+ }
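The attention settings above use low-rank query and key-value projections (`q_lora_rank`, `kv_lora_rank`) with split rotary/non-rotary head dimensions, so `qk_head_dim` is the sum of `qk_nope_head_dim` and `qk_rope_head_dim` (128 + 64 = 192). A rough sketch of loading the config through its `auto_map` entry and checking a few of these fields (assumes `trust_remote_code=True` and the repo id `tencent/Youtu-Parsing`):

```python
# Hedged sketch: load the custom YoutuVLConfig and inspect derived dimensions.
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("tencent/Youtu-Parsing", trust_remote_code=True)

assert cfg.qk_head_dim == cfg.qk_nope_head_dim + cfg.qk_rope_head_dim   # 128 + 64 == 192
print(cfg.hidden_size, cfg.num_hidden_layers, cfg.num_attention_heads)  # 2048 32 16
print(cfg.vision_config.hidden_size, cfg.vision_config.patch_size)      # 1152 16
```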
configuration_siglip2.py ADDED
@@ -0,0 +1,178 @@
1
+ from transformers.configuration_utils import PretrainedConfig
2
+ from transformers.utils import logging
3
+
4
+
5
+ logger = logging.get_logger(__name__)
6
+
7
+
8
+ class Siglip2TextConfig(PretrainedConfig):
9
+ r"""
10
+ Args:
11
+ vocab_size (`int`, *optional*, defaults to 32000):
12
+ Vocabulary size of the Siglip2 text model. Defines the number of different tokens that can be represented by
13
+ the `inputs_ids` passed when calling [`Siglip2Model`].
14
+ hidden_size (`int`, *optional*, defaults to 768):
15
+ Dimensionality of the encoder layers and the pooler layer.
16
+ intermediate_size (`int`, *optional*, defaults to 3072):
17
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
18
+ num_hidden_layers (`int`, *optional*, defaults to 12):
19
+ Number of hidden layers in the Transformer encoder.
20
+ num_attention_heads (`int`, *optional*, defaults to 12):
21
+ Number of attention heads for each attention layer in the Transformer encoder.
22
+ max_position_embeddings (`int`, *optional*, defaults to 64):
23
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
24
+ just in case (e.g., 512 or 1024 or 2048).
25
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
26
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
27
+ `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
28
+ layer_norm_eps (`float`, *optional*, defaults to 1e-06):
29
+ The epsilon used by the layer normalization layers.
30
+ attention_dropout (`float`, *optional*, defaults to 0.0):
31
+ The dropout ratio for the attention probabilities.
32
+ pad_token_id (`int`, *optional*, defaults to 1):
33
+ The id of the padding token in the vocabulary.
34
+ bos_token_id (`int`, *optional*, defaults to 49406):
35
+ The id of the beginning-of-sequence token in the vocabulary.
36
+ eos_token_id (`int`, *optional*, defaults to 49407):
37
+ The id of the end-of-sequence token in the vocabulary.
38
+ projection_size (`int`, *optional*, defaults to `hidden_size`):
39
+ The size of the projection head.
40
+
41
+ """
42
+
43
+ model_type = "siglip2_text_model"
44
+ base_config_key = "text_config"
45
+
46
+ def __init__(
47
+ self,
48
+ vocab_size=32000,
49
+ hidden_size=768,
50
+ intermediate_size=3072,
51
+ num_hidden_layers=12,
52
+ num_attention_heads=12,
53
+ max_position_embeddings=64,
54
+ hidden_act="gelu_pytorch_tanh",
55
+ layer_norm_eps=1e-6,
56
+ attention_dropout=0.0,
57
+ pad_token_id=1,
58
+ bos_token_id=49406,
59
+ eos_token_id=49407,
60
+ projection_size=None,
61
+ **kwargs,
62
+ ):
63
+ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
64
+
65
+ self.vocab_size = vocab_size
66
+ self.hidden_size = hidden_size
67
+ self.intermediate_size = intermediate_size
68
+ self.num_hidden_layers = num_hidden_layers
69
+ self.num_attention_heads = num_attention_heads
70
+ self.max_position_embeddings = max_position_embeddings
71
+ self.layer_norm_eps = layer_norm_eps
72
+ self.hidden_act = hidden_act
73
+ self.attention_dropout = attention_dropout
74
+ self.projection_size = projection_size if projection_size is not None else hidden_size
75
+
76
+
77
+ class Siglip2VisionConfig(PretrainedConfig):
78
+ r"""
79
+ Args:
80
+ hidden_size (`int`, *optional*, defaults to 768):
81
+ Dimensionality of the encoder layers and the pooler layer.
82
+ intermediate_size (`int`, *optional*, defaults to 3072):
83
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
84
+ num_hidden_layers (`int`, *optional*, defaults to 12):
85
+ Number of hidden layers in the Transformer encoder.
86
+ num_attention_heads (`int`, *optional*, defaults to 12):
87
+ Number of attention heads for each attention layer in the Transformer encoder.
88
+ num_channels (`int`, *optional*, defaults to 3):
89
+ Number of channels in the input images.
90
+ num_patches (`int`, *optional*, defaults to 256):
91
+ The number of patches in the image with the size of (`patch_size`, `patch_size`).
92
+ The image is resized to fill maximum of this number of patches, and to preserve
93
+ the aspect ratio. In case the resulting number of patches is lower, the image is
94
+ padded in "patch" dimension.
95
+ patch_size (`int`, *optional*, defaults to 16):
96
+ The size (resolution) of each patch.
97
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
98
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
99
+ `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
100
+ layer_norm_eps (`float`, *optional*, defaults to 1e-06):
101
+ The epsilon used by the layer normalization layers.
102
+ attention_dropout (`float`, *optional*, defaults to 0.0):
103
+ The dropout ratio for the attention probabilities.
104
+
105
+ """
106
+
107
+ model_type = "siglip2_vision_model"
108
+ base_config_key = "vision_config"
109
+
110
+ def __init__(
111
+ self,
112
+ hidden_size=768,
113
+ out_hidden_size=2048,
114
+ intermediate_size=3072,
115
+ num_hidden_layers=12,
116
+ num_attention_heads=12,
117
+ num_channels=3,
118
+ num_patches=256,
119
+ patch_size=16,
120
+ hidden_act="gelu_pytorch_tanh",
121
+ layer_norm_eps=1e-6,
122
+ attention_dropout=0.0,
123
+ **kwargs,
124
+ ):
125
+ super().__init__(**kwargs)
126
+
127
+ self.hidden_size = hidden_size
128
+ self.out_hidden_size = out_hidden_size
129
+ self.intermediate_size = intermediate_size
130
+ self.num_hidden_layers = num_hidden_layers
131
+ self.num_attention_heads = num_attention_heads
132
+ self.num_channels = num_channels
133
+ self.patch_size = patch_size
134
+ self.attention_dropout = attention_dropout
135
+ self.layer_norm_eps = layer_norm_eps
136
+ self.hidden_act = hidden_act
137
+ self.num_patches = num_patches
138
+ self.in_features = -1
139
+
140
+
141
+ class Siglip2Config(PretrainedConfig):
142
+ r"""
143
+ Args:
144
+ text_config (`dict`, *optional*):
145
+ Dictionary of configuration options used to initialize [`Siglip2TextConfig`].
146
+ vision_config (`dict`, *optional*):
147
+ Dictionary of configuration options used to initialize [`Siglip2VisionConfig`].
148
+ kwargs (*optional*):
149
+ Dictionary of keyword arguments.
150
+
151
+ """
152
+
153
+ model_type = "siglip2"
154
+ sub_configs = {"text_config": Siglip2TextConfig, "vision_config": Siglip2VisionConfig}
155
+
156
+ def __init__(self, text_config=None, vision_config=None, **kwargs):
157
+ super().__init__(**kwargs)
158
+
159
+ if text_config is None:
160
+ text_config = {}
161
+ logger.info("`text_config` is `None`. Initializing the `Siglip2TextConfig` with default values.")
162
+
163
+ if vision_config is None:
164
+ vision_config = {}
165
+ logger.info("`vision_config` is `None`. initializing the `Siglip2VisionConfig` with default values.")
166
+
167
+ self.text_config = Siglip2TextConfig(**text_config)
168
+ self.vision_config = Siglip2VisionConfig(**vision_config)
169
+
170
+ self.initializer_factor = 1.0
171
+
172
+ @classmethod
173
+ def from_text_vision_configs(cls, text_config: Siglip2TextConfig, vision_config: Siglip2VisionConfig, **kwargs):
174
+
175
+ return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
176
+
177
+
178
+ __all__ = ["Siglip2Config", "Siglip2TextConfig", "Siglip2VisionConfig"]
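For reference, the vision tower of this checkpoint instantiates `Siglip2VisionConfig` with the values stored under `vision_config` in `config.json`; extra keys such as `window_size` and `fullatt_block_indexes` are carried through `**kwargs`. A minimal sketch (values copied from that file; the import assumes this repository's `configuration_siglip2.py` is on the Python path):

```python
# Hedged sketch: build the vision config with this checkpoint's values.
from configuration_siglip2 import Siglip2VisionConfig

vision_cfg = Siglip2VisionConfig(
    hidden_size=1152,
    out_hidden_size=2048,
    intermediate_size=4304,
    num_hidden_layers=27,
    num_attention_heads=16,
    num_patches=4096,
    patch_size=16,
    window_size=256,                        # stored as an extra attribute via **kwargs
    fullatt_block_indexes=[7, 15, 23, 26],  # stored as an extra attribute via **kwargs
)
print(vision_cfg.num_patches, vision_cfg.out_hidden_size)  # 4096 2048
```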
configuration_youtu_vl.py ADDED
@@ -0,0 +1,209 @@
1
+ from transformers.configuration_utils import PretrainedConfig
2
+ from transformers.modeling_rope_utils import rope_config_validation
3
+ from .configuration_siglip2 import Siglip2VisionConfig
4
+
5
+
6
+ YOUTU_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
7
+
8
+ class YoutuVLConfig(PretrainedConfig):
9
+ r"""
10
+ Args:
11
+ vocab_size (`int`, *optional*, defaults to 129280):
12
+ Vocabulary size of the YoutuVL model. Defines the number of different tokens that can be represented by the
13
+ `inputs_ids` passed when calling [`YoutuModel`]
14
+ hidden_size (`int`, *optional*, defaults to 7168):
15
+ Dimension of the hidden representations.
16
+ intermediate_size (`int`, *optional*, defaults to 18432):
17
+ Dimension of the MLP representations.
18
+ num_hidden_layers (`int`, *optional*, defaults to 61):
19
+ Number of hidden layers in the Transformer decoder.
20
+ num_attention_heads (`int`, *optional*, defaults to 128):
21
+ Number of attention heads for each attention layer in the Transformer decoder.
22
+ num_key_value_heads (`int`, *optional*, defaults to 128):
23
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
24
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
25
+ `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
26
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
27
+ by meanpooling all the original heads within that group. For more details checkout [this
28
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
29
+ `num_attention_heads`.
30
+ n_shared_experts (`int`, *optional*, defaults to 1):
31
+ Number of shared experts.
32
+ n_routed_experts (`int`, *optional*, defaults to 256):
33
+ Number of routed experts.
34
+ routed_scaling_factor (`float`, *optional*, defaults to 2.5):
35
+ Scaling factor for routed experts.
36
+ kv_lora_rank (`int`, *optional*, defaults to 512):
37
+ Rank of the LoRA matrices for key and value projections.
38
+ q_lora_rank (`int`, *optional*, defaults to 1536):
39
+ Rank of the LoRA matrices for query projections.
40
+ qk_rope_head_dim (`int`, *optional*, defaults to 64):
41
+ Dimension of the query/key heads that use rotary position embeddings.
42
+ v_head_dim (`int`, *optional*, defaults to 128):
43
+ Dimension of the value heads.
44
+ qk_nope_head_dim (`int`, *optional*, defaults to 128):
45
+ Dimension of the query/key heads that don't use rotary position embeddings.
46
+ n_group (`int`, *optional*, defaults to 8):
47
+ Number of groups for routed experts.
48
+ topk_group (`int`, *optional*, defaults to 4):
49
+ Number of selected groups for each token.
50
+ num_experts_per_tok (`int`, *optional*, defaults to 8):
51
+ Number of selected experts, None means dense model.
52
+ norm_topk_prob (`bool`, *optional*, defaults to `True`):
53
+ Whether to normalize the weights of the routed experts.
54
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
55
+ The non-linear activation function (function or string) in the decoder.
56
+ max_position_embeddings (`int`, *optional*, defaults to 4096):
57
+ The maximum sequence length that this model might ever be used with.
58
+ initializer_range (`float`, *optional*, defaults to 0.02):
59
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
60
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
61
+ The epsilon used by the rms normalization layers.
62
+ use_cache (`bool`, *optional*, defaults to `True`):
63
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
64
+ relevant if `config.is_decoder=True`.
65
+ pad_token_id (`int`, *optional*):
66
+ Padding token id.
67
+ bos_token_id (`int`, *optional*, defaults to 0):
68
+ Beginning of stream token id.
69
+ eos_token_id (`int`, *optional*, defaults to 1):
70
+ End of stream token id.
71
+ pretraining_tp (`int`, *optional*, defaults to 1):
72
+ Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
73
+ document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
74
+ necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
75
+ issue](https://github.com/pytorch/pytorch/issues/76232).
76
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
77
+ Whether to tie weight embeddings
78
+ rope_theta (`float`, *optional*, defaults to 10000.0):
79
+ The base period of the RoPE embeddings.
80
+ rope_scaling (`Dict`, *optional*):
81
+ Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
82
+ strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
83
+ `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
84
+ `max_position_embeddings` to the expected new maximum.
85
+ rope_interleave (`bool`, *optional*, defaults to `True`):
86
+ Whether to interleave the rotary position embeddings.
87
+ attention_bias (`bool`, *optional*, defaults to `False`):
88
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
89
+ attention_dropout (`float`, *optional*, defaults to 0.0):
90
+ The dropout ratio for the attention probabilities.
91
+
92
+ """
93
+
94
+ sub_configs = {"vision_config": Siglip2VisionConfig}
95
+ model_type = "youtu_vl"
96
+ keys_to_ignore_at_inference = ["past_key_values"]
97
+ base_model_pp_plan = {
98
+ "embed_tokens": (["input_ids"], ["inputs_embeds"]),
99
+ "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
100
+ "norm": (["hidden_states"], ["hidden_states"]),
101
+ }
102
+
103
+ def __init__(
104
+ self,
105
+ vocab_size=129280,
106
+ hidden_size=7168,
107
+ intermediate_size=18432,
108
+ num_hidden_layers=61,
109
+ num_attention_heads=128,
110
+ num_key_value_heads=128,
111
+ n_shared_experts=1,
112
+ n_routed_experts=256,
113
+ routed_scaling_factor=2.5,
114
+ kv_lora_rank=512,
115
+ q_lora_rank=1536,
116
+ qk_rope_head_dim=64,
117
+ v_head_dim=128,
118
+ qk_nope_head_dim=128,
119
+ n_group=8,
120
+ topk_group=4,
121
+ num_experts_per_tok=8,
122
+ norm_topk_prob=True,
123
+ hidden_act="silu",
124
+ max_position_embeddings=4096,
125
+ initializer_range=None,
126
+ embedding_initializer_range=None,
127
+ rms_norm_eps=1e-6,
128
+ use_cache=True,
129
+ pad_token_id=None,
130
+ bos_token_id=0,
131
+ eos_token_id=1,
132
+ pretraining_tp=1,
133
+ tie_word_embeddings=False,
134
+ rope_theta=10000.0,
135
+ rope_scaling=None,
136
+ rope_interleave=True,
137
+ attention_bias=False,
138
+ attention_dropout=0.0,
139
+ vision_config=None,
140
+ **kwargs,
141
+ ):
142
+ if isinstance(vision_config, dict):
143
+ self.vision_config = self.sub_configs["vision_config"](**vision_config)
144
+ elif vision_config is None:
145
+ self.vision_config = self.sub_configs["vision_config"]()
146
+
147
+ self.vocab_size = vocab_size
148
+ self.max_position_embeddings = max_position_embeddings
149
+ self.hidden_size = hidden_size
150
+ self.intermediate_size = intermediate_size
151
+ self.num_hidden_layers = num_hidden_layers
152
+ self.num_attention_heads = num_attention_heads
153
+ self.n_shared_experts = n_shared_experts
154
+ self.n_routed_experts = n_routed_experts
155
+ self.routed_scaling_factor = routed_scaling_factor
156
+ self.kv_lora_rank = kv_lora_rank
157
+ self.q_lora_rank = q_lora_rank
158
+ self.qk_rope_head_dim = qk_rope_head_dim
159
+ self.v_head_dim = v_head_dim
160
+ self.qk_nope_head_dim = qk_nope_head_dim
161
+ self.qk_head_dim = qk_nope_head_dim + qk_rope_head_dim
162
+ self.head_dim = qk_rope_head_dim
163
+ self.n_group = n_group
164
+ self.topk_group = topk_group
165
+ self.num_experts_per_tok = num_experts_per_tok
166
+ self.norm_topk_prob = norm_topk_prob
167
+ self.rope_interleave = rope_interleave
168
+ self.flash_att_sliding_window = None
169
+
170
+ self.mlp_bias = False
171
+ self.mtp_loss_weight = 0.3
172
+
173
+ if num_key_value_heads is None:
174
+ num_key_value_heads = num_attention_heads
175
+
176
+ self.num_key_value_heads = num_key_value_heads
177
+ self.hidden_act = hidden_act
178
+ self.initializer_range = (
179
+ (2.0 / (5.0 * self.hidden_size)) ** 0.5
180
+ if initializer_range is None
181
+ else initializer_range
182
+ )
183
+ self.embedding_initializer_range = (
184
+ self.initializer_range * 2.0
185
+ if embedding_initializer_range is None
186
+ else embedding_initializer_range
187
+ )
188
+ self.rms_norm_eps = rms_norm_eps
189
+ self.pretraining_tp = pretraining_tp
190
+ self.use_cache = use_cache
191
+ self.rope_theta = rope_theta
192
+ self.rope_scaling = rope_scaling
193
+ self.attention_bias = attention_bias
194
+ self.attention_dropout = attention_dropout
195
+ if self.rope_scaling is not None and "type" in self.rope_scaling:
196
+ self.rope_scaling["rope_type"] = self.rope_scaling["type"]
197
+ rope_config_validation(self)
198
+
199
+ super().__init__(
200
+ pad_token_id=pad_token_id,
201
+ bos_token_id=bos_token_id,
202
+ eos_token_id=eos_token_id,
203
+ tie_word_embeddings=tie_word_embeddings,
204
+ **kwargs,
205
+ )
206
+
207
+
208
+ __all__ = ["YoutuVLConfig"]
209
+
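The configuration above is mostly self-describing, but a small usage sketch may help when reading the defaults. This is illustrative only: it assumes the file is importable directly as `configuration_youtu_vl` (in practice `AutoConfig.from_pretrained(..., trust_remote_code=True)` would be the usual entry point), and the reduced sizes are placeholders rather than the values shipped in this repository's `config.json`.

```python
# Illustrative sketch; the module path and the toy sizes below are assumptions,
# not the values stored in this repository's config.json.
from configuration_youtu_vl import YoutuVLConfig

config = YoutuVLConfig(
    hidden_size=1024,
    num_hidden_layers=4,
    # rope_scaling follows the documented {"type": ..., "factor": ...} format.
    rope_scaling={"type": "linear", "factor": 2.0},
)

print(config.qk_head_dim)        # 192 = qk_nope_head_dim (128) + qk_rope_head_dim (64)
print(config.initializer_range)  # (2 / (5 * hidden_size)) ** 0.5 when not passed explicitly
print(config.rope_scaling)       # "type" is mirrored into "rope_type" before validation
```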
generation_config.json ADDED
@@ -0,0 +1,7 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 128000,
4
+ "eos_token_id": 128001,
5
+ "pad_token_id": 128001,
6
+ "transformers_version": "4.56.0"
7
+ }
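These generation defaults are what `generate()` picks up for this checkpoint. A minimal sketch of inspecting them, assuming the repository has been downloaded locally (the path below is a placeholder):

```python
# Sketch only; replace the placeholder path with the actual checkpoint location or Hub id.
from transformers import GenerationConfig

gen_config = GenerationConfig.from_pretrained("path/to/this-checkpoint")
print(gen_config.bos_token_id)  # 128000
print(gen_config.eos_token_id)  # 128001
print(gen_config.pad_token_id)  # 128001 -- padding reuses the end-of-stream id
```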
image_processing_siglip2_fast.py ADDED
@@ -0,0 +1,328 @@
1
+ from typing import List, Optional, Tuple, Union
2
+ import os
3
+ import torch
4
+ import math
5
+ from torchvision.transforms import functional as F
6
+ from transformers.image_processing_utils import BatchFeature
7
+ from transformers.image_processing_utils_fast import (
8
+ BaseImageProcessorFast,
9
+ DefaultFastImageProcessorKwargs,
10
+ SizeDict,
11
+ )
12
+ from transformers.image_utils import (
13
+ ImageInput,
14
+ PILImageResampling,
15
+ )
16
+ from transformers.processing_utils import Unpack
17
+ from transformers.utils import (
18
+ TensorType,
19
+ add_start_docstrings,
20
+ is_torch_available,
21
+ is_torchvision_available,
22
+ is_torchvision_v2_available,
23
+ logging,
24
+ )
25
+
26
+ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING = r"""
27
+
28
+ Args:
29
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
30
+ Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
31
+ `do_resize` parameter in the `preprocess` method.
32
+ size (`dict`, *optional*, defaults to `self.size`):
33
+ Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
34
+ method.
35
+ default_to_square (`bool`, *optional*, defaults to `self.default_to_square`):
36
+ Whether to default to a square image when resizing, if size is an int.
37
+ resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
38
+ Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
39
+ overridden by the `resample` parameter in the `preprocess` method.
40
+ do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
41
+ Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
42
+ `preprocess` method.
43
+ crop_size (`Dict[str, int]` *optional*, defaults to `self.crop_size`):
44
+ Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
45
+ method.
46
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
47
+ Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
48
+ `do_rescale` parameter in the `preprocess` method.
49
+ rescale_factor (`int` or `float`, *optional*, defaults to `self.rescale_factor`):
50
+ Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
51
+ overridden by the `rescale_factor` parameter in the `preprocess` method.
52
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
53
+ Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
54
+ method. Can be overridden by the `do_normalize` parameter in the `preprocess` method.
55
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
56
+ Mean to use if normalizing the image. This is a float or list of floats the length of the number of
57
+ channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
59
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
60
+ Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
61
+ number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
63
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
64
+ Whether to convert the image to RGB.
65
+ return_tensors (`str` or `TensorType`, *optional*, defaults to `self.return_tensors`):
66
+ Returns stacked tensors if set to `pt`, otherwise returns a list of tensors.
67
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `self.data_format`):
68
+ Only `ChannelDimension.FIRST` is supported. Added for compatibility with slow processors.
69
+ input_data_format (`ChannelDimension` or `str`, *optional*, defaults to `self.input_data_format`):
70
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
71
+ from the input image. Can be one of:
72
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
73
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
74
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
75
+ device (`torch.device`, *optional*, defaults to `self.device`):
76
+ The device to process the images on. If unset, the device is inferred from the input images."""
77
+
78
+ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS = r"""
79
+ Preprocess an image or batch of images.
80
+
81
+ Args:
82
+ images (`ImageInput`):
83
+ Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
84
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
85
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
86
+ Whether to resize the image.
87
+ size (`Dict[str, int]`, *optional*, defaults to `self.size`):
88
+ Describes the maximum input dimensions to the model.
89
+ resample (`PILImageResampling` or `InterpolationMode`, *optional*, defaults to `self.resample`):
90
+ Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
91
+ has an effect if `do_resize` is set to `True`.
92
+ do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
93
+ Whether to center crop the image.
94
+ crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`):
95
+ Size of the output image after applying `center_crop`.
96
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
97
+ Whether to rescale the image.
98
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
99
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
100
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
101
+ Whether to normalize the image.
102
+ image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
103
+ Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
104
+ image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
105
+ Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
106
+ `True`.
107
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
108
+ Whether to convert the image to RGB.
109
+ return_tensors (`str` or `TensorType`, *optional*, defaults to `self.return_tensors`):
110
+ Returns stacked tensors if set to `pt`, otherwise returns a list of tensors.
111
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `self.data_format`):
112
+ Only `ChannelDimension.FIRST` is supported. Added for compatibility with slow processors.
113
+ input_data_format (`ChannelDimension` or `str`, *optional*, defaults to `self.input_data_format`):
114
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
115
+ from the input image. Can be one of:
116
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
117
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
118
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
119
+ device (`torch.device`, *optional*, defaults to `self.device`):
120
+ The device to process the images on. If unset, the device is inferred from the input images."""
121
+
122
+
123
+ if is_torch_available():
124
+ import torch
125
+
126
+ if is_torchvision_available():
127
+ if is_torchvision_v2_available():
128
+ from torchvision.transforms.v2 import functional as F
129
+ else:
130
+ from torchvision.transforms import functional as F
131
+
132
+
133
+ logger = logging.get_logger(__name__)
134
+
135
+
136
+ def get_image_size_for_patches(
137
+ image_height: int, image_width: int, patch_size: int, max_num_patches: int
138
+ ) -> Tuple[int, int]:
139
+ """
140
+ Args:
141
+ image_height (`int`):
142
+ Original image height.
143
+ image_width (`int`):
144
+ Original image width.
145
+ patch_size (`int`):
146
+ Patch size for processing.
+ max_num_patches (`int`):
+ Maximum number of patches allowed; the image is scaled down until its patch grid fits this budget.
147
+
148
+ Returns:
149
+ Tuple: (target_height, target_width)
150
+ """
151
+
152
+ def get_scaled_image_size(scale: float, size: int, patch_size: int) -> int:
153
+ patch_size = patch_size * 2
154
+ scaled_size = size * scale
155
+ scaled_size = math.ceil(scaled_size / patch_size) * patch_size
156
+ scaled_size = max(patch_size, scaled_size)
157
+ return int(scaled_size)
158
+
159
+ scale = 1.0
160
+ while True:
161
+ target_height = get_scaled_image_size(scale, image_height, patch_size)
162
+ target_width = get_scaled_image_size(scale, image_width, patch_size)
163
+ num_patches = (target_height / patch_size) * (target_width / patch_size)
164
+
165
+ if num_patches > max_num_patches:
166
+ scale -= 0.02
167
+ else:
168
+ break
169
+
170
+ return target_height, target_width
171
+
172
+
173
+ def convert_image_to_patches(image: "torch.Tensor", patch_size: int, merge_size: int) -> "torch.Tensor":
174
+ """
175
+ Converts an input image into flattened patches.
176
+
177
+ Args:
178
+ image: Input image tensor of shape (channels, height, width)
179
+ patch_size: Size of each square patch (in pixels)
180
+ merge_size: Number of adjacent patches to merge
181
+
+ Returns:
+ Flattened patches of shape `(num_patches, channels * patch_size * patch_size)`, with each
+ `merge_size x merge_size` block of neighboring patches kept contiguous.
+ """
183
+
184
+ num_channels, image_height, image_width = image.shape
185
+ num_patches_height = image_height // patch_size
186
+ num_patches_width = image_width // patch_size
187
+ patched_image = image.reshape(num_channels,
188
+ num_patches_height//merge_size,
189
+ merge_size, patch_size,
190
+ num_patches_width//merge_size,
191
+ merge_size, patch_size)
192
+ patched_image = patched_image.permute(1, 4, 2, 5, 3, 6, 0)
193
+ patched_image = patched_image.reshape(num_patches_height * num_patches_width, -1)
194
+ return patched_image
195
+
196
+ def pad_along_first_dim(
197
+ tensor: "torch.Tensor", target_length: int, pad_value: int = 0
198
+ ) -> Tuple["torch.Tensor", "torch.Tensor"]:
199
+ """
200
+ Pad the input tensor along its first dimension to a target length.
201
+
202
+ Args:
203
+ tensor (torch.Tensor): The input tensor to be padded.
204
+ target_length (int): The desired length of the first dimension after padding.
205
+ pad_value (int, optional): The value to use for padding. Defaults to 0.
206
+ """
207
+ current_length = tensor.shape[0]
208
+ padding_length = target_length - current_length
209
+ mask = torch.ones((target_length,), dtype=torch.int32)
210
+ if padding_length > 0:
211
+ padding = [0, 0] * (tensor.ndim - 1) + [0, padding_length]
212
+ tensor = torch.nn.functional.pad(tensor, padding, mode="constant", value=pad_value)
213
+ mask[-padding_length:] = 0
214
+ return tensor, mask
215
+
216
+
217
+ class Siglip2FastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
218
+ patch_size: Optional[int]
219
+ max_num_patches: Optional[int]
220
+
221
+
222
+ @add_start_docstrings(
223
+ r"Constructs a fast Siglip2 image processor.",
224
+ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
225
+ """
226
+ patch_size (`int`, *optional*, defaults to 16):
227
+ The size (resolution) of each patch the image will be split into.
228
+ max_num_patches (`int`, *optional*, defaults to 256):
229
+ The image will be resized to have at most this number of patches,
230
+ and then padded in "patch" dimension to match this number exactly.
231
+ """,
232
+ )
233
+ class Siglip2ImageProcessorFast(BaseImageProcessorFast):
234
+ resample = PILImageResampling.BILINEAR
235
+ image_mean = [0.5, 0.5, 0.5]
236
+ image_std = [0.5, 0.5, 0.5]
237
+ do_resize = True
238
+ do_rescale = True
239
+ do_normalize = True
240
+ patch_size = 16
241
+ max_num_patches = 256
242
+ valid_kwargs = Siglip2FastImageProcessorKwargs
243
+ unused_kwargs = ["size", "do_center_crop", "crop_size"]
244
+ print_max_patched = True
245
+
246
+ def __init__(self, **kwargs: Unpack[Siglip2FastImageProcessorKwargs]):
247
+ super().__init__(**kwargs)
248
+
249
+ def _validate_preprocess_kwargs(self, **kwargs) -> tuple:
250
+ kwargs.pop("do_resize", None)
251
+ return super()._validate_preprocess_kwargs(**kwargs)
252
+
253
+ @add_start_docstrings(
254
+ BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS,
255
+ """
256
+ patch_size (`int`, *optional*, defaults to `self.patch_size`):
257
+ The size (resolution) of each patch the image will be split into.
258
+ max_num_patches (`int`, *optional*, defaults to `self.max_num_patches`):
259
+ The image will be resized to have at most this number of patches,
260
+ and then padded in "patch" dimension to match this number exactly.
261
+ """,
262
+ )
263
+ def preprocess(self, images: ImageInput, **kwargs: Unpack[Siglip2FastImageProcessorKwargs]) -> BatchFeature:
264
+ return super().preprocess(images, **kwargs)
265
+
266
+ def get_max_image_patches(self, images):
267
+ return 4096 * 6 * 6
268
+
269
+ def _preprocess(
270
+ self,
271
+ images: List["torch.Tensor"],
272
+ do_resize: bool,
273
+ patch_size: int,
274
+ max_num_patches: int,
275
+ interpolation: Optional["F.InterpolationMode"],
276
+ do_rescale: bool,
277
+ rescale_factor: float,
278
+ do_normalize: bool,
279
+ image_mean: Optional[Union[float, List[float]]],
280
+ image_std: Optional[Union[float, List[float]]],
281
+ return_tensors: Optional[Union[str, TensorType]],
282
+ **kwargs,
283
+ ) -> BatchFeature:
284
+ pixel_masks = []
285
+ pixel_values = []
286
+ spatial_shapes = []
287
+
288
+ if Siglip2ImageProcessorFast.print_max_patched:
289
+ Siglip2ImageProcessorFast.print_max_patched = False
290
+
291
+ for i, image in enumerate(images):
292
+ height, width = get_image_size_for_patches(
293
+ image_height=image.shape[1],
294
+ image_width=image.shape[2],
295
+ patch_size=patch_size,
296
+ max_num_patches=max_num_patches,
297
+ )
298
+
299
+ side_dict = SizeDict(height=height, width=width)
300
+ image = self.resize(image=image, size=side_dict, interpolation=interpolation)
301
+ image = self.rescale_and_normalize(image, do_rescale, rescale_factor, do_normalize, image_mean, image_std)
302
+
303
+ patches = convert_image_to_patches(image, patch_size, 2)
304
+ patches, mask = pad_along_first_dim(patches, len(patches))  # target equals current length, so no padding is added and the mask is all ones
305
+
306
+ num_patches_height = image.shape[1] // patch_size
307
+ num_patches_width = image.shape[2] // patch_size
308
+
309
+ spatial_shapes.append((num_patches_height, num_patches_width))
310
+ pixel_values.append(patches)
311
+ pixel_masks.append(mask)
312
+
313
+ pixel_values = torch.stack(pixel_values, dim=0)
314
+ pixel_masks = torch.stack(pixel_masks, dim=0)
315
+ spatial_shapes = torch.tensor(spatial_shapes)
316
+
317
+ batch_feature = BatchFeature(
318
+ data={
319
+ "pixel_values": pixel_values,
320
+ "pixel_attention_mask": pixel_masks,
321
+ "spatial_shapes": spatial_shapes,
322
+ },
323
+ tensor_type=return_tensors,
324
+ )
325
+ return batch_feature
326
+
327
+
328
+ __all__ = ["Siglip2ImageProcessorFast"]
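A short sketch of the patching pipeline implemented above, useful for sanity-checking shapes. It assumes the file is importable as `image_processing_siglip2_fast` and that torch/torchvision are installed; the image size and patch budget are arbitrary example values.

```python
# Illustrative sketch; the module path and the example numbers are assumptions.
import torch
from image_processing_siglip2_fast import (
    convert_image_to_patches,
    get_image_size_for_patches,
)

# Both sides are rounded up to multiples of 2 * patch_size = 32, then scaled down
# until the 16x16 patch grid fits within the max_num_patches budget.
h, w = get_image_size_for_patches(
    image_height=900, image_width=1200, patch_size=16, max_num_patches=1024
)
assert h % 32 == 0 and w % 32 == 0
assert (h // 16) * (w // 16) <= 1024

# Each 16x16 patch flattens to 16 * 16 * 3 = 768 values; merge_size=2 keeps every
# 2x2 block of neighboring patches contiguous in the output ordering.
image = torch.rand(3, h, w)
patches = convert_image_to_patches(image, patch_size=16, merge_size=2)
assert patches.shape == ((h // 16) * (w // 16), 768)
```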
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2f63df3c26a31537118c524d375566cd022b3f5c2ec7e14aa096f7cd5d2bfc2
3
+ size 4981728088
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7620ba07a90acb406e36436ac513f691c675a2ad2d459bbb5fae053b0a507c3
3
+ size 50344496
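The two shard files above are Git LFS pointers to the actual safetensors payloads; the index added next maps every parameter name to its shard. A minimal sketch of resolving a single tensor through that index, assuming the shards and the index file sit in the current directory:

```python
# Sketch only; assumes the shard files and model.safetensors.index.json are local.
import json

from safetensors import safe_open

with open("model.safetensors.index.json") as f:
    index = json.load(f)

name = "lm_head.weight"
shard = index["weight_map"][name]        # e.g. "model-00002-of-00002.safetensors"
with safe_open(shard, framework="pt") as handle:
    tensor = handle.get_tensor(name)
print(name, tuple(tensor.shape))
```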
model.safetensors.index.json ADDED
@@ -0,0 +1,835 @@
1
+ {
2
+ "metadata": {
3
+ "total_size": 5779947232
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00002-of-00002.safetensors",
7
+ "merger.ln_q.weight": "model-00001-of-00002.safetensors",
8
+ "merger.mlp.0.bias": "model-00001-of-00002.safetensors",
9
+ "merger.mlp.0.weight": "model-00001-of-00002.safetensors",
10
+ "merger.mlp.2.bias": "model-00001-of-00002.safetensors",
11
+ "merger.mlp.2.weight": "model-00001-of-00002.safetensors",
12
+ "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
13
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
14
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
15
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
16
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
17
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
18
+ "model.layers.0.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
19
+ "model.layers.0.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
20
+ "model.layers.0.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
21
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
22
+ "model.layers.0.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
23
+ "model.layers.0.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
24
+ "model.layers.0.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
25
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
26
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
27
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
28
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
29
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
30
+ "model.layers.1.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
31
+ "model.layers.1.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
32
+ "model.layers.1.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
33
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
34
+ "model.layers.1.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
35
+ "model.layers.1.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
36
+ "model.layers.1.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
37
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
38
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
39
+ "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
40
+ "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
41
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
42
+ "model.layers.10.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
43
+ "model.layers.10.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
44
+ "model.layers.10.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
45
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
46
+ "model.layers.10.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
47
+ "model.layers.10.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
48
+ "model.layers.10.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
49
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
50
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
51
+ "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
52
+ "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
53
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
54
+ "model.layers.11.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
55
+ "model.layers.11.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
56
+ "model.layers.11.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
57
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
58
+ "model.layers.11.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
59
+ "model.layers.11.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
60
+ "model.layers.11.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
61
+ "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
62
+ "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
63
+ "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
64
+ "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
65
+ "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
66
+ "model.layers.12.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
67
+ "model.layers.12.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
68
+ "model.layers.12.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
69
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
70
+ "model.layers.12.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
71
+ "model.layers.12.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
72
+ "model.layers.12.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
73
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
74
+ "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
75
+ "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
76
+ "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
77
+ "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
78
+ "model.layers.13.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
79
+ "model.layers.13.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
80
+ "model.layers.13.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
81
+ "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
82
+ "model.layers.13.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
83
+ "model.layers.13.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
84
+ "model.layers.13.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
85
+ "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
86
+ "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
87
+ "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
88
+ "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
89
+ "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
90
+ "model.layers.14.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
91
+ "model.layers.14.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
92
+ "model.layers.14.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
93
+ "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
94
+ "model.layers.14.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
95
+ "model.layers.14.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
96
+ "model.layers.14.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
97
+ "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
98
+ "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
99
+ "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
100
+ "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
101
+ "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
102
+ "model.layers.15.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
103
+ "model.layers.15.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
104
+ "model.layers.15.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
105
+ "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
106
+ "model.layers.15.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
107
+ "model.layers.15.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
108
+ "model.layers.15.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
109
+ "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
110
+ "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
111
+ "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
112
+ "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
113
+ "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
114
+ "model.layers.16.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
115
+ "model.layers.16.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
116
+ "model.layers.16.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
117
+ "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
118
+ "model.layers.16.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
119
+ "model.layers.16.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
120
+ "model.layers.16.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
121
+ "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
122
+ "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
123
+ "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
124
+ "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
125
+ "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
126
+ "model.layers.17.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
127
+ "model.layers.17.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
128
+ "model.layers.17.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
129
+ "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
130
+ "model.layers.17.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
131
+ "model.layers.17.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
132
+ "model.layers.17.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
133
+ "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
134
+ "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
135
+ "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
136
+ "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
137
+ "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
138
+ "model.layers.18.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
139
+ "model.layers.18.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
140
+ "model.layers.18.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
141
+ "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
142
+ "model.layers.18.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
143
+ "model.layers.18.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
144
+ "model.layers.18.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
145
+ "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
146
+ "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
147
+ "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
148
+ "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
149
+ "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
150
+ "model.layers.19.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
151
+ "model.layers.19.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
152
+ "model.layers.19.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
153
+ "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
154
+ "model.layers.19.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
155
+ "model.layers.19.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
156
+ "model.layers.19.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
157
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
158
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
159
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
160
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
161
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
162
+ "model.layers.2.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
163
+ "model.layers.2.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
164
+ "model.layers.2.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
165
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
166
+ "model.layers.2.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
167
+ "model.layers.2.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
168
+ "model.layers.2.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
169
+ "model.layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
170
+ "model.layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
171
+ "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
172
+ "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
173
+ "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
174
+ "model.layers.20.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
175
+ "model.layers.20.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
176
+ "model.layers.20.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
177
+ "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
178
+ "model.layers.20.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
179
+ "model.layers.20.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
180
+ "model.layers.20.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
181
+ "model.layers.21.input_layernorm.weight": "model-00001-of-00002.safetensors",
182
+ "model.layers.21.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
183
+ "model.layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
184
+ "model.layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
185
+ "model.layers.21.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
186
+ "model.layers.21.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
187
+ "model.layers.21.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
188
+ "model.layers.21.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
189
+ "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
190
+ "model.layers.21.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
191
+ "model.layers.21.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
192
+ "model.layers.21.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
193
+ "model.layers.22.input_layernorm.weight": "model-00001-of-00002.safetensors",
194
+ "model.layers.22.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
195
+ "model.layers.22.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
196
+ "model.layers.22.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
197
+ "model.layers.22.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
198
+ "model.layers.22.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
199
+ "model.layers.22.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
200
+ "model.layers.22.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
201
+ "model.layers.22.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
202
+ "model.layers.22.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
203
+ "model.layers.22.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
204
+ "model.layers.22.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
205
+ "model.layers.23.input_layernorm.weight": "model-00001-of-00002.safetensors",
206
+ "model.layers.23.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
207
+ "model.layers.23.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
208
+ "model.layers.23.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
209
+ "model.layers.23.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
210
+ "model.layers.23.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
211
+ "model.layers.23.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
212
+ "model.layers.23.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
213
+ "model.layers.23.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
214
+ "model.layers.23.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
215
+ "model.layers.23.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
216
+ "model.layers.23.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
217
+ "model.layers.24.input_layernorm.weight": "model-00001-of-00002.safetensors",
218
+ "model.layers.24.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
219
+ "model.layers.24.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
220
+ "model.layers.24.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
221
+ "model.layers.24.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
222
+ "model.layers.24.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
223
+ "model.layers.24.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
224
+ "model.layers.24.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
225
+ "model.layers.24.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
226
+ "model.layers.24.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
227
+ "model.layers.24.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
228
+ "model.layers.24.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
229
+ "model.layers.25.input_layernorm.weight": "model-00001-of-00002.safetensors",
230
+ "model.layers.25.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
231
+ "model.layers.25.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
232
+ "model.layers.25.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
233
+ "model.layers.25.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
234
+ "model.layers.25.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
235
+ "model.layers.25.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
236
+ "model.layers.25.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
237
+ "model.layers.25.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
238
+ "model.layers.25.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
239
+ "model.layers.25.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
240
+ "model.layers.25.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
241
+ "model.layers.26.input_layernorm.weight": "model-00001-of-00002.safetensors",
242
+ "model.layers.26.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
243
+ "model.layers.26.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
244
+ "model.layers.26.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
245
+ "model.layers.26.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
246
+ "model.layers.26.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
247
+ "model.layers.26.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
248
+ "model.layers.26.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
249
+ "model.layers.26.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
250
+ "model.layers.26.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
251
+ "model.layers.26.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
252
+ "model.layers.26.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
253
+ "model.layers.27.input_layernorm.weight": "model-00001-of-00002.safetensors",
254
+ "model.layers.27.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
255
+ "model.layers.27.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
256
+ "model.layers.27.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
257
+ "model.layers.27.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
258
+ "model.layers.27.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
259
+ "model.layers.27.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
260
+ "model.layers.27.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
261
+ "model.layers.27.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
262
+ "model.layers.27.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
263
+ "model.layers.27.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
264
+ "model.layers.27.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
265
+ "model.layers.28.input_layernorm.weight": "model-00001-of-00002.safetensors",
266
+ "model.layers.28.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
267
+ "model.layers.28.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
268
+ "model.layers.28.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
269
+ "model.layers.28.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
270
+ "model.layers.28.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
271
+ "model.layers.28.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
272
+ "model.layers.28.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
273
+ "model.layers.28.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
274
+ "model.layers.28.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
275
+ "model.layers.28.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
276
+ "model.layers.28.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
277
+ "model.layers.29.input_layernorm.weight": "model-00001-of-00002.safetensors",
278
+ "model.layers.29.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
279
+ "model.layers.29.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
280
+ "model.layers.29.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
281
+ "model.layers.29.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
282
+ "model.layers.29.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
283
+ "model.layers.29.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
284
+ "model.layers.29.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
285
+ "model.layers.29.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
286
+ "model.layers.29.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
287
+ "model.layers.29.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
288
+ "model.layers.29.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
289
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
290
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
291
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
292
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
293
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
294
+ "model.layers.3.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
295
+ "model.layers.3.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
296
+ "model.layers.3.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
297
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
298
+ "model.layers.3.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
299
+ "model.layers.3.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
300
+ "model.layers.3.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
301
+ "model.layers.30.input_layernorm.weight": "model-00001-of-00002.safetensors",
302
+ "model.layers.30.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
303
+ "model.layers.30.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
304
+ "model.layers.30.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
305
+ "model.layers.30.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
306
+ "model.layers.30.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
307
+ "model.layers.30.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
308
+ "model.layers.30.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
309
+ "model.layers.30.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
310
+ "model.layers.30.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
311
+ "model.layers.30.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
312
+ "model.layers.30.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
313
+ "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
314
+ "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
315
+ "model.layers.31.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
316
+ "model.layers.31.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
317
+ "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
318
+ "model.layers.31.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
319
+ "model.layers.31.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
320
+ "model.layers.31.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
321
+ "model.layers.31.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
322
+ "model.layers.31.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
323
+ "model.layers.31.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
324
+ "model.layers.31.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
325
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
326
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
327
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
328
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
329
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
330
+ "model.layers.4.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
331
+ "model.layers.4.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
332
+ "model.layers.4.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
333
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
334
+ "model.layers.4.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
335
+ "model.layers.4.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
336
+ "model.layers.4.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
337
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
338
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
339
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
340
+ "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
341
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
342
+ "model.layers.5.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
343
+ "model.layers.5.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
344
+ "model.layers.5.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
345
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
346
+ "model.layers.5.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
347
+ "model.layers.5.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
348
+ "model.layers.5.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
349
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
350
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
351
+ "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
352
+ "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
353
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
354
+ "model.layers.6.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
355
+ "model.layers.6.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
356
+ "model.layers.6.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
357
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
358
+ "model.layers.6.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
359
+ "model.layers.6.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
360
+ "model.layers.6.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
361
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
362
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
363
+ "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
364
+ "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
365
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
366
+ "model.layers.7.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
367
+ "model.layers.7.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
368
+ "model.layers.7.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
369
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
370
+ "model.layers.7.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
371
+ "model.layers.7.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
372
+ "model.layers.7.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
373
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
374
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
375
+ "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
376
+ "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
377
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
378
+ "model.layers.8.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
379
+ "model.layers.8.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
380
+ "model.layers.8.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
381
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
382
+ "model.layers.8.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
383
+ "model.layers.8.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
384
+ "model.layers.8.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
385
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
386
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
387
+ "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
388
+ "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
389
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
390
+ "model.layers.9.self_attn.kv_a_layernorm.weight": "model-00001-of-00002.safetensors",
391
+ "model.layers.9.self_attn.kv_a_proj_with_mqa.weight": "model-00001-of-00002.safetensors",
392
+ "model.layers.9.self_attn.kv_b_proj.weight": "model-00001-of-00002.safetensors",
393
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
394
+ "model.layers.9.self_attn.q_a_layernorm.weight": "model-00001-of-00002.safetensors",
395
+ "model.layers.9.self_attn.q_a_proj.weight": "model-00001-of-00002.safetensors",
396
+ "model.layers.9.self_attn.q_b_proj.weight": "model-00001-of-00002.safetensors",
397
+ "model.norm.weight": "model-00002-of-00002.safetensors",
398
+ "siglip2.vision_model.embeddings.patch_embedding.bias": "model-00001-of-00002.safetensors",
399
+ "siglip2.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00002.safetensors",
400
+ "siglip2.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00002.safetensors",
401
+ "siglip2.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00002.safetensors",
402
+ "siglip2.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00002.safetensors",
403
+ "siglip2.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00002.safetensors",
404
+ "siglip2.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00002.safetensors",
405
+ "siglip2.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00002.safetensors",
406
+ "siglip2.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00002.safetensors",
407
+ "siglip2.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00002.safetensors",
408
+ "siglip2.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
409
+ "siglip2.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
410
+ "siglip2.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
411
+ "siglip2.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
412
+ "siglip2.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
413
+ "siglip2.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
414
+ "siglip2.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
415
+ "siglip2.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
416
+ "siglip2.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00002.safetensors",
417
+ "siglip2.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00002.safetensors",
418
+ "siglip2.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00002.safetensors",
419
+ "siglip2.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00002.safetensors",
420
+ "siglip2.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00002.safetensors",
421
+ "siglip2.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00002.safetensors",
422
+ "siglip2.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00002.safetensors",
423
+ "siglip2.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00002.safetensors",
424
+ "siglip2.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
425
+ "siglip2.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
426
+ "siglip2.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
427
+ "siglip2.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
428
+ "siglip2.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
429
+ "siglip2.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
430
+ "siglip2.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
431
+ "siglip2.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
432
+ "siglip2.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00002.safetensors",
433
+ "siglip2.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00002.safetensors",
434
+ "siglip2.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00002.safetensors",
435
+ "siglip2.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00002.safetensors",
436
+ "siglip2.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00002.safetensors",
437
+ "siglip2.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00002.safetensors",
438
+ "siglip2.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00002.safetensors",
439
+ "siglip2.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00002.safetensors",
440
+ "siglip2.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
441
+ "siglip2.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
442
+ "siglip2.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
443
+ "siglip2.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
444
+ "siglip2.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
445
+ "siglip2.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
446
+ "siglip2.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
447
+ "siglip2.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
448
+ "siglip2.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00002.safetensors",
449
+ "siglip2.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00002.safetensors",
450
+ "siglip2.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00002.safetensors",
451
+ "siglip2.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00002.safetensors",
452
+ "siglip2.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00002.safetensors",
453
+ "siglip2.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00002.safetensors",
454
+ "siglip2.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00002.safetensors",
455
+ "siglip2.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00002.safetensors",
456
+ "siglip2.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
457
+ "siglip2.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
458
+ "siglip2.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
459
+ "siglip2.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
460
+ "siglip2.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
461
+ "siglip2.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
462
+ "siglip2.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
463
+ "siglip2.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
464
+ "siglip2.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00002.safetensors",
465
+ "siglip2.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00002.safetensors",
466
+ "siglip2.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00002.safetensors",
467
+ "siglip2.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00002.safetensors",
468
+ "siglip2.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00002.safetensors",
469
+ "siglip2.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00002.safetensors",
470
+ "siglip2.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00002.safetensors",
471
+ "siglip2.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00002.safetensors",
472
+ "siglip2.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
473
+ "siglip2.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
474
+ "siglip2.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
475
+ "siglip2.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
476
+ "siglip2.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
477
+ "siglip2.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
478
+ "siglip2.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
479
+ "siglip2.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
480
+ "siglip2.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00002.safetensors",
481
+ "siglip2.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00002.safetensors",
482
+ "siglip2.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00002.safetensors",
483
+ "siglip2.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00002.safetensors",
484
+ "siglip2.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00002.safetensors",
485
+ "siglip2.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00002.safetensors",
486
+ "siglip2.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00002.safetensors",
487
+ "siglip2.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00002.safetensors",
488
+ "siglip2.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
489
+ "siglip2.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
490
+ "siglip2.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
491
+ "siglip2.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
492
+ "siglip2.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
493
+ "siglip2.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
494
+ "siglip2.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
495
+ "siglip2.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
496
+ "siglip2.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00002.safetensors",
497
+ "siglip2.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00002.safetensors",
498
+ "siglip2.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00002.safetensors",
499
+ "siglip2.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00002.safetensors",
500
+ "siglip2.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00002.safetensors",
501
+ "siglip2.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00002.safetensors",
502
+ "siglip2.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00002.safetensors",
503
+ "siglip2.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00002.safetensors",
504
+ "siglip2.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
505
+ "siglip2.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
506
+ "siglip2.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
507
+ "siglip2.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
508
+ "siglip2.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
509
+ "siglip2.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
510
+ "siglip2.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
511
+ "siglip2.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
512
+ "siglip2.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00002.safetensors",
513
+ "siglip2.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00002.safetensors",
514
+ "siglip2.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00002.safetensors",
515
+ "siglip2.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00002.safetensors",
516
+ "siglip2.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00002.safetensors",
517
+ "siglip2.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00002.safetensors",
518
+ "siglip2.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00002.safetensors",
519
+ "siglip2.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00002.safetensors",
520
+ "siglip2.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
521
+ "siglip2.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
522
+ "siglip2.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
523
+ "siglip2.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
524
+ "siglip2.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
525
+ "siglip2.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
526
+ "siglip2.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
527
+ "siglip2.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
528
+ "siglip2.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00002.safetensors",
529
+ "siglip2.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00002.safetensors",
530
+ "siglip2.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00002.safetensors",
531
+ "siglip2.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00002.safetensors",
532
+ "siglip2.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00002.safetensors",
533
+ "siglip2.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00002.safetensors",
534
+ "siglip2.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00002.safetensors",
535
+ "siglip2.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00002.safetensors",
536
+ "siglip2.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
537
+ "siglip2.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
538
+ "siglip2.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
539
+ "siglip2.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
540
+ "siglip2.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
541
+ "siglip2.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
542
+ "siglip2.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
543
+ "siglip2.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
544
+ "siglip2.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00002.safetensors",
545
+ "siglip2.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00002.safetensors",
546
+ "siglip2.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00002.safetensors",
547
+ "siglip2.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00002.safetensors",
548
+ "siglip2.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00002.safetensors",
549
+ "siglip2.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00002.safetensors",
550
+ "siglip2.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00002.safetensors",
551
+ "siglip2.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00002.safetensors",
552
+ "siglip2.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
553
+ "siglip2.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
554
+ "siglip2.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
555
+ "siglip2.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
556
+ "siglip2.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
557
+ "siglip2.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
558
+ "siglip2.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
559
+ "siglip2.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
560
+ "siglip2.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00002.safetensors",
561
+ "siglip2.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00002.safetensors",
562
+ "siglip2.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00002.safetensors",
563
+ "siglip2.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00002.safetensors",
564
+ "siglip2.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00002.safetensors",
565
+ "siglip2.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00002.safetensors",
566
+ "siglip2.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00002.safetensors",
567
+ "siglip2.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00002.safetensors",
568
+ "siglip2.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
569
+ "siglip2.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
570
+ "siglip2.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
571
+ "siglip2.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
572
+ "siglip2.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
573
+ "siglip2.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
574
+ "siglip2.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
575
+ "siglip2.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
576
+ "siglip2.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00002.safetensors",
577
+ "siglip2.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00002.safetensors",
578
+ "siglip2.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00002.safetensors",
579
+ "siglip2.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00002.safetensors",
580
+ "siglip2.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00002.safetensors",
581
+ "siglip2.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00002.safetensors",
582
+ "siglip2.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00002.safetensors",
583
+ "siglip2.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00002.safetensors",
584
+ "siglip2.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
585
+ "siglip2.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
586
+ "siglip2.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
587
+ "siglip2.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
588
+ "siglip2.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
589
+ "siglip2.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
590
+ "siglip2.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
591
+ "siglip2.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
592
+ "siglip2.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00002.safetensors",
593
+ "siglip2.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00002.safetensors",
594
+ "siglip2.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00002.safetensors",
595
+ "siglip2.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00002.safetensors",
596
+ "siglip2.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00002.safetensors",
597
+ "siglip2.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00002.safetensors",
598
+ "siglip2.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00002.safetensors",
599
+ "siglip2.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00002.safetensors",
600
+ "siglip2.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
601
+ "siglip2.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
602
+ "siglip2.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
603
+ "siglip2.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
604
+ "siglip2.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
605
+ "siglip2.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
606
+ "siglip2.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
607
+ "siglip2.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
608
+ "siglip2.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00002.safetensors",
609
+ "siglip2.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00002.safetensors",
610
+ "siglip2.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00002.safetensors",
611
+ "siglip2.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00002.safetensors",
612
+ "siglip2.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00002.safetensors",
613
+ "siglip2.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00002.safetensors",
614
+ "siglip2.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00002.safetensors",
615
+ "siglip2.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00002.safetensors",
616
+ "siglip2.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
617
+ "siglip2.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
618
+ "siglip2.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
619
+ "siglip2.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
620
+ "siglip2.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
621
+ "siglip2.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
622
+ "siglip2.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
623
+ "siglip2.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
624
+ "siglip2.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00002.safetensors",
625
+ "siglip2.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00002.safetensors",
626
+ "siglip2.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00002.safetensors",
627
+ "siglip2.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00002.safetensors",
628
+ "siglip2.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00002.safetensors",
629
+ "siglip2.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00002.safetensors",
630
+ "siglip2.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00002.safetensors",
631
+ "siglip2.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00002.safetensors",
632
+ "siglip2.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
633
+ "siglip2.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
634
+ "siglip2.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
635
+ "siglip2.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
636
+ "siglip2.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
637
+ "siglip2.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
638
+ "siglip2.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
639
+ "siglip2.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
640
+ "siglip2.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00002.safetensors",
641
+ "siglip2.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00002.safetensors",
642
+ "siglip2.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00002.safetensors",
643
+ "siglip2.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00002.safetensors",
644
+ "siglip2.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00002.safetensors",
645
+ "siglip2.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00002.safetensors",
646
+ "siglip2.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00002.safetensors",
647
+ "siglip2.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00002.safetensors",
648
+ "siglip2.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
649
+ "siglip2.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
650
+ "siglip2.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
651
+ "siglip2.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
652
+ "siglip2.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
653
+ "siglip2.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
654
+ "siglip2.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
655
+ "siglip2.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
656
+ "siglip2.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00002.safetensors",
657
+ "siglip2.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00002.safetensors",
658
+ "siglip2.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00002.safetensors",
659
+ "siglip2.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00002.safetensors",
660
+ "siglip2.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00002.safetensors",
661
+ "siglip2.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00002.safetensors",
662
+ "siglip2.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00002.safetensors",
663
+ "siglip2.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00002.safetensors",
664
+ "siglip2.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
665
+ "siglip2.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
666
+ "siglip2.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
667
+ "siglip2.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
668
+ "siglip2.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
669
+ "siglip2.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
670
+ "siglip2.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
671
+ "siglip2.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
672
+ "siglip2.vision_model.encoder.layers.24.layer_norm1.bias": "model-00001-of-00002.safetensors",
673
+ "siglip2.vision_model.encoder.layers.24.layer_norm1.weight": "model-00001-of-00002.safetensors",
674
+ "siglip2.vision_model.encoder.layers.24.layer_norm2.bias": "model-00001-of-00002.safetensors",
675
+ "siglip2.vision_model.encoder.layers.24.layer_norm2.weight": "model-00001-of-00002.safetensors",
676
+ "siglip2.vision_model.encoder.layers.24.mlp.fc1.bias": "model-00001-of-00002.safetensors",
677
+ "siglip2.vision_model.encoder.layers.24.mlp.fc1.weight": "model-00001-of-00002.safetensors",
678
+ "siglip2.vision_model.encoder.layers.24.mlp.fc2.bias": "model-00001-of-00002.safetensors",
679
+ "siglip2.vision_model.encoder.layers.24.mlp.fc2.weight": "model-00001-of-00002.safetensors",
680
+ "siglip2.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
681
+ "siglip2.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
682
+ "siglip2.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
683
+ "siglip2.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
684
+ "siglip2.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
685
+ "siglip2.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
686
+ "siglip2.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
687
+ "siglip2.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
688
+ "siglip2.vision_model.encoder.layers.25.layer_norm1.bias": "model-00001-of-00002.safetensors",
689
+ "siglip2.vision_model.encoder.layers.25.layer_norm1.weight": "model-00001-of-00002.safetensors",
690
+ "siglip2.vision_model.encoder.layers.25.layer_norm2.bias": "model-00001-of-00002.safetensors",
691
+ "siglip2.vision_model.encoder.layers.25.layer_norm2.weight": "model-00001-of-00002.safetensors",
692
+ "siglip2.vision_model.encoder.layers.25.mlp.fc1.bias": "model-00001-of-00002.safetensors",
693
+ "siglip2.vision_model.encoder.layers.25.mlp.fc1.weight": "model-00001-of-00002.safetensors",
694
+ "siglip2.vision_model.encoder.layers.25.mlp.fc2.bias": "model-00001-of-00002.safetensors",
695
+ "siglip2.vision_model.encoder.layers.25.mlp.fc2.weight": "model-00001-of-00002.safetensors",
696
+ "siglip2.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
697
+ "siglip2.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
698
+ "siglip2.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
699
+ "siglip2.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
700
+ "siglip2.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
701
+ "siglip2.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
702
+ "siglip2.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
703
+ "siglip2.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
704
+ "siglip2.vision_model.encoder.layers.26.layer_norm1.bias": "model-00001-of-00002.safetensors",
705
+ "siglip2.vision_model.encoder.layers.26.layer_norm1.weight": "model-00001-of-00002.safetensors",
706
+ "siglip2.vision_model.encoder.layers.26.layer_norm2.bias": "model-00001-of-00002.safetensors",
707
+ "siglip2.vision_model.encoder.layers.26.layer_norm2.weight": "model-00001-of-00002.safetensors",
708
+ "siglip2.vision_model.encoder.layers.26.mlp.fc1.bias": "model-00001-of-00002.safetensors",
709
+ "siglip2.vision_model.encoder.layers.26.mlp.fc1.weight": "model-00001-of-00002.safetensors",
710
+ "siglip2.vision_model.encoder.layers.26.mlp.fc2.bias": "model-00001-of-00002.safetensors",
711
+ "siglip2.vision_model.encoder.layers.26.mlp.fc2.weight": "model-00001-of-00002.safetensors",
712
+ "siglip2.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
713
+ "siglip2.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
714
+ "siglip2.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
715
+ "siglip2.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
716
+ "siglip2.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
717
+ "siglip2.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
718
+ "siglip2.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
719
+ "siglip2.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
720
+ "siglip2.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00002.safetensors",
721
+ "siglip2.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00002.safetensors",
722
+ "siglip2.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00002.safetensors",
723
+ "siglip2.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00002.safetensors",
724
+ "siglip2.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00002.safetensors",
725
+ "siglip2.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00002.safetensors",
726
+ "siglip2.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00002.safetensors",
727
+ "siglip2.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00002.safetensors",
728
+ "siglip2.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
729
+ "siglip2.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
730
+ "siglip2.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
731
+ "siglip2.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
732
+ "siglip2.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
733
+ "siglip2.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
734
+ "siglip2.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
735
+ "siglip2.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
736
+ "siglip2.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00002.safetensors",
737
+ "siglip2.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00002.safetensors",
738
+ "siglip2.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00002.safetensors",
739
+ "siglip2.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00002.safetensors",
740
+ "siglip2.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00002.safetensors",
741
+ "siglip2.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00002.safetensors",
742
+ "siglip2.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00002.safetensors",
743
+ "siglip2.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00002.safetensors",
744
+ "siglip2.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
745
+ "siglip2.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
746
+ "siglip2.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
747
+ "siglip2.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
748
+ "siglip2.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
749
+ "siglip2.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
750
+ "siglip2.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
751
+ "siglip2.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
752
+ "siglip2.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00002.safetensors",
753
+ "siglip2.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00002.safetensors",
754
+ "siglip2.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00002.safetensors",
755
+ "siglip2.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00002.safetensors",
756
+ "siglip2.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00002.safetensors",
757
+ "siglip2.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00002.safetensors",
758
+ "siglip2.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00002.safetensors",
759
+ "siglip2.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00002.safetensors",
760
+ "siglip2.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
761
+ "siglip2.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
762
+ "siglip2.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
763
+ "siglip2.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
764
+ "siglip2.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
765
+ "siglip2.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
766
+ "siglip2.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
767
+ "siglip2.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
768
+ "siglip2.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00002.safetensors",
769
+ "siglip2.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00002.safetensors",
770
+ "siglip2.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00002.safetensors",
771
+ "siglip2.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00002.safetensors",
772
+ "siglip2.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00002.safetensors",
773
+ "siglip2.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00002.safetensors",
774
+ "siglip2.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00002.safetensors",
775
+ "siglip2.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00002.safetensors",
776
+ "siglip2.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
777
+ "siglip2.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
778
+ "siglip2.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
779
+ "siglip2.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
780
+ "siglip2.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
781
+ "siglip2.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
782
+ "siglip2.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
783
+ "siglip2.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
784
+ "siglip2.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00002.safetensors",
785
+ "siglip2.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00002.safetensors",
786
+ "siglip2.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00002.safetensors",
787
+ "siglip2.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00002.safetensors",
788
+ "siglip2.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00002.safetensors",
789
+ "siglip2.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00002.safetensors",
790
+ "siglip2.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00002.safetensors",
791
+ "siglip2.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00002.safetensors",
792
+ "siglip2.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
793
+ "siglip2.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
794
+ "siglip2.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
795
+ "siglip2.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
796
+ "siglip2.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
797
+ "siglip2.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
798
+ "siglip2.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
799
+ "siglip2.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
800
+ "siglip2.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00002.safetensors",
801
+ "siglip2.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00002.safetensors",
802
+ "siglip2.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00002.safetensors",
803
+ "siglip2.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00002.safetensors",
804
+ "siglip2.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00002.safetensors",
805
+ "siglip2.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00002.safetensors",
806
+ "siglip2.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00002.safetensors",
807
+ "siglip2.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00002.safetensors",
808
+ "siglip2.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
809
+ "siglip2.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
810
+ "siglip2.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
811
+ "siglip2.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
812
+ "siglip2.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
813
+ "siglip2.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
814
+ "siglip2.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
815
+ "siglip2.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
816
+ "siglip2.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00002.safetensors",
817
+ "siglip2.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00002.safetensors",
818
+ "siglip2.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00002.safetensors",
819
+ "siglip2.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00002.safetensors",
820
+ "siglip2.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00002.safetensors",
821
+ "siglip2.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00002.safetensors",
822
+ "siglip2.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00002.safetensors",
823
+ "siglip2.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00002.safetensors",
824
+ "siglip2.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
825
+ "siglip2.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
826
+ "siglip2.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
827
+ "siglip2.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
828
+ "siglip2.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
829
+ "siglip2.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
830
+ "siglip2.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
831
+ "siglip2.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
832
+ "siglip2.vision_model.post_layernorm.bias": "model-00001-of-00002.safetensors",
833
+ "siglip2.vision_model.post_layernorm.weight": "model-00001-of-00002.safetensors"
834
+ }
835
+ }
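The weight map above assigns every parameter name to the shard file that stores it, so a loader only has to open the shard(s) it actually needs. Below is a minimal, illustrative sketch (not part of this repository) of resolving a single tensor through such an index, assuming the standard safetensors Python API and the usual model.safetensors.index.json file name:

    import json
    from safetensors import safe_open

    # Load the index and look up which shard holds a given parameter.
    with open("model.safetensors.index.json") as f:
        index = json.load(f)

    name = "model.norm.weight"                # listed above under weight_map
    shard = index["weight_map"][name]         # e.g. "model-00002-of-00002.safetensors"

    # Open only that shard and read the single tensor.
    with safe_open(shard, framework="pt", device="cpu") as shard_file:
        tensor = shard_file.get_tensor(name)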
modeling_siglip2.py ADDED
@@ -0,0 +1,1594 @@
1
+ import math
2
+ import warnings
3
+ from dataclasses import dataclass
4
+ from typing import Any, Callable, Optional, Tuple, Union
5
+
6
+ import os
7
+ import numpy as np
8
+ import torch
9
+ import torch.nn as nn
10
+ import torch.nn.functional as F
11
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
12
+ from torch.nn.init import _calculate_fan_in_and_fan_out
13
+
14
+ from transformers.activations import ACT2FN
15
+ from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
16
+ # from transformers.modeling_layers import GradientCheckpointingLayer
17
+ from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
18
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
19
+ from transformers.utils import (
20
+ ModelOutput,
21
+ add_start_docstrings,
22
+ add_start_docstrings_to_model_forward,
23
+ can_return_tuple,
24
+ logging,
25
+ replace_return_docstrings,
26
+ is_flash_attn_2_available,
27
+ is_flash_attn_greater_or_equal_2_10,
28
+ )
29
+
30
+ from .configuration_siglip2 import Siglip2Config, Siglip2TextConfig, Siglip2VisionConfig
31
+
32
+
33
+ logger = logging.get_logger(__name__)
34
+
35
+ _CONFIG_FOR_DOC = "Siglip2Config"
36
+
37
+ is_aiter_available = False
38
+
39
+ if is_flash_attn_2_available():
40
+ try:
41
+ from aiter import flash_attn_varlen_func
42
+ is_aiter_available = True
43
+ except ImportError:
44
+ from flash_attn import flash_attn_varlen_func
45
+ from flash_attn.layers.rotary import apply_rotary_emb
46
+
47
+ else:
48
+ flash_attn_varlen_func = None
49
+ apply_rotary_emb = None
50
+
51
+
52
+ if is_flash_attn_2_available():
53
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward
54
+ else:
55
+ flash_attn_varlen_func = None
56
+
57
+ @dataclass
58
+ class Siglip2VisionOutput(ModelOutput):
59
+ """
60
+ Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
61
+
62
+ Args:
63
+ image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`
64
+ *optional* returned when model is initialized with `with_projection=True`):
65
+ The image embeddings obtained by applying the projection layer to the pooler_output.
66
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
67
+ Sequence of hidden-states at the output of the last layer of the model.
68
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`
69
+ is passed or when `config.output_hidden_states=True`):
70
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
71
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
72
+
73
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
74
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or
75
+ when `config.output_attentions=True`):
76
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
77
+ sequence_length)`.
78
+
79
+ Attention weights after the attention softmax, used to compute the weighted average in the self-attention
80
+ heads.
81
+ """
82
+
83
+ image_embeds: Optional[torch.FloatTensor] = None
84
+ last_hidden_state: Optional[torch.FloatTensor] = None
85
+ hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
86
+ attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
87
+
88
+
89
+ @dataclass
90
+ class Siglip2TextOutput(ModelOutput):
91
+ """
92
+ Base class for text model's outputs that also contains a pooling of the last hidden states.
93
+
94
+ Args:
95
+ text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`
96
+ *optional* returned when model is initialized with `with_projection=True`):
97
+ The text embeddings obtained by applying the projection layer to the pooler_output.
98
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
99
+ Sequence of hidden-states at the output of the last layer of the model.
100
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned
101
+ when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
102
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
103
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
104
+
105
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
106
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed
107
+ or when `config.output_attentions=True`):
108
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
109
+ sequence_length)`.
110
+
111
+ Attention weights after the attention softmax, used to compute the weighted average in the self-attention
112
+ heads.
113
+ """
114
+
115
+ text_embeds: Optional[torch.FloatTensor] = None
116
+ last_hidden_state: Optional[torch.FloatTensor] = None
117
+ hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
118
+ attentions: Optional[Tuple[torch.FloatTensor, ...]] = None
119
+
120
+
121
+ @dataclass
122
+ class Siglip2Output(ModelOutput):
123
+ """
124
+ Args:
125
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
126
+ Contrastive loss for image-text similarity.
127
+ logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
128
+ The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
129
+ similarity scores.
130
+ logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
131
+ The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
132
+ similarity scores.
133
+ text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
134
+ The text embeddings obtained by applying the projection layer to the pooled output of [`Siglip2TextModel`].
135
+ image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
136
+ The image embeddings obtained by applying the projection layer to
137
+ the pooled output of [`Siglip2VisionModel`].
138
+ text_model_output (`BaseModelOutputWithPooling`):
139
+ The output of the [`Siglip2TextModel`].
140
+ vision_model_output (`BaseModelOutputWithPooling`):
141
+ The output of the [`Siglip2VisionModel`].
142
+ """
143
+
144
+ loss: Optional[torch.FloatTensor] = None
145
+ logits_per_image: Optional[torch.FloatTensor] = None
146
+ logits_per_text: Optional[torch.FloatTensor] = None
147
+ text_embeds: Optional[torch.FloatTensor] = None
148
+ image_embeds: Optional[torch.FloatTensor] = None
149
+ text_model_output: BaseModelOutputWithPooling = None
150
+ vision_model_output: BaseModelOutputWithPooling = None
151
+
152
+ def to_tuple(self) -> Tuple[Any]:
153
+ return tuple(
154
+ self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
155
+ for k in self.keys()
156
+ )
157
+
158
+ class Siglip2VisionEmbeddings(nn.Module):
159
+ def __init__(self, config: Siglip2VisionConfig):
160
+ super().__init__()
161
+ self.config = config
162
+ self.embed_dim = config.hidden_size
163
+ self.patch_size = config.patch_size
164
+
165
+ if hasattr(config, 'in_features') and config.in_features > 0:
166
+ self.in_features = config.in_features
167
+ else:
168
+ self.in_features = config.num_channels * self.patch_size * self.patch_size
169
+
170
+ self.patch_embedding = nn.Linear(
171
+ in_features=self.in_features,
172
+ out_features=self.embed_dim,
173
+ )
174
+
175
+ self.num_patches = config.num_patches
176
+ self.position_embedding_size = int(self.num_patches**0.5)
177
+ self.position_embedding = nn.Embedding(self.num_patches, self.embed_dim)
178
+
179
+ @staticmethod
180
+ def resize_positional_embeddings(
181
+ positional_embeddings: torch.Tensor,
182
+ spatial_shapes: torch.LongTensor,
183
+ max_length: int,
184
+ ) -> torch.Tensor:
185
+ """
186
+ Resize positional embeddings to image-specific size and pad to a fixed size.
187
+
188
+ Args:
189
+ positional_embeddings (`torch.Tensor`):
190
+ Position embeddings of shape (height, width, embed_dim)
191
+ spatial_shapes (`torch.LongTensor`):
192
+ Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
193
+ max_length (`int`):
194
+ Maximum length of the positional embeddings to pad resized positional embeddings to
195
+
196
+ Returns:
197
+ `torch.Tensor`: Embeddings of shape (batch_size, max_length, embed_dim)
198
+ """
199
+ batch_size = spatial_shapes.shape[0]
200
+ embed_dim = positional_embeddings.shape[-1]
201
+ source_dtype = positional_embeddings.dtype
202
+
203
+ resulted_positional_embeddings = torch.empty(
204
+ (batch_size, max_length, embed_dim),
205
+ device=positional_embeddings.device,
206
+ dtype=source_dtype,
207
+ )
208
+
209
+ positional_embeddings = positional_embeddings.permute(2, 0, 1).unsqueeze(0)
210
+ if positional_embeddings.device.type == "cpu":
211
+ positional_embeddings = positional_embeddings.to(torch.float32)
212
+ for i in range(batch_size):
213
+ height, width = spatial_shapes[i]
214
+ resized_embeddings = F.interpolate(
215
+ positional_embeddings,
216
+ size=(height, width),
217
+ mode="bilinear",
218
+ align_corners=False,
219
+ antialias=True,
220
+ )
221
+ resized_embeddings = resized_embeddings.reshape(embed_dim, height * width).transpose(0, 1)
222
+ resized_embeddings = resized_embeddings.to(source_dtype)
223
+
224
+ resulted_positional_embeddings[i, : height * width] = resized_embeddings
225
+ resulted_positional_embeddings[i, height * width :] = resized_embeddings[0]
226
+
227
+ return resulted_positional_embeddings
228
+
229
+
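# Illustrative note (comments below are not part of the upstream file): the static
# helper above bilinearly interpolates the learned square position grid to each
# image's (height, width) patch grid, flattens it, and pads every sample to
# max_length. A hypothetical call could look like:
#   pos = position_embedding.weight.reshape(16, 16, -1)      # 16x16 learned grid (example size)
#   shapes = torch.tensor([[8, 12], [4, 4]])                  # per-image (h, w) in patches
#   out = Siglip2VisionEmbeddings.resize_positional_embeddings(pos, shapes, max_length=128)
#   # out has shape (2, 128, embed_dim); rows past h*w repeat the first resized row as padding.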
230
+ def forward(self, pixel_values: torch.FloatTensor, spatial_shapes: torch.LongTensor) -> torch.Tensor:
231
+ """
232
+ Args:
233
+ pixel_values (`torch.FloatTensor`):
234
+ Pixel values of shape (batch_size, max_num_patches, num_channels * patch_size * patch_size)
235
+ spatial_shapes (`List[Tuple[int, int]]`):
236
+ Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
237
+ """
238
+
239
+ target_dtype = self.patch_embedding.weight.dtype
240
+ patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
241
+ positional_embeddings = self.position_embedding.weight.reshape(
242
+ self.position_embedding_size, self.position_embedding_size, -1
243
+ )
244
+
245
+ resized_positional_embeddings = self.resize_positional_embeddings(
246
+ positional_embeddings, spatial_shapes, max_length=pixel_values.shape[1]
247
+ )
248
+ embeddings = patch_embeds + resized_positional_embeddings
249
+ return embeddings
250
+
251
+
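A minimal sketch of the resizing helper above in isolation, assuming this file is importable as `modeling_siglip2`; the 16x16 pretrained grid, the embed dim, and the two patch-grid shapes are invented for illustration:

import torch

from modeling_siglip2 import Siglip2VisionEmbeddings  # assumed import path for this repo

# Hypothetical pretrained position grid: 16x16 positions, embed_dim = 64.
grid = torch.randn(16, 16, 64)
# Two images whose preprocessed patch grids are 8x12 and 10x10.
spatial_shapes = torch.tensor([[8, 12], [10, 10]])

resized = Siglip2VisionEmbeddings.resize_positional_embeddings(
    grid, spatial_shapes, max_length=120
)
print(resized.shape)  # torch.Size([2, 120, 64]); rows beyond h*w are padding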
252
+ class Siglip2VisionEmbeddingsWoPos(nn.Module):
253
+ def __init__(self, config: Siglip2VisionConfig):
254
+ super().__init__()
255
+ self.config = config
256
+ self.embed_dim = config.hidden_size
257
+ self.patch_size = config.patch_size
258
+
259
+ if hasattr(config, 'in_features') and config.in_features > 0:
260
+ self.in_features = config.in_features
261
+ else:
262
+ self.in_features = config.num_channels * self.patch_size * self.patch_size
263
+
264
+ self.patch_embedding = nn.Linear(
265
+ in_features=self.in_features,
266
+ out_features=self.embed_dim,
267
+ )
268
+
269
+ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
270
+ target_dtype = self.patch_embedding.weight.dtype
271
+ patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
272
+ patch_embeds = patch_embeds.view(-1, self.embed_dim)
273
+ return patch_embeds
274
+
275
+
276
+ def eager_attention_forward(
277
+ module: nn.Module,
278
+ query: torch.Tensor,
279
+ key: torch.Tensor,
280
+ value: torch.Tensor,
281
+ attention_mask: Optional[torch.Tensor],
282
+ scaling: float,
283
+ dropout: float = 0.0,
284
+ **kwargs,
285
+ ):
286
+
287
+ attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
288
+ if attention_mask is not None:
289
+ attn_weights = attn_weights + attention_mask
290
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
291
+ attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
292
+ attn_output = torch.matmul(attn_weights, value)
293
+ attn_output = attn_output.transpose(1, 2).contiguous()
294
+ return attn_output, attn_weights
295
+
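A quick sanity check of `eager_attention_forward` on toy tensors; the dummy `nn.Module()` and the shapes are illustrative assumptions (the helper only reads `.training` from its first argument):

import torch
from torch import nn

batch, heads, seq, head_dim = 2, 4, 5, 8
q = torch.randn(batch, heads, seq, head_dim)
k = torch.randn(batch, heads, seq, head_dim)
v = torch.randn(batch, heads, seq, head_dim)

# eager_attention_forward is the module-level helper defined above.
out, weights = eager_attention_forward(
    nn.Module(), q, k, v, attention_mask=None, scaling=head_dim ** -0.5
)
print(out.shape)      # (2, 5, 4, 8): sequence dim moved ahead of heads
print(weights.shape)  # (2, 4, 5, 5)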
296
+ def apply_rotary_pos_emb_flashatt(
297
+ q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
298
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
299
+ cos = cos.chunk(2, dim=-1)[0].contiguous()
300
+ sin = sin.chunk(2, dim=-1)[0].contiguous()
301
+ q_embed = apply_rotary_emb(q.float(), cos.float(), sin.float()).type_as(q)
302
+ k_embed = apply_rotary_emb(k.float(), cos.float(), sin.float()).type_as(k)
303
+ return q_embed, k_embed
304
+
305
+ class Siglip2Attention(nn.Module):
306
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
307
+
308
+ def __init__(self, config: Union[Siglip2VisionConfig, Siglip2TextConfig]):
309
+ super().__init__()
310
+ self.config = config
311
+ self.embed_dim = config.hidden_size
312
+ self.num_heads = config.num_attention_heads
313
+ self.head_dim = self.embed_dim // self.num_heads
314
+ if self.head_dim * self.num_heads != self.embed_dim:
315
+ raise ValueError(
316
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
317
+ f" {self.num_heads})."
318
+ )
319
+ self.scale = self.head_dim**-0.5
320
+ self.dropout = config.attention_dropout
321
+ self.is_causal = False
322
+
323
+ self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
324
+ self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
325
+ self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
326
+ self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
327
+
328
+ def forward(
329
+ self,
330
+ hidden_states: torch.Tensor,
331
+ attention_mask: Optional[torch.Tensor] = None,
332
+ output_attentions: Optional[bool] = False,
333
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
334
+ """Input shape: Batch x Time x Channel"""
335
+
336
+ batch_size, seq_length, embed_dim = hidden_states.shape
337
+
338
+ queries = self.q_proj(hidden_states)
339
+ keys = self.k_proj(hidden_states)
340
+ values = self.v_proj(hidden_states)
341
+
342
+ queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
343
+ keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
344
+ values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
345
+
346
+ attention_interface: Callable = eager_attention_forward
347
+ if self.config._attn_implementation != "eager":
348
+ if self.config._attn_implementation == "sdpa" and output_attentions:
349
+ logger.warning_once(
350
+                         "`torch.nn.functional.scaled_dot_product_attention` does not support "
351
+                         "`output_attentions=True`. Falling back to eager attention. This warning "
352
+                         'can be removed using the argument `attn_implementation="eager"` when loading the model.'
353
+ )
354
+ else:
355
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
356
+
357
+ attn_output, attn_weights = attention_interface(
358
+ self,
359
+ queries,
360
+ keys,
361
+ values,
362
+ attention_mask,
363
+ is_causal=self.is_causal,
364
+ scaling=self.scale,
365
+ dropout=0.0 if not self.training else self.dropout,
366
+ )
367
+
368
+ attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
369
+ attn_output = self.out_proj(attn_output)
370
+
371
+ if not output_attentions:
372
+ attn_weights = None
373
+
374
+ return attn_output, attn_weights
375
+
376
+ class Vision_FlashAttention2(nn.Module):
377
+ def __init__(self, config: Union[Siglip2VisionConfig, Siglip2TextConfig]) -> None:
378
+ super().__init__()
379
+ dim = config.hidden_size
380
+ self.num_heads = config.num_attention_heads
381
+ self.k_proj = nn.Linear(dim, dim)
382
+ self.v_proj = nn.Linear(dim, dim)
383
+ self.q_proj = nn.Linear(dim, dim)
384
+ self.out_proj = nn.Linear(dim, dim)
385
+
386
+ def forward(
387
+ self,
388
+ hidden_states: torch.Tensor,
389
+ cu_seqlens: torch.Tensor,
390
+ rotary_pos_emb: Optional[torch.Tensor] = None,
391
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
392
+ ) -> torch.Tensor:
393
+
394
+ seq_length = hidden_states.shape[0]
395
+ q = self.q_proj(hidden_states).reshape(seq_length, self.num_heads, -1)
396
+ k = self.k_proj(hidden_states).reshape(seq_length, self.num_heads, -1)
397
+ v = self.v_proj(hidden_states).reshape(seq_length, self.num_heads, -1)
398
+
399
+
400
+ if position_embeddings is None:
401
+ logger.warning_once(
402
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
403
+ "through `rotary_pos_emb` (2D tensor of RoPE theta values), to using externally computed "
404
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.54 `rotary_pos_emb` will be "
405
+ "removed and `position_embeddings` will be mandatory."
406
+ )
407
+ emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
408
+ cos = emb.cos()
409
+ sin = emb.sin()
410
+ else:
411
+ cos, sin = position_embeddings
412
+
413
+ q, k = apply_rotary_pos_emb_flashatt(q.unsqueeze(0), k.unsqueeze(0), cos, sin)
414
+ q = q.squeeze(0)
415
+ k = k.squeeze(0)
416
+
417
+ max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item()
418
+ if is_aiter_available:
419
+ attn_output = flash_attn_varlen_func(q, k, v, cu_seqlens, cu_seqlens,
420
+ max_seqlen, max_seqlen, return_lse=True)[0].reshape(
421
+ seq_length, -1)
422
+ else:
423
+ attn_output = flash_attn_varlen_func(q, k, v, cu_seqlens, cu_seqlens,
424
+ max_seqlen, max_seqlen).reshape(
425
+ seq_length, -1)
426
+ attn_output = self.out_proj(attn_output)
427
+ return attn_output, None
428
+
429
+
430
+
431
+ def rotate_half(x):
432
+ """Rotates half the hidden dims of the input."""
433
+ x1 = x[..., : x.shape[-1] // 2]
434
+ x2 = x[..., x.shape[-1] // 2 :]
435
+ return torch.cat((-x2, x1), dim=-1)
436
+
437
+
438
+ def apply_rotary_pos_emb_vision(
439
+ q: torch.Tensor, k: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
440
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
441
+ orig_q_dtype = q.dtype
442
+ orig_k_dtype = k.dtype
443
+ q, k = q.float(), k.float()
444
+ cos, sin = cos.unsqueeze(-2).float(), sin.unsqueeze(-2).float()
445
+ q_embed = (q * cos) + (rotate_half(q) * sin)
446
+ k_embed = (k * cos) + (rotate_half(k) * sin)
447
+ q_embed = q_embed.to(orig_q_dtype)
448
+ k_embed = k_embed.to(orig_k_dtype)
449
+ return q_embed, k_embed
450
+
451
+
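`rotate_half` and `apply_rotary_pos_emb_vision` can be exercised on their own; the cos/sin table below is a stand-in for what `VisionRope` plus the encoder's `rot_pos_emb` produce, and all sizes here are invented:

import torch

seq_len, num_heads, head_dim = 4, 2, 8

# Fake a rotary table of shape (seq_len, head_dim), mimicking how the encoder
# concatenates the half-dim frequency table with itself.
inv_freq = 1.0 / (10000.0 ** (torch.arange(0, head_dim // 2, 2).float() / (head_dim // 2)))
freqs = torch.outer(torch.arange(seq_len).float(), inv_freq)  # (4, 2)
emb = torch.cat((freqs, freqs), dim=-1)                       # (4, 4)
emb = torch.cat((emb, emb), dim=-1)                           # (4, 8)
cos, sin = emb.cos(), emb.sin()

q = torch.randn(seq_len, num_heads, head_dim)
k = torch.randn(seq_len, num_heads, head_dim)
# apply_rotary_pos_emb_vision is the module-level function defined above.
q_rot, k_rot = apply_rotary_pos_emb_vision(q, k, cos, sin)
print(q_rot.shape, k_rot.shape)  # shapes unchanged: (4, 2, 8) each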
452
+ class Vision_EagerAttention(nn.Module):
453
+ def __init__(self, config: Union[Siglip2VisionConfig, Siglip2TextConfig]) -> None:
454
+ super().__init__()
455
+ dim = config.hidden_size
456
+ self.num_heads = config.num_attention_heads
457
+ self.k_proj = nn.Linear(dim, dim)
458
+ self.v_proj = nn.Linear(dim, dim)
459
+ self.q_proj = nn.Linear(dim, dim)
460
+ self.out_proj = nn.Linear(dim, dim)
461
+ self.head_dim = dim // self.num_heads
462
+
463
+ def forward(
464
+ self,
465
+ hidden_states: torch.Tensor,
466
+ cu_seqlens: torch.Tensor,
467
+ rotary_pos_emb: Optional[torch.Tensor] = None,
468
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
469
+ ) -> torch.Tensor:
470
+ seq_length = hidden_states.shape[0]
471
+ q = self.q_proj(hidden_states).reshape(seq_length, self.num_heads, -1)
472
+ k = self.k_proj(hidden_states).reshape(seq_length, self.num_heads, -1)
473
+ v = self.v_proj(hidden_states).reshape(seq_length, self.num_heads, -1)
474
+
475
+ if position_embeddings is None:
476
+ logger.warning_once(
477
+ "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
478
+ "through `rotary_pos_emb` (2D tensor of RoPE theta values), to using externally computed "
479
+ "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.54 `rotary_pos_emb` will be "
480
+ "removed and `position_embeddings` will be mandatory."
481
+ )
482
+ emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
483
+ cos = emb.cos()
484
+ sin = emb.sin()
485
+ else:
486
+ cos, sin = position_embeddings
487
+ q, k = apply_rotary_pos_emb_vision(q, k, cos, sin)
488
+
489
+ attention_mask = torch.full(
490
+ [1, seq_length, seq_length], torch.finfo(q.dtype).min, device=q.device, dtype=q.dtype
491
+ )
492
+ for i in range(1, len(cu_seqlens)):
493
+ attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = 0
494
+
495
+ q = q.transpose(0, 1)
496
+ k = k.transpose(0, 1)
497
+ v = v.transpose(0, 1)
498
+ attn_weights = torch.matmul(q, k.transpose(1, 2)) / math.sqrt(self.head_dim)
499
+ attn_weights = attn_weights + attention_mask
500
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q.dtype)
501
+ attn_output = torch.matmul(attn_weights, v)
502
+ attn_output = attn_output.transpose(0, 1)
503
+ attn_output = attn_output.reshape(seq_length, -1)
504
+ attn_output = self.out_proj(attn_output)
505
+ return attn_output, None
506
+
507
+
508
+ VISION_ATTENTION_CLASSES = {
509
+ 'eager': Vision_EagerAttention,
510
+ 'flash_attention_2': Vision_FlashAttention2,
511
+ }
512
+
513
+
514
+ class Siglip2MLP(nn.Module):
515
+ def __init__(self, config):
516
+ super().__init__()
517
+ self.config = config
518
+ self.activation_fn = ACT2FN[config.hidden_act]
519
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
520
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
521
+
522
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
523
+ hidden_states = self.fc1(hidden_states)
524
+ hidden_states = self.activation_fn(hidden_states)
525
+ hidden_states = self.fc2(hidden_states)
526
+ return hidden_states
527
+
528
+
529
+ # class Siglip2EncoderLayer(GradientCheckpointingLayer):
530
+ class Siglip2EncoderLayer(nn.Module):
531
+ def __init__(self, config: Union[Siglip2VisionConfig, Siglip2TextConfig]):
532
+ super().__init__()
533
+ self.embed_dim = config.hidden_size
534
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
535
+ self.self_attn = VISION_ATTENTION_CLASSES[config._attn_implementation](config=config)
536
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
537
+ self.mlp = Siglip2MLP(config)
538
+
539
+ def forward(
540
+ self,
541
+ hidden_states: torch.Tensor,
542
+ attention_mask: torch.Tensor,
543
+ cu_seqlens: torch.Tensor,
544
+ rotary_pos_emb: Optional[torch.Tensor] = None,
545
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
546
+ output_attentions: Optional[bool] = False,
547
+ ) -> Tuple[torch.FloatTensor]:
548
+ """
549
+ Args:
550
+ hidden_states (`torch.FloatTensor`):
551
+ Input to the layer of shape `(batch, seq_len, embed_dim)`.
552
+ attention_mask (`torch.FloatTensor`):
553
+ Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements
554
+ are indicated by very large negative values.
555
+ output_attentions (`bool`, *optional*, defaults to `False`):
556
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
557
+ returned tensors for more detail.
558
+ """
559
+ residual = hidden_states
560
+
561
+ hidden_states = self.layer_norm1(hidden_states)
562
+ hidden_states, attn_weights = self.self_attn(
563
+ hidden_states=hidden_states,
564
+ cu_seqlens=cu_seqlens,
565
+ rotary_pos_emb=rotary_pos_emb,
566
+ position_embeddings=position_embeddings,
567
+ )
568
+ hidden_states = residual + hidden_states
569
+
570
+ residual = hidden_states
571
+ hidden_states = self.layer_norm2(hidden_states)
572
+ hidden_states = self.mlp(hidden_states)
573
+ hidden_states = residual + hidden_states
574
+
575
+ outputs = (hidden_states,)
576
+
577
+ if output_attentions:
578
+ outputs += (attn_weights,)
579
+
580
+ return outputs
581
+
582
+ class VisionRope(nn.Module):
583
+ def __init__(self, dim: int, theta: float = 10000.0) -> None:
584
+ super().__init__()
585
+ inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float) / dim))
586
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
587
+
588
+ def forward(self, seqlen: int) -> torch.Tensor:
589
+ seq = torch.arange(seqlen, device=self.inv_freq.device, dtype=self.inv_freq.dtype)
590
+ freqs = torch.outer(seq, self.inv_freq)
591
+ return freqs
592
+
593
+ class Siglip2Encoder(nn.Module):
594
+ """
595
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
596
+ [`Siglip2EncoderLayer`].
597
+
598
+ Args:
599
+ config: Siglip2Config
600
+ """
601
+
602
+ def __init__(self, config: Siglip2Config):
603
+ super().__init__()
604
+ self.config = config
605
+ self.layers = nn.ModuleList([Siglip2EncoderLayer(config) for _ in range(config.num_hidden_layers)])
606
+ self.gradient_checkpointing = False
607
+ self.spatial_merge_size = 2
608
+ self.spatial_merge_unit = self.spatial_merge_size * self.spatial_merge_size
609
+ self.patch_size = config.patch_size
610
+ self.window_size = self.patch_size * 2 * 8
611
+
612
+         assert config.hidden_size % (config.num_attention_heads * 2) == 0
613
+         self.rotary_pos_emb = VisionRope(config.hidden_size // config.num_attention_heads // 2)
614
+
615
+ def rot_pos_emb(self, spatial_shapes):
616
+ pos_ids = []
617
+
618
+ for h, w in spatial_shapes:
619
+ t = 1
620
+ hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
621
+
622
+ hpos_ids = hpos_ids.reshape(
623
+ h // self.spatial_merge_size,
624
+ self.spatial_merge_size,
625
+ w // self.spatial_merge_size,
626
+ self.spatial_merge_size,
627
+ )
628
+ hpos_ids = hpos_ids.permute(0, 2, 1, 3)
629
+ hpos_ids = hpos_ids.flatten()
630
+
631
+ wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
632
+ wpos_ids = wpos_ids.reshape(
633
+ h // self.spatial_merge_size,
634
+ self.spatial_merge_size,
635
+ w // self.spatial_merge_size,
636
+ self.spatial_merge_size,
637
+ )
638
+ wpos_ids = wpos_ids.permute(0, 2, 1, 3)
639
+ wpos_ids = wpos_ids.flatten()
640
+
641
+ pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
642
+ pos_ids = torch.cat(pos_ids, dim=0)
643
+ max_grid_size = spatial_shapes.max()
644
+ rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
645
+ rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
646
+ return rotary_pos_emb
647
+
648
+ def get_window_index(self, spatial_shapes):
649
+ window_index: list = []
650
+ cu_window_seqlens: list = [0]
651
+ window_index_id = 0
652
+ vit_merger_window_size = self.window_size // self.spatial_merge_size // self.patch_size
653
+
654
+ for grid_h, grid_w in spatial_shapes:
655
+ grid_t = 1
656
+ llm_grid_h, llm_grid_w = (
657
+ grid_h // self.spatial_merge_size,
658
+ grid_w // self.spatial_merge_size,
659
+ )
660
+ index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(grid_t, llm_grid_h, llm_grid_w)
661
+ pad_h = (vit_merger_window_size - llm_grid_h % vit_merger_window_size) % vit_merger_window_size
662
+ pad_w = (vit_merger_window_size - llm_grid_w % vit_merger_window_size) % vit_merger_window_size
663
+ num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
664
+ num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size
665
+ index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100)
666
+ index_padded = index_padded.reshape(
667
+ grid_t,
668
+ num_windows_h,
669
+ vit_merger_window_size,
670
+ num_windows_w,
671
+ vit_merger_window_size,
672
+ )
673
+
674
+ index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape(
675
+ grid_t,
676
+ num_windows_h * num_windows_w,
677
+ vit_merger_window_size,
678
+ vit_merger_window_size,
679
+ )
680
+ seqlens = (index_padded != -100).sum([2, 3]).reshape(-1)
681
+ index_padded = index_padded.reshape(-1)
682
+ index_new = index_padded[index_padded != -100]
683
+ window_index.append(index_new + window_index_id)
684
+ cu_seqlens_tmp = seqlens.cumsum(0) * self.spatial_merge_unit + cu_window_seqlens[-1]
685
+ cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
686
+ window_index_id += (grid_t * llm_grid_h * llm_grid_w).item()
687
+
688
+ window_index = torch.cat(window_index, dim=0)
689
+
690
+ return window_index, cu_window_seqlens
691
+
692
+ @can_return_tuple
693
+ def forward(
694
+ self,
695
+ inputs_embeds,
696
+ spatial_shapes: torch.LongTensor,
697
+ attention_mask: Optional[torch.Tensor] = None,
698
+ output_attentions: Optional[bool] = None,
699
+ output_hidden_states: Optional[bool] = None,
700
+ ) -> BaseModelOutput:
701
+ r"""
702
+ Args:
703
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
704
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
705
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
706
+ than the model's internal embedding lookup matrix.
707
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
708
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
709
+
710
+ - 1 for tokens that are **not masked**,
711
+ - 0 for tokens that are **masked**.
712
+
713
+ [What are attention masks?](../glossary#attention-mask)
714
+ output_attentions (`bool`, *optional*):
715
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
716
+ returned tensors for more detail.
717
+ output_hidden_states (`bool`, *optional*):
718
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
719
+ for more detail.
720
+ return_dict (`bool`, *optional*):
721
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
722
+ """
723
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
724
+ output_hidden_states = (
725
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
726
+ )
727
+
728
+ encoder_states = () if output_hidden_states else None
729
+ all_attentions = () if output_attentions else None
730
+
731
+ hidden_states = inputs_embeds
732
+ rotary_pos_emb = self.rot_pos_emb(spatial_shapes)
733
+ window_index, cu_window_seqlens = self.get_window_index(spatial_shapes)
734
+ cu_window_seqlens = torch.tensor(
735
+ cu_window_seqlens,
736
+ device=hidden_states.device,
737
+ dtype=spatial_shapes.dtype if torch.jit.is_tracing() else torch.int32,
738
+ )
739
+ cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens)
740
+
741
+ seq_len, _ = hidden_states.size()
742
+ hidden_states = hidden_states.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
743
+ hidden_states = hidden_states[window_index, :, :]
744
+ hidden_states = hidden_states.reshape(seq_len, -1)
745
+ rotary_pos_emb = rotary_pos_emb.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
746
+ rotary_pos_emb = rotary_pos_emb[window_index, :, :]
747
+ rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1)
748
+ emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
749
+ position_embeddings = (emb.cos(), emb.sin())
750
+
751
+ cu_seqlens = torch.repeat_interleave(spatial_shapes[:, 0] * spatial_shapes[:, 1], 1).cumsum(
752
+ dim=0,
753
+ dtype=spatial_shapes.dtype if torch.jit.is_tracing() else torch.int32,
754
+ )
755
+ cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)
756
+
757
+ for layer_num, encoder_layer in enumerate(self.layers):
758
+ if output_hidden_states:
759
+ encoder_states = encoder_states + (hidden_states,)
760
+
761
+             if (layer_num + 1) % 8 == 0 or layer_num == len(self.layers) - 1:
762
+ cu_seqlens_now = cu_seqlens
763
+ else:
764
+ cu_seqlens_now = cu_window_seqlens
765
+
766
+ layer_outputs = encoder_layer(
767
+ hidden_states,
768
+ attention_mask,
769
+ cu_seqlens=cu_seqlens_now,
770
+ position_embeddings=position_embeddings
771
+ )
772
+
773
+ hidden_states = layer_outputs[0]
774
+ if output_attentions:
775
+ all_attentions = all_attentions + (layer_outputs[1],)
776
+
777
+
778
+ hidden_states = hidden_states.reshape(seq_len // self.spatial_merge_unit, self.spatial_merge_unit, -1)
779
+ reverse_indices = torch.argsort(window_index)
780
+ hidden_states = hidden_states[reverse_indices, :, :]
781
+ hidden_states = hidden_states.reshape(seq_len, -1)
782
+
783
+ if output_hidden_states:
784
+ encoder_states = encoder_states + (hidden_states,)
785
+
786
+ return BaseModelOutput(
787
+ last_hidden_state=hidden_states,
788
+ hidden_states=encoder_states,
789
+ attentions=all_attentions,
790
+ )
791
+
792
+
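The `get_window_index` logic above regroups merged tokens into fixed-size windows, padding the grid when it does not divide evenly (Qwen2-VL-style windowed attention). Below is a simplified standalone sketch of that regrouping for a single invented 3x5 merged grid with 2x2 windows; it mirrors the idea, not the exact method above:

import torch
import torch.nn.functional as F

llm_grid_h, llm_grid_w, win = 3, 5, 2   # merged-token grid and window size

index = torch.arange(llm_grid_h * llm_grid_w).reshape(llm_grid_h, llm_grid_w)
pad_h = (win - llm_grid_h % win) % win
pad_w = (win - llm_grid_w % win) % win
index = F.pad(index, (0, pad_w, 0, pad_h), value=-100)        # mark padding slots

nh, nw = (llm_grid_h + pad_h) // win, (llm_grid_w + pad_w) // win
windows = index.reshape(nh, win, nw, win).permute(0, 2, 1, 3)  # (nh, nw, win, win)
flat = windows.reshape(-1)
window_order = flat[flat != -100]                              # token order after regrouping
seqlens = (windows != -100).sum(dim=(2, 3)).reshape(-1)        # real tokens per window
print(window_order)  # tensor([ 0,  1,  5,  6,  2,  3,  7,  8,  4,  9, 10, 11, 12, 13, 14])
print(seqlens)       # tensor([4, 4, 2, 2, 2, 1]); edge windows hold fewer real tokens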
793
+ SIGLIP2_VISION_INPUTS_DOCSTRING = r"""
794
+ Args:
795
+         pixel_values (`torch.FloatTensor` of shape `(batch_size, max_num_patches, num_channels * patch_size * patch_size)`):
796
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
797
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
798
+ output_attentions (`bool`, *optional*):
799
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
800
+ tensors for more detail.
801
+ output_hidden_states (`bool`, *optional*):
802
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
803
+ more detail.
804
+ interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
805
+ Whether to interpolate the pre-trained position encodings.
806
+ return_dict (`bool`, *optional*):
807
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
808
+ """
809
+
810
+
811
+ class Siglip2VisionTransformer(nn.Module):
812
+ def __init__(self, config: Siglip2VisionConfig):
813
+ super().__init__()
814
+ self.config = config
815
+ embed_dim = config.hidden_size
816
+
817
+ self.embeddings = Siglip2VisionEmbeddingsWoPos(config)
818
+ self.encoder = Siglip2Encoder(config)
819
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
820
+ self.use_head = False
821
+ if self.use_head:
822
+ self.head = Siglip2MultiheadAttentionPoolingHead(config)
823
+ self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
824
+
825
+ @can_return_tuple
826
+ @add_start_docstrings_to_model_forward(SIGLIP2_VISION_INPUTS_DOCSTRING)
827
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Siglip2VisionConfig)
828
+ def forward(
829
+ self,
830
+ pixel_values: torch.FloatTensor,
831
+ attention_mask: torch.Tensor,
832
+ spatial_shapes: torch.LongTensor,
833
+ output_attentions: Optional[bool] = None,
834
+ output_hidden_states: Optional[bool] = None,
835
+ ) -> BaseModelOutputWithPooling:
836
+ r"""
837
+ Returns:
838
+
839
+ """
840
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
841
+ output_hidden_states = (
842
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
843
+ )
844
+
845
+ bs, length, dim = pixel_values.shape
846
+ hidden_states = self.embeddings(pixel_values)
847
+
848
+ if attention_mask is not None and not self._use_flash_attention_2:
849
+ encoder_attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
850
+ else:
851
+ encoder_attention_mask = attention_mask
852
+
853
+ encoder_outputs: BaseModelOutput = self.encoder(
854
+ inputs_embeds=hidden_states,
855
+ spatial_shapes=spatial_shapes,
856
+ attention_mask=encoder_attention_mask,
857
+ output_attentions=output_attentions,
858
+ output_hidden_states=output_hidden_states,
859
+ )
860
+
861
+ last_hidden_state = encoder_outputs.last_hidden_state
862
+
863
+ last_hidden_state = self.post_layernorm(last_hidden_state)
864
+
865
+ return BaseModelOutputWithPooling(
866
+ last_hidden_state=last_hidden_state,
867
+ pooler_output=None,
868
+ hidden_states=encoder_outputs.hidden_states,
869
+ attentions=encoder_outputs.attentions,
870
+ )
871
+
872
+
873
+ class Siglip2TextEmbeddings(nn.Module):
874
+ def __init__(self, config: Siglip2TextConfig):
875
+ super().__init__()
876
+ embed_dim = config.hidden_size
877
+
878
+ self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
879
+ self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
880
+
881
+ self.register_buffer(
882
+ "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
883
+ )
884
+
885
+ def forward(
886
+ self,
887
+ input_ids: Optional[torch.LongTensor] = None,
888
+ position_ids: Optional[torch.LongTensor] = None,
889
+ inputs_embeds: Optional[torch.FloatTensor] = None,
890
+ ) -> torch.Tensor:
891
+ seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
892
+ max_position_embedding = self.position_embedding.weight.shape[0]
893
+ if seq_length > max_position_embedding:
894
+ raise ValueError(
895
+ f"Sequence length must be less than max_position_embeddings (got `sequence length`: "
896
+                 f"{seq_length} and max_position_embeddings: {max_position_embedding})"
897
+ )
898
+
899
+ if position_ids is None:
900
+ position_ids = self.position_ids[:, :seq_length]
901
+
902
+ if inputs_embeds is None:
903
+ inputs_embeds = self.token_embedding(input_ids)
904
+
905
+ position_embeddings = self.position_embedding(position_ids)
906
+ embeddings = inputs_embeds + position_embeddings
907
+
908
+ return embeddings
909
+
910
+
911
+ def _trunc_normal_(tensor, mean, std, a, b):
912
+ def norm_cdf(x):
913
+ return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
914
+
915
+ if (mean < a - 2 * std) or (mean > b + 2 * std):
916
+ warnings.warn(
917
+ "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
918
+ "The distribution of values may be incorrect.",
919
+ stacklevel=2,
920
+ )
921
+
922
+ l = norm_cdf((a - mean) / std)
923
+ u = norm_cdf((b - mean) / std)
924
+
925
+ tensor.uniform_(2 * l - 1, 2 * u - 1)
926
+
927
+ tensor.erfinv_()
928
+
929
+ tensor.mul_(std * math.sqrt(2.0))
930
+ tensor.add_(mean)
931
+
932
+ tensor.clamp_(min=a, max=b)
933
+
934
+
935
+ def trunc_normal_tf_(
936
+ tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
937
+ ) -> torch.Tensor:
938
+ """
939
+ Args:
940
+ tensor: an n-dimensional `torch.Tensor`
941
+ mean: the mean of the normal distribution
942
+ std: the standard deviation of the normal distribution
943
+ a: the minimum cutoff value
944
+ b: the maximum cutoff value
945
+ """
946
+ with torch.no_grad():
947
+ _trunc_normal_(tensor, 0, 1.0, a, b)
948
+ tensor.mul_(std).add_(mean)
949
+
950
+
951
+ def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
952
+ fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
953
+ if mode == "fan_in":
954
+ denom = fan_in
955
+ elif mode == "fan_out":
956
+ denom = fan_out
957
+ elif mode == "fan_avg":
958
+ denom = (fan_in + fan_out) / 2
959
+
960
+ variance = scale / denom
961
+
962
+ if distribution == "truncated_normal":
963
+ trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
964
+ elif distribution == "normal":
965
+ with torch.no_grad():
966
+ tensor.normal_(std=math.sqrt(variance))
967
+ elif distribution == "uniform":
968
+ bound = math.sqrt(3 * variance)
969
+ with torch.no_grad():
970
+ tensor.uniform_(-bound, bound)
971
+ else:
972
+ raise ValueError(f"invalid distribution {distribution}")
973
+
974
+
975
+ def lecun_normal_(tensor):
976
+ variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
977
+
978
+
979
+ def default_flax_embed_init(tensor):
980
+ variance_scaling_(tensor, mode="fan_in", distribution="normal")
981
+
982
+
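A small check of the initializers above: `lecun_normal_` draws from a truncated normal whose variance is scaled to roughly 1/fan_in. The layer size and tolerance below are only indicative:

import torch
from torch import nn

linear = nn.Linear(512, 256)        # fan_in = 512
lecun_normal_(linear.weight)        # module-level helper defined above
print(linear.weight.var().item())   # close to 1/512 ~= 0.00195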
983
+ SIGLIP2_TEXT_INPUTS_DOCSTRING = r"""
984
+ Args:
985
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
986
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
987
+ it.
988
+
989
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
990
+ [`PreTrainedTokenizer.__call__`] for details.
991
+
992
+ [What are input IDs?](../glossary#input-ids)
993
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
994
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
995
+
996
+ - 1 for tokens that are **not masked**,
997
+ - 0 for tokens that are **masked**.
998
+
999
+ [What are attention masks?](../glossary#attention-mask)
1000
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1001
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
1002
+ config.max_position_embeddings - 1]`.
1003
+
1004
+ [What are position IDs?](../glossary#position-ids)
1005
+ output_attentions (`bool`, *optional*):
1006
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1007
+ tensors for more detail.
1008
+ output_hidden_states (`bool`, *optional*):
1009
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1010
+ more detail.
1011
+ return_dict (`bool`, *optional*):
1012
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1013
+ """
1014
+
1015
+
1016
+ class Siglip2TextTransformer(nn.Module):
1017
+ def __init__(self, config: Siglip2TextConfig):
1018
+ super().__init__()
1019
+ self.config = config
1020
+ embed_dim = config.hidden_size
1021
+ self.embeddings = Siglip2TextEmbeddings(config)
1022
+ self.encoder = Siglip2Encoder(config)
1023
+ self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
1024
+
1025
+ self.head = nn.Linear(embed_dim, config.projection_size)
1026
+ self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
1027
+
1028
+ @can_return_tuple
1029
+ @add_start_docstrings_to_model_forward(SIGLIP2_TEXT_INPUTS_DOCSTRING)
1030
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Siglip2TextConfig)
1031
+ def forward(
1032
+ self,
1033
+ input_ids: Optional[torch.Tensor] = None,
1034
+ attention_mask: Optional[torch.Tensor] = None,
1035
+ position_ids: Optional[torch.Tensor] = None,
1036
+ output_attentions: Optional[bool] = None,
1037
+ output_hidden_states: Optional[bool] = None,
1038
+ ) -> BaseModelOutputWithPooling:
1039
+ r"""
1040
+ Returns:
1041
+
1042
+ """
1043
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1044
+ output_hidden_states = (
1045
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1046
+ )
1047
+
1048
+ if input_ids is None:
1049
+ raise ValueError("You have to specify input_ids")
1050
+
1051
+ input_shape = input_ids.size()
1052
+ input_ids = input_ids.view(-1, input_shape[-1])
1053
+
1054
+ hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
1055
+
1056
+ if attention_mask is not None and not self._use_flash_attention_2:
1057
+ attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
1058
+
1059
+ encoder_outputs: BaseModelOutput = self.encoder(
1060
+ inputs_embeds=hidden_states,
1061
+ attention_mask=attention_mask,
1062
+ output_attentions=output_attentions,
1063
+ output_hidden_states=output_hidden_states,
1064
+ )
1065
+
1066
+ last_hidden_state = encoder_outputs.last_hidden_state
1067
+
1068
+ last_hidden_state = self.final_layer_norm(last_hidden_state)
1069
+
1070
+ pooled_output = last_hidden_state[:, -1, :]
1071
+ pooled_output = self.head(pooled_output)
1072
+
1073
+ return BaseModelOutputWithPooling(
1074
+ last_hidden_state=last_hidden_state,
1075
+ pooler_output=pooled_output,
1076
+ hidden_states=encoder_outputs.hidden_states,
1077
+ attentions=encoder_outputs.attentions,
1078
+ )
1079
+
1080
+
1081
+ SIGLIP2_START_DOCSTRING = r"""
1082
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
1083
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
1084
+ etc.)
1085
+
1086
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
1087
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
1088
+ and behavior.
1089
+
1090
+ Parameters:
1091
+ config ([`Siglip2Config`]): Model configuration class with all the parameters of the model.
1092
+ Initializing with a config file does not load the weights associated with the model, only the
1093
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
1094
+ """
1095
+
1096
+ SIGLIP2_INPUTS_DOCSTRING = r"""
1097
+ Args:
1098
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
1099
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
1100
+ it.
1101
+
1102
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
1103
+ [`PreTrainedTokenizer.__call__`] for details.
1104
+
1105
+ [What are input IDs?](../glossary#input-ids)
1106
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
1107
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
1108
+
1109
+ - 1 for tokens that are **not masked**,
1110
+ - 0 for tokens that are **masked**.
1111
+
1112
+ [What are attention masks?](../glossary#attention-mask)
1113
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1114
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
1115
+ config.max_position_embeddings - 1]`.
1116
+
1117
+ [What are position IDs?](../glossary#position-ids)
1118
+         pixel_values (`torch.FloatTensor` of shape `(batch_size, max_num_patches, num_channels * patch_size * patch_size)`):
1119
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
1120
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
1121
+ return_loss (`bool`, *optional*):
1122
+ Whether or not to return the contrastive loss.
1123
+ output_attentions (`bool`, *optional*):
1124
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1125
+ tensors for more detail.
1126
+ output_hidden_states (`bool`, *optional*):
1127
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1128
+ more detail.
1129
+ interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
1130
+ Whether to interpolate the pre-trained position encodings.
1131
+ return_dict (`bool`, *optional*):
1132
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1133
+ """
1134
+
1135
+
1136
+ class Siglip2PreTrainedModel(PreTrainedModel):
1137
+
1138
+ config_class = Siglip2Config
1139
+ base_model_prefix = "siglip2"
1140
+ supports_gradient_checkpointing = True
1141
+
1142
+ _no_split_modules = [
1143
+ "Siglip2TextEmbeddings",
1144
+         "Siglip2EncoderLayer",
1145
+         "Siglip2VisionEmbeddings",
1147
+ "Siglip2MultiheadAttentionPoolingHead",
1148
+ ]
1149
+ _supports_flash_attn_2 = True
1150
+ _supports_sdpa = True
1151
+
1152
+ def _init_weights(self, module):
1153
+ """Initialize the weights"""
1154
+ if isinstance(module, Siglip2VisionEmbeddings):
1155
+ width = (
1156
+ self.config.vision_config.hidden_size
1157
+ if isinstance(self.config, Siglip2Config)
1158
+ else self.config.hidden_size
1159
+ )
1160
+ nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
1161
+ elif isinstance(module, nn.Embedding):
1162
+ default_flax_embed_init(module.weight)
1163
+ elif isinstance(module, Siglip2Attention):
1164
+ nn.init.xavier_uniform_(module.q_proj.weight)
1165
+ nn.init.xavier_uniform_(module.k_proj.weight)
1166
+ nn.init.xavier_uniform_(module.v_proj.weight)
1167
+ nn.init.xavier_uniform_(module.out_proj.weight)
1168
+ nn.init.zeros_(module.q_proj.bias)
1169
+ nn.init.zeros_(module.k_proj.bias)
1170
+ nn.init.zeros_(module.v_proj.bias)
1171
+ nn.init.zeros_(module.out_proj.bias)
1172
+ elif isinstance(module, Siglip2MLP):
1173
+ nn.init.xavier_uniform_(module.fc1.weight)
1174
+ nn.init.xavier_uniform_(module.fc2.weight)
1175
+ nn.init.normal_(module.fc1.bias, std=1e-6)
1176
+ nn.init.normal_(module.fc2.bias, std=1e-6)
1177
+ elif isinstance(module, Siglip2MultiheadAttentionPoolingHead):
1178
+ nn.init.xavier_uniform_(module.probe.data)
1179
+ nn.init.xavier_uniform_(module.attention.in_proj_weight.data)
1180
+ nn.init.zeros_(module.attention.in_proj_bias.data)
1181
+ elif isinstance(module, Siglip2Model):
1182
+ logit_scale_init = torch.log(torch.tensor(1.0))
1183
+ module.logit_scale.data.fill_(logit_scale_init)
1184
+ module.logit_bias.data.zero_()
1185
+ elif isinstance(module, Siglip2ForImageClassification):
1186
+ nn.init.normal_(
1187
+ module.classifier.weight,
1188
+ std=self.config.vision_config.hidden_size**-0.5 * self.config.initializer_factor,
1189
+ )
1190
+ elif isinstance(module, (nn.Linear, nn.Conv2d)):
1191
+ lecun_normal_(module.weight)
1192
+ if module.bias is not None:
1193
+ nn.init.zeros_(module.bias)
1194
+ elif isinstance(module, nn.LayerNorm):
1195
+ module.bias.data.zero_()
1196
+ module.weight.data.fill_(1.0)
1197
+
1198
+
1199
+ @add_start_docstrings(
1200
+ """The text model from Siglip2 without any head or projection on top.""",
1201
+ SIGLIP2_START_DOCSTRING,
1202
+ )
1203
+ class Siglip2TextModel(Siglip2PreTrainedModel):
1204
+ config_class = Siglip2TextConfig
1205
+
1206
+ def __init__(self, config: Siglip2TextConfig):
1207
+ super().__init__(config)
1208
+ self.text_model = Siglip2TextTransformer(config)
1209
+ # Initialize weights and apply final processing
1210
+ self.post_init()
1211
+
1212
+ def get_input_embeddings(self) -> nn.Module:
1213
+ return self.text_model.embeddings.token_embedding
1214
+
1215
+ def set_input_embeddings(self, value):
1216
+ self.text_model.embeddings.token_embedding = value
1217
+
1218
+ @can_return_tuple
1219
+ @add_start_docstrings_to_model_forward(SIGLIP2_TEXT_INPUTS_DOCSTRING)
1220
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Siglip2TextConfig)
1221
+ def forward(
1222
+ self,
1223
+ input_ids: Optional[torch.Tensor] = None,
1224
+ attention_mask: Optional[torch.Tensor] = None,
1225
+ position_ids: Optional[torch.Tensor] = None,
1226
+ output_attentions: Optional[bool] = None,
1227
+ output_hidden_states: Optional[bool] = None,
1228
+ ) -> BaseModelOutputWithPooling:
1229
+ r"""
1230
+ Returns:
1231
+
1232
+ """
1233
+
1234
+ return self.text_model(
1235
+ input_ids=input_ids,
1236
+ attention_mask=attention_mask,
1237
+ position_ids=position_ids,
1238
+ output_attentions=output_attentions,
1239
+ output_hidden_states=output_hidden_states,
1240
+ )
1241
+
1242
+
1243
+ class Siglip2MultiheadAttentionPoolingHead(nn.Module):
1244
+ """Multihead Attention Pooling."""
1245
+
1246
+ def __init__(self, config: Siglip2VisionConfig):
1247
+ super().__init__()
1248
+
1249
+ self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
1250
+ self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True)
1251
+ self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
1252
+ self.mlp = Siglip2MLP(config)
1253
+ self.num_heads = config.num_attention_heads
1254
+
1255
+ def forward(self, hidden_state: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
1256
+ batch_size = hidden_state.shape[0]
1257
+ probe = self.probe.repeat(batch_size, 1, 1)
1258
+
1259
+ if attention_mask is not None:
1260
+ target_len, source_len = probe.shape[1], hidden_state.shape[1]
1261
+ attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_state.dtype, target_len)
1262
+ attention_mask = attention_mask.repeat(1, self.num_heads, target_len, 1)
1263
+ attention_mask = attention_mask.reshape(-1, target_len, source_len)
1264
+
1265
+ hidden_state = self.attention(probe, hidden_state, hidden_state, attn_mask=attention_mask)[0]
1266
+
1267
+ residual = hidden_state
1268
+ hidden_state = self.layernorm(hidden_state)
1269
+ hidden_state = residual + self.mlp(hidden_state)
1270
+
1271
+ return hidden_state[:, 0]
1272
+
1273
+
1274
+ @add_start_docstrings(
1275
+ """The vision model from Siglip2 without any head or projection on top.""",
1276
+ SIGLIP2_START_DOCSTRING,
1277
+ )
1278
+ class Siglip2VisionModel(Siglip2PreTrainedModel):
1279
+ config_class = Siglip2VisionConfig
1280
+ main_input_name = "pixel_values"
1281
+
1282
+ def __init__(self, config: Siglip2VisionConfig):
1283
+ super().__init__(config)
1284
+
1285
+ self.vision_model = Siglip2VisionTransformer(config)
1286
+
1287
+ # Initialize weights and apply final processing
1288
+ self.post_init()
1289
+
1290
+ def get_input_embeddings(self) -> nn.Module:
1291
+ return self.vision_model.embeddings.patch_embedding
1292
+
1293
+ @can_return_tuple
1294
+ @add_start_docstrings_to_model_forward(SIGLIP2_VISION_INPUTS_DOCSTRING)
1295
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Siglip2VisionConfig)
1296
+ def forward(
1297
+ self,
1298
+ pixel_values: torch.FloatTensor,
1299
+ pixel_attention_mask: torch.Tensor,
1300
+ spatial_shapes: torch.LongTensor,
1301
+ output_attentions: Optional[bool] = None,
1302
+ output_hidden_states: Optional[bool] = None,
1303
+ ) -> BaseModelOutputWithPooling:
1304
+ r"""
1305
+ Returns:
1306
+
1307
+         """
1308
+ return self.vision_model(
1309
+ pixel_values=pixel_values,
1310
+ attention_mask=pixel_attention_mask,
1311
+ spatial_shapes=spatial_shapes,
1312
+ output_attentions=output_attentions,
1313
+ output_hidden_states=output_hidden_states,
1314
+ )
1315
+
1316
+
1317
+ @add_start_docstrings(SIGLIP2_START_DOCSTRING)
1318
+ class Siglip2Model(Siglip2PreTrainedModel):
1319
+ config_class = Siglip2Config
1320
+
1321
+ def __init__(self, config: Siglip2Config):
1322
+ super().__init__(config)
1323
+
1324
+ if not isinstance(config.text_config, Siglip2TextConfig):
1325
+ raise TypeError(
1326
+ "config.text_config is expected to be of type Siglip2TextConfig but is of type"
1327
+ f" {type(config.text_config)}."
1328
+ )
1329
+
1330
+ if not isinstance(config.vision_config, Siglip2VisionConfig):
1331
+ raise TypeError(
1332
+ "config.vision_config is expected to be of type Siglip2VisionConfig but is of type"
1333
+ f" {type(config.vision_config)}."
1334
+ )
1335
+
1336
+ text_config = config.text_config
1337
+ vision_config = config.vision_config
1338
+
1339
+ text_model = Siglip2TextModel._from_config(text_config)
1340
+ vision_model = Siglip2VisionModel._from_config(vision_config)
1341
+
1342
+ self.text_model = text_model.text_model
1343
+ self.vision_model = vision_model.vision_model
1344
+
1345
+ self.logit_scale = nn.Parameter(torch.randn(1))
1346
+ self.logit_bias = nn.Parameter(torch.randn(1))
1347
+
1348
+ self.post_init()
1349
+
1350
+ @add_start_docstrings_to_model_forward(SIGLIP2_TEXT_INPUTS_DOCSTRING)
1351
+ def get_text_features(
1352
+ self,
1353
+ input_ids: Optional[torch.Tensor] = None,
1354
+ attention_mask: Optional[torch.Tensor] = None,
1355
+ position_ids: Optional[torch.Tensor] = None,
1356
+ output_attentions: Optional[bool] = None,
1357
+ output_hidden_states: Optional[bool] = None,
1358
+ ) -> torch.FloatTensor:
1359
+ r"""
1360
+             text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
1361
+ text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
1362
+ applying the projection layer to the pooled output of [`Siglip2TextModel`].
1363
+
1364
+ """
1365
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1366
+ output_hidden_states = (
1367
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1368
+ )
1369
+
1370
+ text_outputs: BaseModelOutputWithPooling = self.text_model(
1371
+ input_ids=input_ids,
1372
+ attention_mask=attention_mask,
1373
+ position_ids=position_ids,
1374
+ output_attentions=output_attentions,
1375
+ output_hidden_states=output_hidden_states,
1376
+ )
1377
+
1378
+ pooled_output = text_outputs.pooler_output
1379
+
1380
+ return pooled_output
1381
+
1382
+ @add_start_docstrings_to_model_forward(SIGLIP2_VISION_INPUTS_DOCSTRING)
1383
+ def get_image_features(
1384
+ self,
1385
+ pixel_values: Optional[torch.FloatTensor] = None,
1386
+ pixel_attention_mask: Optional[torch.Tensor] = None,
1387
+ spatial_shapes: Optional[torch.LongTensor] = None,
1388
+ output_attentions: Optional[bool] = None,
1389
+ output_hidden_states: Optional[bool] = None,
1390
+ ) -> torch.FloatTensor:
1391
+ r"""
1392
+ Returns:
1393
+             image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
1394
+ applying the projection layer to the pooled output of [`Siglip2VisionModel`].
1395
+
1396
+ """
1397
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1398
+ output_hidden_states = (
1399
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1400
+ )
1401
+
1402
+ vision_outputs: BaseModelOutputWithPooling = self.vision_model(
1403
+ pixel_values=pixel_values,
1404
+ attention_mask=pixel_attention_mask,
1405
+ spatial_shapes=spatial_shapes,
1406
+ output_attentions=output_attentions,
1407
+ output_hidden_states=output_hidden_states,
1408
+ )
1409
+
1410
+ pooled_output = vision_outputs.pooler_output
1411
+
1412
+ return pooled_output
1413
+
1414
+ @can_return_tuple
1415
+ @add_start_docstrings_to_model_forward(SIGLIP2_INPUTS_DOCSTRING)
1416
+ @replace_return_docstrings(output_type=Siglip2Output, config_class=Siglip2Config)
1417
+ def forward(
1418
+ self,
1419
+ input_ids: Optional[torch.LongTensor] = None,
1420
+ pixel_values: Optional[torch.FloatTensor] = None,
1421
+ pixel_attention_mask: Optional[torch.Tensor] = None,
1422
+ spatial_shapes: Optional[torch.LongTensor] = None,
1423
+ attention_mask: Optional[torch.Tensor] = None,
1424
+ position_ids: Optional[torch.LongTensor] = None,
1425
+ return_loss: Optional[bool] = None,
1426
+ output_attentions: Optional[bool] = None,
1427
+ output_hidden_states: Optional[bool] = None,
1428
+ ) -> Siglip2Output:
1429
+ r"""
1430
+ Returns:
1431
+
1432
+ """
1433
+
1434
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1435
+ output_hidden_states = (
1436
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1437
+ )
1438
+
1439
+ vision_outputs: BaseModelOutputWithPooling = self.vision_model(
1440
+ pixel_values=pixel_values,
1441
+ attention_mask=pixel_attention_mask,
1442
+ spatial_shapes=spatial_shapes,
1443
+ output_attentions=output_attentions,
1444
+ output_hidden_states=output_hidden_states,
1445
+ )
1446
+
1447
+ text_outputs: BaseModelOutputWithPooling = self.text_model(
1448
+ input_ids=input_ids,
1449
+ attention_mask=attention_mask,
1450
+ position_ids=position_ids,
1451
+ output_attentions=output_attentions,
1452
+ output_hidden_states=output_hidden_states,
1453
+ )
1454
+
1455
+ image_embeds = vision_outputs.pooler_output
1456
+ text_embeds = text_outputs.pooler_output
1457
+
1458
+ image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
1459
+ text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
1460
+
1461
+ logits_per_text = torch.matmul(text_embeds, image_embeds.t().to(text_embeds.device))
1462
+
1463
+ logit_scale, logit_bias = self.logit_scale.to(text_embeds.device), self.logit_bias.to(text_embeds.device)
1464
+ logits_per_text = logits_per_text * logit_scale.exp() + logit_bias
1465
+
1466
+ logits_per_image = logits_per_text.t()
1467
+
1468
+ loss = None
1469
+ if return_loss:
1470
+ eye = torch.eye(logits_per_text.size(0), device=logits_per_text.device)
1471
+ m1_diag1 = -torch.ones_like(logits_per_text) + 2 * eye
1472
+ loglik = torch.nn.functional.logsigmoid(m1_diag1 * logits_per_text)
1473
+ nll = -torch.sum(loglik, dim=-1)
1474
+ loss = nll.mean()
1475
+
1476
+ return Siglip2Output(
1477
+ loss=loss,
1478
+ logits_per_image=logits_per_image,
1479
+ logits_per_text=logits_per_text,
1480
+ text_embeds=text_embeds,
1481
+ image_embeds=image_embeds,
1482
+ text_model_output=text_outputs,
1483
+ vision_model_output=vision_outputs,
1484
+ )
1485
+
1486
+
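The `return_loss` branch in `Siglip2Model.forward` above is the pairwise sigmoid (SigLIP-style) objective: matched image-text pairs on the diagonal act as positives (+1) and every other pair as a negative (-1). The same computation on an invented 3x3 logit matrix:

import torch
import torch.nn.functional as F

logits_per_text = torch.tensor([[ 4.0, -2.0, -3.0],
                                [-1.0,  5.0, -2.0],
                                [-3.0, -1.0,  6.0]])

eye = torch.eye(3)
signs = -torch.ones_like(logits_per_text) + 2 * eye   # +1 on the diagonal, -1 elsewhere
loss = -F.logsigmoid(signs * logits_per_text).sum(dim=-1).mean()
print(loss)  # ~0.33: positives have large logits and negatives sit well below zero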
1487
+ @add_start_docstrings(
1488
+ """
1489
+ Siglip2 vision encoder with an image classification head on top (a
1490
+ linear layer on top of the pooled final hidden states of
1491
+ the patch tokens) e.g. for ImageNet.
1492
+ """,
1493
+ SIGLIP2_START_DOCSTRING,
1494
+ )
1495
+ class Siglip2ForImageClassification(Siglip2PreTrainedModel):
1496
+ main_input_name = "pixel_values"
1497
+
1498
+ def __init__(self, config: Siglip2Config) -> None:
1499
+ super().__init__(config)
1500
+
1501
+ self.num_labels = config.num_labels
1502
+
1503
+ vision_model = Siglip2VisionModel._from_config(config.vision_config)
1504
+ self.vision_model = vision_model.vision_model
1505
+
1506
+ self.classifier = (
1507
+ nn.Linear(config.vision_config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
1508
+ )
1509
+
1510
+ self.post_init()
1511
+
1512
+ @can_return_tuple
1513
+ @add_start_docstrings_to_model_forward(SIGLIP2_INPUTS_DOCSTRING)
1514
+ @replace_return_docstrings(output_type=ImageClassifierOutput, config_class=_CONFIG_FOR_DOC)
1515
+ def forward(
1516
+ self,
1517
+ pixel_values: Optional[torch.Tensor] = None,
1518
+ pixel_attention_mask: Optional[torch.Tensor] = None,
1519
+ spatial_shapes: Optional[torch.LongTensor] = None,
1520
+ labels: Optional[torch.Tensor] = None,
1521
+ output_attentions: Optional[bool] = None,
1522
+ output_hidden_states: Optional[bool] = None,
1523
+ ) -> ImageClassifierOutput:
1524
+ r"""
1525
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1526
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
1527
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1528
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1529
+
1530
+ Returns:
1531
+
1532
+ """
1533
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1534
+ output_hidden_states = (
1535
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1536
+ )
1537
+
1538
+ outputs: BaseModelOutputWithPooling = self.vision_model(
1539
+ pixel_values,
1540
+ attention_mask=pixel_attention_mask,
1541
+ spatial_shapes=spatial_shapes,
1542
+ output_attentions=output_attentions,
1543
+ output_hidden_states=output_hidden_states,
1544
+ )
1545
+
1546
+ sequence_output = outputs.last_hidden_state
1547
+
1548
+ if pixel_attention_mask is not None:
1549
+ pool_mask = pixel_attention_mask[..., None].to(sequence_output.device)
1550
+ sequence_output = torch.sum(sequence_output * pool_mask, dim=1) / torch.sum(pool_mask, dim=1)
1551
+ else:
1552
+ sequence_output = torch.mean(sequence_output, dim=1)
1553
+
1554
+ logits = self.classifier(sequence_output)
1555
+
1556
+ loss = None
1557
+ if labels is not None:
1558
+ labels = labels.to(logits.device)
1559
+ if self.config.problem_type is None:
1560
+ if self.num_labels == 1:
1561
+ self.config.problem_type = "regression"
1562
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1563
+ self.config.problem_type = "single_label_classification"
1564
+ else:
1565
+ self.config.problem_type = "multi_label_classification"
1566
+
1567
+ if self.config.problem_type == "regression":
1568
+ loss_fct = MSELoss()
1569
+ if self.num_labels == 1:
1570
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
1571
+ else:
1572
+ loss = loss_fct(logits, labels)
1573
+ elif self.config.problem_type == "single_label_classification":
1574
+ loss_fct = CrossEntropyLoss()
1575
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1576
+ elif self.config.problem_type == "multi_label_classification":
1577
+ loss_fct = BCEWithLogitsLoss()
1578
+ loss = loss_fct(logits, labels)
1579
+
1580
+ return ImageClassifierOutput(
1581
+ loss=loss,
1582
+ logits=logits,
1583
+ hidden_states=outputs.hidden_states,
1584
+ attentions=outputs.attentions,
1585
+ )
1586
+
1587
+
1588
+ __all__ = [
1589
+ "Siglip2Model",
1590
+ "Siglip2PreTrainedModel",
1591
+ "Siglip2TextModel",
1592
+ "Siglip2VisionModel",
1593
+ "Siglip2ForImageClassification",
1594
+ ]
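For reference, the masked mean pooling used by `Siglip2ForImageClassification` above can be reproduced in isolation; the shapes and the mask below are assumptions for illustration:

import torch

hidden = torch.randn(2, 6, 16)                 # (batch, max_num_patches, hidden_size)
mask = torch.tensor([[1, 1, 1, 1, 0, 0],
                     [1, 1, 1, 1, 1, 1]])      # 1 = real patch, 0 = padding

pool_mask = mask[..., None].to(hidden.dtype)   # (2, 6, 1)
pooled = (hidden * pool_mask).sum(dim=1) / pool_mask.sum(dim=1)
print(pooled.shape)                            # torch.Size([2, 16])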
modeling_youtu_vl.py ADDED
@@ -0,0 +1,1542 @@
1
+ # coding=utf-8
2
+ # Copyright 2026 Tencent Youtu lab, DeepSeek-AI and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+
21
+ import math
22
+ import os
23
+ from functools import partial
24
+ from typing import Callable, Optional, Tuple, Union, List, Any, Dict
25
+
26
+ import torch
27
+ import torch.nn.functional as F
28
+ from torch import nn
29
+
30
+ from transformers.activations import ACT2FN
31
+ from transformers.cache_utils import Cache, DynamicCache, StaticCache
32
+ from transformers.generation import GenerationMixin
33
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
34
+ from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
35
+ from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
36
+ from transformers.modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
37
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
38
+ from transformers.processing_utils import Unpack
39
+ from transformers.utils import (
40
+ add_start_docstrings,
41
+ add_start_docstrings_to_model_forward,
42
+ can_return_tuple,
43
+ is_torch_flex_attn_available,
44
+ logging,
45
+ replace_return_docstrings,
46
+ is_flash_attn_2_available,
47
+ )
48
+ from transformers.utils.deprecation import deprecate_kwarg
49
+ from .configuration_youtu_vl import YoutuVLConfig
50
+
51
+ from .modeling_siglip2 import Siglip2VisionModel, Siglip2VisionEmbeddings
52
+ from .configuration_siglip2 import Siglip2VisionConfig
53
+
54
+ if is_torch_flex_attn_available():
55
+ from torch.nn.attention.flex_attention import BlockMask
56
+ from transformers.integrations.flex_attention import make_flex_block_causal_mask
57
+
58
+ is_aiter_available = False
59
+
60
+ if is_flash_attn_2_available():
61
+ try:
62
+ from aiter import flash_attn_varlen_func
63
+ is_aiter_available = True
64
+ except ImportError:
65
+ from flash_attn import flash_attn_varlen_func
66
+ else:
67
+ flash_attn_varlen_func = None
68
+
69
+ logger = logging.get_logger(__name__)
70
+ _CONFIG_FOR_DOC = "YoutuVLConfig"
71
+
72
+ class YoutuRMSNorm(nn.Module):
73
+ def __init__(self, hidden_size, eps=1e-6):
74
+ super().__init__()
75
+ self.weight = nn.Parameter(torch.ones(hidden_size))
76
+ self.variance_epsilon = eps
77
+
78
+ def forward(self, hidden_states):
79
+ input_dtype = hidden_states.dtype
80
+
81
+ hidden_states = hidden_states.to(torch.float32)
82
+
83
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
84
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
85
+ return self.weight * hidden_states.to(input_dtype)
86
+
87
+ def extra_repr(self):
88
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
89
+
90
+
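
YoutuRMSNorm normalizes each feature vector by its root mean square in float32 before casting back and rescaling by a learned weight. A quick standalone check of the same computation (toy sizes, no model weights involved):

```python
import torch

def rms_norm(x, weight, eps=1e-6):
    # Normalize by sqrt(mean(x^2) + eps) in float32, then rescale and restore dtype.
    x32 = x.to(torch.float32)
    variance = x32.pow(2).mean(-1, keepdim=True)
    return (weight * (x32 * torch.rsqrt(variance + eps))).to(x.dtype)

x = torch.randn(2, 5, 8, dtype=torch.bfloat16)
out = rms_norm(x, torch.ones(8))
print(out.shape, out.dtype)  # torch.Size([2, 5, 8]) torch.bfloat16
```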
91
+ class YoutuRotaryEmbedding(nn.Module):
92
+ def __init__(self, config: YoutuVLConfig, device=None):
93
+ super().__init__()
94
+ if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
95
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
96
+ else:
97
+ self.rope_type = "default"
98
+ self.max_seq_len_cached = config.max_position_embeddings
99
+ self.original_max_seq_len = config.max_position_embeddings
100
+
101
+ self.config = config
102
+ self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
103
+
104
+ inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
105
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
106
+ self.original_inv_freq = self.inv_freq
107
+
108
+ @torch.no_grad()
109
+ @dynamic_rope_update
110
+ def forward(self, x, position_ids):
111
+ """
112
+ Compute rotary positional embeddings.
113
+
114
+ Args:
115
+ x (torch.Tensor): Input tensor, shape (batch_size, seq_len, feature_dim)
116
+ position_ids (torch.LongTensor): Position indices, shape (batch_size, seq_len)
117
+
118
+ Returns:
119
+ Tuple of (cos, sin) tensors for rotary embedding
120
+ """
121
+ inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
122
+ position_ids_expanded = position_ids[:, None, :].float()
123
+
124
+ device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
125
+ with torch.autocast(device_type=device_type, enabled=False): # Force float32
126
+ freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
127
+ emb = torch.cat((freqs, freqs), dim=-1)
128
+ cos = emb.cos() * self.attention_scaling
129
+ sin = emb.sin() * self.attention_scaling
130
+
131
+ return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
132
+
133
+
134
+ class YoutuMLP(nn.Module):
135
+ def __init__(self, config, hidden_size=None, intermediate_size=None):
136
+ super().__init__()
137
+ self.config = config
138
+ self.hidden_size = config.hidden_size if hidden_size is None else hidden_size
139
+ self.intermediate_size = config.intermediate_size if intermediate_size is None else intermediate_size
140
+
141
+ self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
142
+ self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
143
+ self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
144
+ self.act_fn = ACT2FN[config.hidden_act]
145
+
146
+ def forward(self, x):
147
+ down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
148
+ return down_proj
149
+
150
+
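
YoutuMLP is the standard gated feed-forward block: `down_proj(act_fn(gate_proj(x)) * up_proj(x))`. A shape-only sketch with invented sizes, assuming a SiLU activation (the real one comes from `config.hidden_act`):

```python
import torch
from torch import nn

hidden, inter = 16, 64                 # hypothetical sizes
gate = nn.Linear(hidden, inter, bias=False)
up = nn.Linear(hidden, inter, bias=False)
down = nn.Linear(inter, hidden, bias=False)

x = torch.randn(2, 7, hidden)
y = down(nn.functional.silu(gate(x)) * up(x))
print(y.shape)  # torch.Size([2, 7, 16])
```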
151
+ def rotate_half(x):
152
+ """
153
+ Rotates half the hidden dims of the input.
154
+ """
155
+ x1 = x[..., : x.shape[-1] // 2]
156
+ x2 = x[..., x.shape[-1] // 2 :]
157
+ return torch.cat((-x2, x1), dim=-1)
158
+
159
+
160
+ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
161
+ """Applies Rotary Position Embedding to the query and key tensors.
162
+
163
+ Args:
164
+ q (`torch.Tensor`): The query tensor.
165
+ k (`torch.Tensor`): The key tensor.
166
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
167
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
168
+ position_ids (`torch.Tensor`, *optional*):
169
+ Deprecated and unused.
170
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
171
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
172
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
173
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
174
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
175
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
176
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
177
+ Returns:
178
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
179
+ """
180
+ cos = cos.unsqueeze(unsqueeze_dim)
181
+ sin = sin.unsqueeze(unsqueeze_dim)
182
+ q_embed = (q * cos) + (rotate_half(q) * sin)
183
+ k_embed = (k * cos) + (rotate_half(k) * sin)
184
+ return q_embed, k_embed
185
+
186
+
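
`rotate_half` and `apply_rotary_pos_emb` rotate each query/key vector by position-dependent cos/sin pairs. A self-contained shape check with a hand-built frequency table (toy dimensions, unrelated to the real config):

```python
import torch

def rotate_half(x):
    x1, x2 = x[..., : x.shape[-1] // 2], x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)

b, h, s, d = 1, 2, 6, 8
q, k = torch.randn(b, h, s, d), torch.randn(b, h, s, d)

inv_freq = 1.0 / (10000 ** (torch.arange(0, d, 2).float() / d))   # (d/2,)
freqs = torch.outer(torch.arange(s).float(), inv_freq)            # (s, d/2)
emb = torch.cat((freqs, freqs), dim=-1)                           # (s, d)
cos, sin = emb.cos()[None], emb.sin()[None]                       # (batch, s, d)

cos, sin = cos.unsqueeze(1), sin.unsqueeze(1)                     # unsqueeze_dim=1
q_embed = (q * cos) + (rotate_half(q) * sin)
k_embed = (k * cos) + (rotate_half(k) * sin)
print(q_embed.shape, k_embed.shape)  # both torch.Size([1, 2, 6, 8])
```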
187
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
188
+ """
189
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
190
+ num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
191
+ """
192
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
193
+ if n_rep == 1:
194
+ return hidden_states
195
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
196
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
197
+
198
+
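
As its docstring notes, `repeat_kv` is the expand/reshape equivalent of `torch.repeat_interleave(x, repeats=n_rep, dim=1)`; a quick equivalence check on random data:

```python
import torch

def repeat_kv(hidden_states, n_rep):
    batch, num_kv_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_kv_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_kv_heads * n_rep, slen, head_dim)

x = torch.randn(2, 4, 5, 8)  # 4 key/value heads
print(torch.equal(repeat_kv(x, 3), torch.repeat_interleave(x, repeats=3, dim=1)))  # True
```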
199
+ def eager_attention_forward(
200
+ module: nn.Module,
201
+ query: torch.Tensor,
202
+ key: torch.Tensor,
203
+ value: torch.Tensor,
204
+ attention_mask: Optional[torch.Tensor],
205
+ scaling: float,
206
+ dropout: float = 0.0,
207
+ **kwargs,
208
+ ):
209
+ key_states = repeat_kv(key, module.num_key_value_groups)
210
+ value_states = repeat_kv(value, module.num_key_value_groups)
211
+
212
+ attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
213
+ if attention_mask is not None:
214
+ causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
215
+ attn_weights = attn_weights + causal_mask
216
+
217
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
218
+ attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
219
+ attn_output = torch.matmul(attn_weights, value_states)
220
+ attn_output = attn_output.transpose(1, 2).contiguous()
221
+
222
+ return attn_output, attn_weights
223
+
224
+
225
+ def apply_rotary_pos_emb_interleave(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
226
+ r"""
227
+ Args:
228
+ q (`torch.Tensor`): The query tensor.
229
+ k (`torch.Tensor`): The key tensor.
230
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
231
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
232
+ position_ids (`torch.Tensor`):
233
+ The position indices of the tokens corresponding to the query and key tensors. For example, this can be
234
+ used to pass offsetted position ids when working with a KV-cache.
235
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
236
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
237
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
238
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
239
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
240
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
241
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
242
+ Returns:
243
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
244
+ """
245
+ cos = cos.unsqueeze(unsqueeze_dim)
246
+ sin = sin.unsqueeze(unsqueeze_dim)
247
+
248
+ b, h, s, d = q.shape
249
+ q = q.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
250
+
251
+ b, h, s, d = k.shape
252
+ k = k.view(b, h, s, d // 2, 2).transpose(4, 3).reshape(b, h, s, d)
253
+
254
+ q_embed = (q * cos) + (rotate_half(q) * sin)
255
+ k_embed = (k * cos) + (rotate_half(k) * sin)
256
+ return q_embed, k_embed
257
+
258
+
259
+ def yarn_get_mscale(scale=1, mscale=1):
260
+ if scale <= 1:
261
+ return 1.0
262
+ return 0.1 * mscale * math.log(scale) + 1.0
263
+
264
+
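
`yarn_get_mscale` grows logarithmically with the YaRN scaling factor, and the attention below applies it twice (once for queries, once for keys) on top of `qk_head_dim ** -0.5`. A small numeric check:

```python
import math

def yarn_get_mscale(scale=1, mscale=1):
    if scale <= 1:
        return 1.0
    return 0.1 * mscale * math.log(scale) + 1.0

print(yarn_get_mscale(1))           # 1.0  (no correction at or below factor 1)
print(yarn_get_mscale(4, 1))        # ~1.1386 = 0.1 * ln(4) + 1
print(yarn_get_mscale(4, 1) ** 2)   # ~1.2965, the factor multiplied into the softmax scaling
```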
265
+ class YoutuMLAttention(nn.Module):
266
+ """
267
+ Multi-head latent attention from the
268
+ 'DeepSeek-V2: A Strong, Economical,
269
+ and Efficient Mixture-of-Experts Language Model' paper.
270
+ """
271
+
272
+ def __init__(self, config: YoutuVLConfig, layer_idx: int):
273
+ super().__init__()
274
+ self.config = config
275
+ self.layer_idx = layer_idx
276
+ self.num_key_value_groups = 1 # needed for eager attentions
277
+ self.attention_dropout = config.attention_dropout
278
+ self.num_heads = config.num_attention_heads
279
+ self.rope_theta = config.rope_theta
280
+ self.q_lora_rank = config.q_lora_rank
281
+ self.qk_rope_head_dim = config.qk_rope_head_dim
282
+ self.kv_lora_rank = config.kv_lora_rank
283
+ self.v_head_dim = config.v_head_dim
284
+ self.qk_nope_head_dim = config.qk_nope_head_dim
285
+ self.qk_head_dim = config.qk_head_dim
286
+ self.flash_att_sliding_window = config.flash_att_sliding_window
287
+ self.is_causal = True
288
+
289
+ if self.q_lora_rank is None:
290
+ self.q_proj = nn.Linear(config.hidden_size, self.num_heads * self.qk_head_dim, bias=False)
291
+ else:
292
+ self.q_a_proj = nn.Linear(config.hidden_size, config.q_lora_rank, bias=config.attention_bias)
293
+ self.q_a_layernorm = YoutuRMSNorm(config.q_lora_rank)
294
+ self.q_b_proj = nn.Linear(config.q_lora_rank, self.num_heads * self.qk_head_dim, bias=False)
295
+
296
+ self.kv_a_proj_with_mqa = nn.Linear(
297
+ config.hidden_size,
298
+ self.kv_lora_rank + self.qk_rope_head_dim,
299
+ bias=config.attention_bias,
300
+ )
301
+ self.kv_a_layernorm = YoutuRMSNorm(self.kv_lora_rank)
302
+ self.kv_b_proj = nn.Linear(
303
+ self.kv_lora_rank,
304
+ self.num_heads * (self.qk_nope_head_dim + self.v_head_dim),
305
+ bias=False,
306
+ )
307
+
308
+ self.o_proj = nn.Linear(
309
+ self.num_heads * self.v_head_dim,
310
+ config.hidden_size,
311
+ bias=config.attention_bias,
312
+ )
313
+
314
+ self.scaling = self.qk_head_dim ** (-0.5)
315
+ if self.config.rope_scaling is not None:
316
+ mscale_all_dim = self.config.rope_scaling.get("mscale_all_dim", 0)
317
+ scaling_factor = self.config.rope_scaling["factor"]
318
+ if mscale_all_dim:
319
+ mscale = yarn_get_mscale(scaling_factor, mscale_all_dim)
320
+ self.scaling = self.scaling * mscale * mscale
321
+
322
+ def forward(
323
+ self,
324
+ hidden_states: torch.Tensor,
325
+ position_embeddings: Tuple[torch.Tensor, torch.Tensor],
326
+ attention_mask: Optional[torch.Tensor],
327
+ instance_length: Optional[torch.LongTensor] = None,
328
+ past_key_value: Optional[Cache] = None,
329
+ cache_position: Optional[torch.LongTensor] = None,
330
+ **kwargs: Unpack[FlashAttentionKwargs],
331
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
332
+ batch_size, seq_length = hidden_states.shape[:-1]
333
+ query_shape = (batch_size, seq_length, -1, self.qk_head_dim)
334
+ key_shape = (batch_size, seq_length, -1, self.qk_nope_head_dim + self.v_head_dim)
335
+
336
+ if self.q_lora_rank is None:
337
+ q_states = self.q_proj(hidden_states).view(query_shape).transpose(1, 2)
338
+ else:
339
+ q_states = self.q_b_proj(self.q_a_layernorm(self.q_a_proj(hidden_states))).view(query_shape).transpose(1, 2)
340
+ q_pass, q_rot = torch.split(q_states, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1)
341
+
342
+ compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
343
+ k_pass, k_rot = torch.split(compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1)
344
+
345
+ k_pass = self.kv_b_proj(self.kv_a_layernorm(k_pass)).view(key_shape).transpose(1, 2)
346
+ k_pass, value_states = torch.split(k_pass, [self.qk_nope_head_dim, self.v_head_dim], dim=-1)
347
+
348
+ k_rot = k_rot.view(batch_size, 1, seq_length, self.qk_rope_head_dim)
349
+
350
+ cos, sin = position_embeddings
351
+ if self.config.rope_interleave: # support using interleaved weights for efficiency
352
+ q_rot, k_rot = apply_rotary_pos_emb_interleave(q_rot, k_rot, cos, sin)
353
+ else:
354
+ q_rot, k_rot = apply_rotary_pos_emb(q_rot, k_rot, cos, sin)
355
+ k_rot = k_rot.expand(*k_pass.shape[:-1], -1)
356
+
357
+ query_states = torch.cat((q_pass, q_rot), dim=-1)
358
+ key_states = torch.cat((k_pass, k_rot), dim=-1)
359
+
360
+ if past_key_value is not None:
361
+ cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
362
+ key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
363
+
364
+ if self.config._attn_implementation == "flash_attention_2" and self.qk_head_dim != self.v_head_dim:
365
+ value_states = F.pad(value_states, [0, self.qk_head_dim - self.v_head_dim])
366
+
367
+ attention_interface: Callable = eager_attention_forward
368
+ if self.config._attn_implementation != "eager":
369
+ if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
370
+ logger.warning_once(
371
+ "`torch.nn.functional.scaled_dot_product_attention` does not support"
372
+ "`output_attentions=True`. Falling back to 'eager attention. This warning"
373
+ 'can be removed using the argument `attn_implementation="eager"` when loading the model.'
374
+ )
375
+ else:
376
+ attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
377
+
378
+ if instance_length is None or flash_attn_varlen_func is None:
379
+ attn_output, attn_weights = attention_interface(
380
+ self,
381
+ query_states,
382
+ key_states,
383
+ value_states,
384
+ attention_mask,
385
+ dropout=0.0 if not self.training else self.attention_dropout,
386
+ scaling=self.scaling,
387
+ **kwargs,
388
+ )
389
+ if self.config._attn_implementation == "flash_attention_2" and self.qk_head_dim != self.v_head_dim:
390
+ attn_output = attn_output[:, :, :, : self.v_head_dim]
391
+ else:
392
+ instance_length = instance_length.view(-1)
393
+ query_states = query_states.squeeze(0).transpose(0,1)
394
+ key_states = key_states.squeeze(0).transpose(0,1)
395
+ value_states = value_states.squeeze(0).transpose(0,1)
396
+ max_seqlen_in_batch = instance_length.max().item()
397
+ cu_seqlens = F.pad(torch.cumsum(instance_length, dim=0, dtype=torch.int32), (1, 0))
398
+ if is_aiter_available:
399
+ attn_output = flash_attn_varlen_func(query_states, key_states, value_states, cu_seqlens,
400
+ cu_seqlens, max_seqlen_in_batch, max_seqlen_in_batch,
401
+ dropout_p=0.0 if not self.training else self.attention_dropout,
402
+ softmax_scale=self.scaling,
403
+ causal=self.is_causal, return_lse=True)[0]
404
+ else:
405
+ attn_output = flash_attn_varlen_func(query_states, key_states, value_states, cu_seqlens,
406
+ cu_seqlens, max_seqlen_in_batch, max_seqlen_in_batch,
407
+ dropout_p=0.0 if not self.training else self.attention_dropout,
408
+ softmax_scale=self.scaling,
409
+ causal=self.is_causal)
410
+
411
+ attn_output = attn_output.unsqueeze(0)
412
+ attn_output = attn_output[:, :, :, : self.v_head_dim]
413
+ attn_weights = None
414
+
415
+ attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
416
+ attn_output = self.o_proj(attn_output)
417
+ return attn_output, attn_weights
418
+
419
+
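
In the multi-latent attention above, queries and keys carry a no-RoPE part (`qk_nope_head_dim`) plus a shared RoPE part (`qk_rope_head_dim`), while per-head keys and values are reconstructed from a compressed `kv_lora_rank` latent. A dimension-only sketch with invented sizes (the real values come from `YoutuVLConfig`):

```python
import torch
from torch import nn

# hypothetical dimensions, for shape bookkeeping only
hidden, heads = 64, 4
qk_nope, qk_rope, v_dim, kv_rank = 16, 8, 16, 32

kv_a = nn.Linear(hidden, kv_rank + qk_rope, bias=False)            # compress + shared RoPE key
kv_b = nn.Linear(kv_rank, heads * (qk_nope + v_dim), bias=False)   # decompress per head

x = torch.randn(1, 10, hidden)
compressed = kv_a(x)
k_pass, k_rot = torch.split(compressed, [kv_rank, qk_rope], dim=-1)
kv = kv_b(k_pass).view(1, 10, heads, qk_nope + v_dim).transpose(1, 2)
k_nope, v = torch.split(kv, [qk_nope, v_dim], dim=-1)
print(k_nope.shape, v.shape, k_rot.shape)
# torch.Size([1, 4, 10, 16]) torch.Size([1, 4, 10, 16]) torch.Size([1, 10, 8])
```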
420
+ class YoutuDecoderLayer(nn.Module):
421
+ def __init__(self, config: YoutuVLConfig, layer_idx: int):
422
+ super().__init__()
423
+ self.hidden_size = config.hidden_size
424
+ self.self_attn = YoutuMLAttention(config=config, layer_idx=layer_idx)
425
+ self.mlp = YoutuMLP(config)
426
+ self.input_layernorm = YoutuRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
427
+ self.post_attention_layernorm = YoutuRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
428
+
429
+ def forward(
430
+ self,
431
+ hidden_states: torch.Tensor,
432
+ attention_mask: Optional[torch.Tensor] = None,
433
+ position_ids: Optional[torch.LongTensor] = None,
434
+ past_key_value: Optional[Cache] = None,
435
+ output_attentions: Optional[bool] = False,
436
+ instance_length: Optional[torch.LongTensor] = None,
437
+ use_cache: Optional[bool] = False,
438
+ cache_position: Optional[torch.LongTensor] = None,
439
+ position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
440
+ **kwargs: Unpack[FlashAttentionKwargs],
441
+ ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
442
+ residual = hidden_states
443
+
444
+ hidden_states = self.input_layernorm(hidden_states)
445
+
446
+ hidden_states, self_attn_weights = self.self_attn(
447
+ hidden_states=hidden_states,
448
+ attention_mask=attention_mask,
449
+ position_ids=position_ids,
450
+ past_key_value=past_key_value,
451
+ output_attentions=output_attentions,
452
+ instance_length=instance_length,
453
+ use_cache=use_cache,
454
+ cache_position=cache_position,
455
+ position_embeddings=position_embeddings,
456
+ **kwargs,
457
+ )
458
+ hidden_states = residual + hidden_states
459
+
460
+ residual = hidden_states
461
+ hidden_states = self.post_attention_layernorm(hidden_states)
462
+ hidden_states = self.mlp(hidden_states)
463
+ hidden_states = residual + hidden_states
464
+
465
+ outputs = (hidden_states,)
466
+ if output_attentions:
467
+ outputs += (self_attn_weights,)
468
+
469
+ return outputs
470
+
471
+
472
+ YOUTU_VL_START_DOCSTRING = r"""
473
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
474
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
475
+ etc.)
476
+
477
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
478
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
479
+ and behavior.
480
+
481
+ Parameters:
482
+ config ([`YoutuVLConfig`]):
483
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
484
+ load the weights associated with the model, only the configuration. Check out the
485
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
486
+ """
487
+
488
+
489
+ @add_start_docstrings(
490
+ "The bare Youtu Model outputting raw hidden-states without any specific head on top.",
491
+ YOUTU_VL_START_DOCSTRING,
492
+ )
493
+ class YoutuPreTrainedModel(PreTrainedModel):
494
+ config_class = YoutuVLConfig
495
+ base_model_prefix = "model"
496
+ supports_gradient_checkpointing = True
497
+ _no_split_modules = ["YoutuDecoderLayer"]
498
+ _skip_keys_device_placement = ["past_key_values"]
499
+ _supports_flash_attn_2 = True
500
+ _supports_sdpa = True
501
+ _supports_flex_attn = True
502
+ _supports_cache_class = True
503
+ _supports_quantized_cache = True
504
+ _supports_static_cache = True
505
+ _supports_attention_backend = True
506
+
507
+ def init_weights(self):
508
+ if self.config.pruned_heads:
509
+ self.prune_heads(self.config.pruned_heads)
510
+
511
+ if "-init" in self.name_or_path:
512
+ self.apply(self._initialize_weights)
513
+
514
+ for name, module in self.named_modules():
515
+ if "o_proj" in name or "down_proj" in name:
516
+ scaled_std = self.config.initializer_range * (1.0 / self.config.num_hidden_layers) ** 0.5
517
+ module.weight.data.normal_(mean=0.0, std=scaled_std)
518
+
519
+ self.tie_weights()
520
+
521
+ def _init_weights(self, module):
522
+ std = self.config.initializer_range
523
+ embedding_std = self.config.embedding_initializer_range
524
+ if isinstance(module, nn.Linear):
525
+ module.weight.data.normal_(mean=0.0, std=std)
526
+ if module.bias is not None:
527
+ module.bias.data.zero_()
528
+ elif isinstance(module, nn.Embedding):
529
+ module.weight.data.normal_(mean=0.0, std=embedding_std)
530
+ if module.padding_idx is not None:
531
+ module.weight.data[module.padding_idx].zero_()
532
+ elif isinstance(module, nn.Parameter):
533
+ module.data.normal_(mean=0.0, std=std)
534
+ elif isinstance(module, YoutuRMSNorm):
535
+ module.weight.data.fill_(1.0)
536
+
537
+
538
+ YOUTU_VL_INPUTS_DOCSTRING = r"""
539
+ Args:
540
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
541
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
542
+ it.
543
+
544
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
545
+ [`PreTrainedTokenizer.__call__`] for details.
546
+
547
+ [What are input IDs?](../glossary#input-ids)
548
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
549
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
550
+
551
+ - 1 for tokens that are **not masked**,
552
+ - 0 for tokens that are **masked**.
553
+
554
+ [What are attention masks?](../glossary#attention-mask)
555
+
556
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
557
+ [`PreTrainedTokenizer.__call__`] for details.
558
+
559
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
560
+ `past_key_values`).
561
+
562
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
563
+ and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
564
+ information on the default strategy.
565
+
566
+ - 1 indicates the head is **not masked**,
567
+ - 0 indicates the head is **masked**.
568
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
569
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
570
+ config.n_positions - 1]`.
571
+
572
+ [What are position IDs?](../glossary#position-ids)
573
+ past_key_values (`Cache`, *optional*):
574
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
575
+ blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
576
+ returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
577
+
578
+ It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
579
+
580
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
581
+ have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
582
+ of shape `(batch_size, sequence_length)`.
583
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
584
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
585
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
586
+ model's internal embedding lookup matrix.
587
+ use_cache (`bool`, *optional*):
588
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
589
+ `past_key_values`).
590
+ output_attentions (`bool`, *optional*):
591
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
592
+ tensors for more detail.
593
+ output_hidden_states (`bool`, *optional*):
594
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
595
+ more detail.
596
+ return_dict (`bool`, *optional*):
597
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
598
+ cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
599
+ Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
600
+ this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
601
+ the complete sequence length.
602
+ """
603
+
604
+
605
+ @add_start_docstrings(
606
+ "The bare Youtu Model outputting raw hidden-states without any specific head on top.",
607
+ YOUTU_VL_START_DOCSTRING,
608
+ )
609
+ class YoutuModel(YoutuPreTrainedModel):
610
+ _keys_to_ignore_on_load_unexpected = [r"model\.layers\.61.*"]
611
+
612
+ def __init__(self, config: YoutuVLConfig):
613
+ super().__init__(config)
614
+ self.padding_idx = config.pad_token_id
615
+ self.vocab_size = config.vocab_size
616
+
617
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
618
+ self.layers = nn.ModuleList(
619
+ [YoutuDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
620
+ )
621
+ self.norm = YoutuRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
622
+ self.rotary_emb = YoutuRotaryEmbedding(config=config)
623
+ self.gradient_checkpointing = False
624
+
625
+ # Initialize weights and apply final processing
626
+ self.post_init()
627
+
628
+ def get_input_embeddings(self):
629
+ return self.embed_tokens
630
+
631
+ def set_input_embeddings(self, value):
632
+ self.embed_tokens = value
633
+
634
+ @can_return_tuple
635
+ @add_start_docstrings_to_model_forward(YOUTU_VL_INPUTS_DOCSTRING)
636
+ def forward(
637
+ self,
638
+ input_ids: Optional[torch.LongTensor] = None,
639
+ attention_mask: Optional[torch.Tensor] = None,
640
+ position_ids: Optional[torch.LongTensor] = None,
641
+ past_key_values: Optional[Cache] = None,
642
+ inputs_embeds: Optional[torch.FloatTensor] = None,
643
+ use_cache: Optional[bool] = None,
644
+ instance_length: Optional[torch.LongTensor] = None,
645
+ output_attentions: Optional[bool] = None,
646
+ output_hidden_states: Optional[bool] = None,
647
+ cache_position: Optional[torch.LongTensor] = None,
648
+ **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
649
+ ) -> BaseModelOutputWithPast:
650
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
651
+ output_hidden_states = (
652
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
653
+ )
654
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
655
+
656
+ if (input_ids is None) ^ (inputs_embeds is not None):
657
+ raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
658
+
659
+ if inputs_embeds is None:
660
+ inputs_embeds = self.embed_tokens(input_ids)
661
+
662
+ if use_cache and past_key_values is None:
663
+ past_key_values = DynamicCache()
664
+
665
+ if cache_position is None:
666
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
667
+ cache_position = torch.arange(
668
+ past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
669
+ )
670
+
671
+ if position_ids is None:
672
+ position_ids = cache_position.unsqueeze(0)
673
+
674
+ causal_mask = self._update_causal_mask(
675
+ attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
676
+ )
677
+
678
+ hidden_states = inputs_embeds
679
+ position_embeddings = self.rotary_emb(hidden_states, position_ids)
680
+
681
+ all_hidden_states = () if output_hidden_states else None
682
+ all_self_attns = () if output_attentions else None
683
+
684
+ for decoder_layer in self.layers[: self.config.num_hidden_layers]:
685
+ if output_hidden_states:
686
+ all_hidden_states += (hidden_states,)
687
+ layer_outputs = decoder_layer(
688
+ hidden_states,
689
+ attention_mask=causal_mask,
690
+ position_ids=position_ids,
691
+ past_key_value=past_key_values,
692
+ output_attentions=output_attentions,
693
+ instance_length=instance_length,
694
+ use_cache=use_cache,
695
+ cache_position=cache_position,
696
+ position_embeddings=position_embeddings,
697
+ **flash_attn_kwargs,
698
+ )
699
+ hidden_states = layer_outputs[0]
700
+ if output_attentions:
701
+ all_self_attns += (layer_outputs[1],)
702
+
703
+ hidden_states = self.norm(hidden_states)
704
+
705
+ if output_hidden_states:
706
+ all_hidden_states += (hidden_states,)
707
+
708
+ return BaseModelOutputWithPast(
709
+ last_hidden_state=hidden_states,
710
+ past_key_values=past_key_values if use_cache else None,
711
+ hidden_states=all_hidden_states,
712
+ attentions=all_self_attns,
713
+ )
714
+
715
+ def _update_causal_mask(
716
+ self,
717
+ attention_mask: torch.Tensor,
718
+ input_tensor: torch.Tensor,
719
+ cache_position: torch.Tensor,
720
+ past_key_values: Cache,
721
+ output_attentions: bool = False,
722
+ ):
723
+ if self.config._attn_implementation == "flash_attention_2":
724
+ if attention_mask is not None and (attention_mask == 0.0).any():
725
+ return attention_mask
726
+ return None
727
+
728
+ if self.config._attn_implementation == "flex_attention":
729
+ if isinstance(attention_mask, torch.Tensor):
730
+ attention_mask = make_flex_block_causal_mask(attention_mask)
731
+ if isinstance(attention_mask, BlockMask):
732
+ return attention_mask
733
+
734
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
735
+ using_static_cache = isinstance(past_key_values, StaticCache)
736
+
737
+ if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
738
+ if AttentionMaskConverter._ignore_causal_mask_sdpa(
739
+ attention_mask,
740
+ inputs_embeds=input_tensor,
741
+ past_key_values_length=past_seen_tokens,
742
+ is_training=self.training,
743
+ ):
744
+ return None
745
+
746
+ dtype, device = input_tensor.dtype, input_tensor.device
747
+ sequence_length = input_tensor.shape[1]
748
+ if using_static_cache:
749
+ target_length = past_key_values.get_max_cache_shape()
750
+ else:
751
+ target_length = (
752
+ attention_mask.shape[-1]
753
+ if isinstance(attention_mask, torch.Tensor)
754
+ else past_seen_tokens + sequence_length + 1
755
+ )
756
+
757
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
758
+ attention_mask,
759
+ sequence_length=sequence_length,
760
+ target_length=target_length,
761
+ dtype=dtype,
762
+ device=device,
763
+ cache_position=cache_position,
764
+ batch_size=input_tensor.shape[0],
765
+ )
766
+
767
+ if (
768
+ self.config._attn_implementation == "sdpa"
769
+ and attention_mask is not None
770
+ and attention_mask.device.type in ["cuda", "xpu"]
771
+ and not output_attentions
772
+ ):
773
+ min_dtype = torch.finfo(dtype).min
774
+ causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
775
+
776
+ return causal_mask
777
+
778
+ @staticmethod
779
+ def _prepare_4d_causal_attention_mask_with_cache_position(
780
+ attention_mask: torch.Tensor,
781
+ sequence_length: int,
782
+ target_length: int,
783
+ dtype: torch.dtype,
784
+ device: torch.device,
785
+ cache_position: torch.Tensor,
786
+ batch_size: int,
787
+ **kwargs,
788
+ ):
789
+ """
790
+ Args:
791
+ attention_mask (`torch.Tensor`):
792
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
793
+ `(batch_size, 1, query_length, key_value_length)`.
794
+ sequence_length (`int`):
795
+ The sequence length being processed.
796
+ target_length (`int`):
797
+ The target length: when generating with static cache, the mask should be as long as the static cache,
798
+ to account for the 0 padding, the part of the cache that is not filled yet.
799
+ dtype (`torch.dtype`):
800
+ The dtype to use for the 4D attention mask.
801
+ device (`torch.device`):
802
+ The device to place the 4D attention mask on.
803
+ cache_position (`torch.Tensor`):
804
+ Indices depicting the position of the input sequence tokens in the sequence.
805
+ batch_size (`torch.Tensor`):
806
+ Batch size.
807
+ """
808
+ if attention_mask is not None and attention_mask.dim() == 4:
809
+ causal_mask = attention_mask
810
+ else:
811
+ min_dtype = torch.finfo(dtype).min
812
+ causal_mask = torch.full(
813
+ (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
814
+ )
815
+ if sequence_length != 1:
816
+ causal_mask = torch.triu(causal_mask, diagonal=1)
817
+ causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
818
+ causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
819
+ if attention_mask is not None:
820
+ causal_mask = causal_mask.clone()
821
+ mask_length = attention_mask.shape[-1]
822
+ padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
823
+ causal_mask.device
824
+ )
825
+ padding_mask = padding_mask == 0
826
+ causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
827
+ padding_mask, min_dtype
828
+ )
829
+
830
+ return causal_mask
831
+
832
+
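
`_prepare_4d_causal_attention_mask_with_cache_position` builds an additive `(batch, 1, query_len, target_len)` mask whose blocked positions hold the dtype minimum. A small reproduction of the triu/cache_position construction (two cached tokens, three new ones):

```python
import torch

seq_len, target_len = 3, 5                  # 3 new tokens, 2 already cached
dtype = torch.float32
min_val = torch.finfo(dtype).min
cache_position = torch.arange(2, 5)         # absolute positions of the new tokens

mask = torch.full((seq_len, target_len), min_val, dtype=dtype)
mask = torch.triu(mask, diagonal=1)
mask *= torch.arange(target_len) > cache_position.reshape(-1, 1)
print((mask == 0).int())
# tensor([[1, 1, 1, 0, 0],
#         [1, 1, 1, 1, 0],
#         [1, 1, 1, 1, 1]])
```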
833
+ class KwargsForCausalLM(FlashAttentionKwargs): ...
834
+
835
+
836
+ class YoutuForCausalLM(YoutuPreTrainedModel, GenerationMixin):
837
+ _tied_weights_keys = ["lm_head.weight"]
838
+ _tp_plan = {"lm_head": "colwise_rep"}
839
+ _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
840
+
841
+ def __init__(self, config):
842
+ super().__init__(config)
843
+
844
+ self.model = YoutuModel(config)
845
+ self.vocab_size = config.vocab_size
846
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
847
+
848
+ self.post_init()
849
+
850
+ def get_input_embeddings(self):
851
+ return self.model.embed_tokens
852
+
853
+ def set_input_embeddings(self, value):
854
+ self.model.embed_tokens = value
855
+
856
+ def get_output_embeddings(self):
857
+ return self.lm_head
858
+
859
+ def set_output_embeddings(self, new_embeddings):
860
+ self.lm_head = new_embeddings
861
+
862
+ def set_decoder(self, decoder):
863
+ self.model = decoder
864
+
865
+ def get_decoder(self):
866
+ return self.model
867
+
868
+ def get_merge_embedding(self, inputs_embeds, image_embeds, image_mask, **kwargs):
869
+ bs, length, dim_size = inputs_embeds.shape
870
+ if image_embeds is None:
871
+ return inputs_embeds
872
+ if bs == 1:
873
+ image_embeds = image_embeds.unsqueeze(0)
874
+ init_inputs_embeds = inputs_embeds.clone()
875
+ inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
876
+ cmp_mask = torch.isclose(init_inputs_embeds, inputs_embeds, rtol=1e-05, atol=1e-08)
877
+ else:
878
+ raise ValueError("get_merge_embedding only supports batch_size == 1")
879
+
880
+ return inputs_embeds
881
+ @can_return_tuple
882
+ @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
883
+ @add_start_docstrings_to_model_forward(YOUTU_VL_INPUTS_DOCSTRING)
884
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
885
+ def forward(
886
+ self,
887
+ input_ids: Optional[torch.LongTensor] = None,
888
+ attention_mask: Optional[torch.Tensor] = None,
889
+ position_ids: Optional[torch.LongTensor] = None,
890
+ past_key_values: Optional[Cache] = None,
891
+ inputs_embeds: Optional[torch.FloatTensor] = None,
892
+ labels: Optional[torch.LongTensor] = None,
893
+ use_cache: Optional[bool] = None,
894
+ output_attentions: Optional[bool] = None,
895
+ output_hidden_states: Optional[bool] = None,
896
+ cache_position: Optional[torch.LongTensor] = None,
897
+ logits_to_keep: Union[int, torch.Tensor] = 0,
898
+ **kwargs: Unpack[KwargsForCausalLM],
899
+ ) -> CausalLMOutputWithPast:
900
+ r"""
901
+ Returns:
902
+
903
+ """
904
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
905
+ output_hidden_states = (
906
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
907
+ )
908
+
909
+ outputs: BaseModelOutputWithPast = self.model(
910
+ input_ids=input_ids,
911
+ attention_mask=attention_mask,
912
+ position_ids=position_ids,
913
+ past_key_values=past_key_values,
914
+ inputs_embeds=inputs_embeds,
915
+ use_cache=use_cache,
916
+ output_attentions=output_attentions,
917
+ output_hidden_states=output_hidden_states,
918
+ cache_position=cache_position,
919
+ **kwargs,
920
+ )
921
+
922
+ hidden_states = outputs.last_hidden_state
923
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
924
+ logits = self.lm_head(hidden_states[:, slice_indices, :])
925
+
926
+ loss = None
927
+
928
+ return CausalLMOutputWithPast(
929
+ loss=loss,
930
+ logits=logits,
931
+ past_key_values=outputs.past_key_values,
932
+ hidden_states=outputs.hidden_states,
933
+ attentions=outputs.attentions,
934
+ )
935
+
936
+ class VLPatchMerger(nn.Module):
937
+ def __init__(self, dim: int, context_dim: int, spatial_merge_size: int = 2) -> None:
938
+ super().__init__()
939
+ self.hidden_size = context_dim * (spatial_merge_size**2)
940
+ self.ln_q = YoutuRMSNorm(context_dim, eps=1e-06)
941
+ self.mlp = nn.Sequential(
942
+ nn.Linear(self.hidden_size, self.hidden_size),
943
+ nn.GELU(),
944
+ nn.Linear(self.hidden_size, dim),
945
+ )
946
+
947
+ def forward(self, x: torch.Tensor, spatial_shapes: torch.Tensor) -> torch.Tensor:
948
+ x = self.ln_q(x).view(-1, self.hidden_size)
949
+ x = self.mlp(x)
950
+ return x
951
+
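
`VLPatchMerger` folds every `spatial_merge_size ** 2` neighbouring vision tokens into a single language-model token: after RMSNorm the features are regrouped to width `context_dim * 4` and projected to the LM hidden size. A shape-only sketch with invented sizes:

```python
import torch
from torch import nn

context_dim, dim, merge = 32, 64, 2        # hypothetical sizes
hidden_size = context_dim * merge**2       # 128: four patches concatenated

mlp = nn.Sequential(
    nn.Linear(hidden_size, hidden_size),
    nn.GELU(),
    nn.Linear(hidden_size, dim),
)

vision_tokens = torch.randn(16, context_dim)        # 16 patches from the vision tower
merged = mlp(vision_tokens.view(-1, hidden_size))   # groups of 4 patches -> 4 LM tokens
print(merged.shape)  # torch.Size([4, 64])
```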
952
+ class YoutuVLForConditionalGeneration(YoutuPreTrainedModel, GenerationMixin):
953
+ _tied_weights_keys = ["lm_head.weight"]
954
+ _tp_plan = {"lm_head": "colwise_rep"}
955
+ _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
956
+
957
+ def __init__(self, config):
958
+ super().__init__(config)
959
+
960
+ config.vision_config.out_hidden_size = config.hidden_size
961
+ config.vision_config.vision_use_head = False
962
+ self.siglip2 = Siglip2VisionModel._from_config(config.vision_config)
963
+ self.merger = VLPatchMerger(
964
+ dim=config.hidden_size,
965
+ context_dim=config.vision_config.hidden_size,
966
+ spatial_merge_size=2,
967
+ )
968
+ self.rope_deltas = None
969
+
970
+ self.model = YoutuModel(config)
971
+ self.vocab_size = config.vocab_size
972
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
973
+ self.first_logits = None
974
+
975
+ self.post_init()
976
+
977
+
978
+ def get_input_embeddings(self):
979
+ return self.model.embed_tokens
980
+
981
+ def set_input_embeddings(self, value):
982
+ self.model.embed_tokens = value
983
+
984
+ def get_output_embeddings(self):
985
+ return self.lm_head
986
+
987
+ def set_output_embeddings(self, new_embeddings):
988
+ self.lm_head = new_embeddings
989
+
990
+ def set_decoder(self, decoder):
991
+ self.model = decoder
992
+
993
+ def get_decoder(self):
994
+ return self.model
995
+
996
+ def get_input_idx_embeddings(self, input_ids):
997
+ inputs_embeds = self.model.embed_tokens(input_ids)
998
+ return inputs_embeds
999
+
1000
+ def get_visiual_features(self, pixel_values, pixel_attention_mask, spatial_shapes):
1001
+ pixel_values = pixel_values.type(self.siglip2.dtype)
1002
+
1003
+ # Extract image embeddings via vision model
1004
+ image_embeds = self.siglip2(pixel_values, pixel_attention_mask, spatial_shapes).last_hidden_state
1005
+ # Merge image features with the output of vision model
1006
+ image_embeds = self.merger(image_embeds, spatial_shapes)
1007
+
1008
+ return image_embeds
1009
+
1010
+
1011
+ def get_merge_embedding(self, inputs_embeds, image_embeds, image_mask, **kwargs):
1012
+ """
1013
+ Merge text embeddings with image embeddings using the provided mask.
1014
+
1015
+ Args:
1016
+ inputs_embeds: Text input embeddings
1017
+ image_embeds: Image embeddings to merge
1018
+ image_mask: Mask indicating where to place image embeddings
1019
+ **kwargs: Additional keyword arguments
1020
+
1021
+ Returns:
1022
+ Merged embeddings with image features integrated
1023
+ """
1024
+ bs, length, dim_size = inputs_embeds.shape
1025
+ if image_embeds is None:
1026
+ return inputs_embeds
1027
+ if bs == 1:
1028
+ image_embeds = image_embeds.unsqueeze(0)
1029
+ init_inputs_embeds = inputs_embeds.clone()
1030
+ inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
1031
+ cmp_mask = torch.isclose(init_inputs_embeds, inputs_embeds, rtol=1e-05, atol=1e-08)
1032
+ else:
1033
+ # batch sizes other than 1 are not supported when merging image embeddings
1034
+ raise ValueError("get_merge_embedding only supports batch_size == 1")
1035
+
1036
+ return inputs_embeds
1037
+
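
`get_merge_embedding` drops the projected image features into the positions flagged by the image-token mask via `masked_scatter`, so the number of masked positions must equal the number of image embedding rows. A toy illustration (hypothetical placeholder id and sizes):

```python
import torch

IMAGE_TOKEN_ID = 9  # hypothetical placeholder id
input_ids = torch.tensor([[1, 9, 9, 2]])
inputs_embeds = torch.zeros(1, 4, 3)
image_embeds = torch.arange(6, dtype=torch.float32).view(2, 3)  # one row per image token

image_mask = (input_ids == IMAGE_TOKEN_ID).unsqueeze(-1).expand_as(inputs_embeds)
merged = inputs_embeds.masked_scatter(image_mask, image_embeds.unsqueeze(0))
print(merged[0])  # rows 1 and 2 now carry the image features, rows 0 and 3 stay text
```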
1038
+ @can_return_tuple
1039
+ @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
1040
+ @add_start_docstrings_to_model_forward(YOUTU_VL_INPUTS_DOCSTRING)
1041
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1042
+ def forward(
1043
+ self,
1044
+ input_ids: Optional[torch.LongTensor] = None,
1045
+ attention_mask: Optional[torch.Tensor] = None,
1046
+ position_ids: Optional[torch.LongTensor] = None,
1047
+ past_key_values: Optional[Cache] = None,
1048
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1049
+ labels: Optional[torch.LongTensor] = None,
1050
+ use_cache: Optional[bool] = None,
1051
+ output_attentions: Optional[bool] = None,
1052
+ output_hidden_states: Optional[bool] = None,
1053
+ pixel_values: Optional[torch.Tensor] = None,
1054
+ pixel_attention_mask: Optional[torch.LongTensor] = None,
1055
+ spatial_shapes: Optional[torch.LongTensor] = None,
1056
+ instance_length: Optional[torch.LongTensor] = None,
1057
+ coefficients: Optional[torch.FloatTensor] = None,
1058
+ rope_deltas: Optional[torch.LongTensor] = None,
1059
+ cache_position: Optional[torch.LongTensor] = None,
1060
+ logits_to_keep: Union[int, torch.Tensor] = 0,
1061
+ **kwargs: Unpack[KwargsForCausalLM],
1062
+ ) -> CausalLMOutputWithPast:
1063
+ r"""
1064
+ Example:
1065
+ TODO: Add example
1066
+
1067
+ Returns:
1068
+ """
1069
+
1070
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1071
+ output_hidden_states = (
1072
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1073
+ )
1074
+
1075
+ if inputs_embeds is None:
1076
+ inputs_embeds = self.model.embed_tokens(input_ids)
1077
+
1078
+ if pixel_values is not None:
1079
+ bs, length, dim_size = inputs_embeds.shape
1080
+ pixel_values = pixel_values.type(self.siglip2.dtype)
1081
+
1082
+ image_embeds = self.siglip2(pixel_values, pixel_attention_mask, spatial_shapes).last_hidden_state
1083
+ image_embeds = self.merger(image_embeds, spatial_shapes)
1084
+
1085
+ n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
1086
+ n_image_features = image_embeds.shape[0]
1087
+
1088
+ if n_image_tokens > n_image_features:
1089
+ raise ValueError(
1090
+ "Image features and image tokens do not match: tokens: {}, features {}".format(
1091
+ n_image_tokens, n_image_features
1092
+ )
1093
+ )
1094
+
1095
+ mask = input_ids == self.config.image_token_id
1096
+ mask_unsqueezed = mask.unsqueeze(-1)
1097
+ mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
1098
+ image_mask = mask_expanded.to(inputs_embeds.device)
1099
+ image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
1100
+
1101
+ if bs != 1:
1102
+ raise ValueError("Only support batch size = 1")
1103
+
1104
+ image_embeds = image_embeds.unsqueeze(0)
1105
+ inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
1106
+
1107
+ if attention_mask is not None:
1108
+ attention_mask = attention_mask.to(inputs_embeds.device)
1109
+
1110
+ outputs: BaseModelOutputWithPast = self.model(
1111
+ input_ids=None,
1112
+ attention_mask=attention_mask,
1113
+ position_ids=position_ids,
1114
+ past_key_values=past_key_values,
1115
+ inputs_embeds=inputs_embeds,
1116
+ use_cache=use_cache,
1117
+ output_attentions=output_attentions,
1118
+ output_hidden_states=output_hidden_states,
1119
+ cache_position=cache_position,
1120
+ instance_length=instance_length,
1121
+ **kwargs,
1122
+ )
1123
+
1124
+ hidden_states = outputs.last_hidden_state
1125
+ logits = self.lm_head(hidden_states)
1126
+ if logits.shape[1] != 1:
1127
+ self.first_logits = logits
1128
+ loss = None
1129
+
1130
+ return CausalLMOutputWithPast(
1131
+ loss=loss,
1132
+ logits=logits,
1133
+ past_key_values=outputs.past_key_values,
1134
+ hidden_states=outputs.hidden_states,
1135
+ attentions=outputs.attentions,
1136
+ )
1137
+ def truncate_past_key_values(
1138
+ self,
1139
+ past_key_values: Optional[DynamicCache],
1140
+ num_history: int
1141
+ ) -> Optional[DynamicCache]:
1142
+ """Truncate past_key_values to specified history length in-place.
1143
+
1144
+ Args:
1145
+ past_key_values: Cache object to truncate
1146
+ num_history: Target history length to keep
1147
+
1148
+ Returns:
1149
+ Truncated cache object or None if input is None
1150
+ """
1151
+ if past_key_values is None:
1152
+ return None
1153
+
1154
+ current_length = past_key_values.get_seq_length()
1155
+ if current_length <= num_history:
1156
+ return past_key_values
1157
+
1158
+ for layer_idx in range(len(past_key_values.key_cache)):
1159
+ if past_key_values.key_cache[layer_idx] is not None:
1160
+ past_key_values.key_cache[layer_idx] = (
1161
+ past_key_values.key_cache[layer_idx][:, :, :num_history, :].contiguous()
1162
+ )
1163
+ past_key_values.value_cache[layer_idx] = (
1164
+ past_key_values.value_cache[layer_idx][:, :, :num_history, :].contiguous()
1165
+ )
1166
+
1167
+ return past_key_values
1168
+
1169
+ def clone_past_key_values(
1170
+ self,
1171
+ past_key_values: Optional[DynamicCache]
1172
+ ) -> Optional[DynamicCache]:
1173
+ """Deep copy past_key_values to avoid shared reference issues.
1174
+
1175
+ Args:
1176
+ past_key_values: Cache object to clone
1177
+
1178
+ Returns:
1179
+ Deep copied cache object or None if input is None
1180
+ """
1181
+ if past_key_values is None:
1182
+ return None
1183
+
1184
+ new_cache = DynamicCache()
1185
+ for layer_idx in range(len(past_key_values.key_cache)):
1186
+ if past_key_values.key_cache[layer_idx] is not None:
1187
+ new_cache.key_cache.append(past_key_values.key_cache[layer_idx].clone())
1188
+ new_cache.value_cache.append(past_key_values.value_cache[layer_idx].clone())
1189
+
1190
+ return new_cache
1191
+
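
`clone_past_key_values` and `truncate_past_key_values` operate directly on the per-layer `key_cache`/`value_cache` lists that these helpers assume `DynamicCache` exposes. A standalone sketch of both operations on a tiny single-layer cache:

```python
import torch
from transformers.cache_utils import DynamicCache

cache = DynamicCache()
cache.update(torch.randn(1, 2, 10, 4), torch.randn(1, 2, 10, 4), layer_idx=0)
print(cache.get_seq_length())  # 10

# clone: deep copy so a speculative decoding pass cannot corrupt the original
cloned = DynamicCache()
for k, v in zip(cache.key_cache, cache.value_cache):
    cloned.key_cache.append(k.clone())
    cloned.value_cache.append(v.clone())

# truncate: keep only the first num_history cached positions
num_history = 6
for i in range(len(cloned.key_cache)):
    cloned.key_cache[i] = cloned.key_cache[i][:, :, :num_history, :].contiguous()
    cloned.value_cache[i] = cloned.value_cache[i][:, :, :num_history, :].contiguous()
print(cloned.get_seq_length(), cache.get_seq_length())  # 6 10
```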
1192
+ def concat_token_ids(
1193
+ self,
1194
+ input_ids: torch.Tensor,
1195
+ concat_ids: Optional[List[int]]
1196
+ ) -> torch.Tensor:
1197
+ """Concatenate additional token IDs to input sequence.
1198
+
1199
+ Args:
1200
+ input_ids: Original input token IDs of shape (batch_size, seq_len)
1201
+ concat_ids: Token IDs to concatenate
1202
+
1203
+ Returns:
1204
+ Concatenated token IDs tensor
1205
+ """
1206
+ if concat_ids is None:
1207
+ return input_ids
1208
+
1209
+ num_gen = len(concat_ids)
1210
+ if num_gen < 2:
1211
+ return input_ids
1212
+
1213
+ batch_size = input_ids.size(0)
1214
+ concat_token_tensor = torch.tensor(
1215
+ concat_ids,
1216
+ dtype=input_ids.dtype,
1217
+ device=input_ids.device
1218
+ )
1219
+ concat_tokens = concat_token_tensor.unsqueeze(0).repeat(batch_size, 1)
1220
+ new_input_ids = torch.cat([input_ids, concat_tokens], dim=1)
1221
+
1222
+ return new_input_ids
1223
+
1224
+ def create_causal_mask_for_kv_cache(
1225
+ self,
1226
+ kv_cache_len: int,
1227
+ num_new_tokens: int,
1228
+ device: torch.device,
1229
+ dtype: torch.dtype = torch.bfloat16
1230
+ ) -> torch.Tensor:
1231
+ """Create causal attention mask for KV cache usage.
1232
+
1233
+ Each new token can only see:
1234
+ 1. All content in KV cache (positions 0 to kv_cache_len-1)
1235
+ 2. Previous new tokens and itself (causal masking)
1236
+
1237
+ Args:
1238
+ kv_cache_len: Length of existing sequence in KV cache
1239
+ num_new_tokens: Number of new tokens being added
1240
+ device: Target device for tensor allocation
1241
+ dtype: Data type for the mask tensor
1242
+
1243
+ Returns:
1244
+ Attention mask of shape (1, 1, num_new_tokens, kv_cache_len + num_new_tokens)
1245
+ """
1246
+ total_len = kv_cache_len + num_new_tokens
1247
+ min_val = torch.finfo(dtype).min
1248
+
1249
+ # Initialize mask with min_val (masked positions)
1250
+ mask = torch.full((num_new_tokens, total_len), min_val, device=device, dtype=dtype)
1251
+
1252
+ # Set visible positions to 0
1253
+ for i in range(num_new_tokens):
1254
+ if kv_cache_len > 0:
1255
+ mask[i, :kv_cache_len] = 0
1256
+ mask[i, kv_cache_len:kv_cache_len + i + 1] = 0
1257
+
1258
+ return mask.unsqueeze(0).unsqueeze(0)
1259
+
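
For example, with two cached tokens and three new tokens, `create_causal_mask_for_kv_cache` yields the same staircase of visible (zero) positions as the generic builder above, just computed with an explicit loop:

```python
import torch

kv_len, num_new = 2, 3
min_val = torch.finfo(torch.float32).min
mask = torch.full((num_new, kv_len + num_new), min_val)
for i in range(num_new):
    mask[i, :kv_len] = 0                  # everything already in the cache
    mask[i, kv_len:kv_len + i + 1] = 0    # earlier new tokens and itself
print((mask == 0).int())
# tensor([[1, 1, 1, 0, 0],
#         [1, 1, 1, 1, 0],
#         [1, 1, 1, 1, 1]])
```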
1260
+ def create_4d_causal_mask(
1261
+ self,
1262
+ seq_len: int,
1263
+ device: torch.device,
1264
+ dtype: torch.dtype = torch.bfloat16
1265
+ ) -> torch.Tensor:
1266
+ """Create complete 4D causal attention mask for initial decoding.
1267
+
1268
+ Args:
1269
+ seq_len: Sequence length
1270
+ device: Target device for tensor allocation
1271
+ dtype: Data type for the mask tensor
1272
+
1273
+ Returns:
1274
+ Causal attention mask of shape (1, 1, seq_len, seq_len)
1275
+ """
1276
+ min_val = torch.finfo(dtype).min
1277
+
1278
+ # Create lower triangular causal mask
1279
+ mask = torch.full((seq_len, seq_len), min_val, device=device, dtype=dtype)
1280
+ mask = torch.triu(mask, diagonal=1)
1281
+
1282
+ return mask.unsqueeze(0).unsqueeze(0)
1283
+
1284
+ def _first_decoder(
1285
+ self,
1286
+ new_input_ids: torch.Tensor,
1287
+ past_key_values: Optional[DynamicCache] = None,
1288
+ image_embeds: Optional[torch.Tensor] = None,
1289
+ image_mask: Optional[torch.Tensor] = None,
1290
+ num_gen: int = 32
1291
+ ) -> Tuple[torch.Tensor, Any]:
1292
+ """Execute decoder pass with causal attention masking.
1293
+
1294
+ This method performs a single decoder pass with optimized attention masking.
1295
+ On the first decoding step (when past_key_values is None), it processes image
1296
+ embeddings and merges them with text embeddings.
1297
+
1298
+ Args:
1299
+ new_input_ids: Input token IDs of shape (batch_size, seq_len)
1300
+ past_key_values: Cached key-value pairs from previous decoding steps
1301
+ image_embeds: Image embeddings to merge (only used in first step)
1302
+ image_mask: Mask indicating positions for image embedding placement
1303
+ num_gen: Number of tokens to generate in parallel
1304
+
1305
+ Returns:
1306
+ Tuple containing:
1307
+ - predicted_token_ids: Predicted token IDs of shape (batch_size, num_gen)
1308
+ - outputs: Model outputs including logits and updated cache
1309
+ """
1310
+ # Get current sequence position
1311
+ start_position = past_key_values.get_seq_length() if past_key_values is not None else 0
1312
+ batch_size, seq_len = new_input_ids.shape
1313
+
1314
+ # Create position IDs directly on GPU to avoid CPU-GPU transfer
1315
+ position_ids = torch.arange(
1316
+ start_position,
1317
+ start_position + seq_len,
1318
+ dtype=torch.long,
1319
+ device=new_input_ids.device
1320
+ ).unsqueeze(0)
1321
+
1322
+ # Process image embeddings only on first decoding step
1323
+ inputs_embeds = None
1324
+ if start_position == 0:
1325
+ inputs_embeds = self.get_input_idx_embeddings(new_input_ids)
1326
+ if image_embeds is not None:
1327
+ inputs_embeds = self.get_merge_embedding(inputs_embeds, image_embeds, image_mask)
1328
+
1329
+ # Create 4D causal attention mask
1330
+ attention_mask = None
1331
+ if start_position > 0 and seq_len > 0:
1332
+ # When using KV cache, create mask for new tokens
1333
+ attention_mask = self.create_causal_mask_for_kv_cache(
1334
+ start_position, seq_len, new_input_ids.device, dtype=torch.bfloat16
1335
+ )
1336
+ elif start_position == 0 and seq_len > 0:
1337
+ # First decoding, create complete causal mask
1338
+ attention_mask = self.create_4d_causal_mask(
1339
+ seq_len, new_input_ids.device, dtype=torch.bfloat16
1340
+ )
1341
+
1342
+ with torch.no_grad():
1343
+ if start_position > 0:
1344
+ outputs = self.forward(
1345
+ input_ids=new_input_ids,
1346
+ inputs_embeds=None,
1347
+ attention_mask=None, # Note: attention_mask currently disabled
1348
+ position_ids=position_ids,
1349
+ use_cache=True,
1350
+ cache_position=True,
1351
+ past_key_values=past_key_values,
1352
+ )
1353
+ else:
1354
+ outputs = self.forward(
1355
+ input_ids=None,
1356
+ inputs_embeds=inputs_embeds,
1357
+ attention_mask=None, # Note: attention_mask currently disabled
1358
+ position_ids=position_ids,
1359
+ use_cache=True,
1360
+ cache_position=True,
1361
+ past_key_values=past_key_values,
1362
+ )
1363
+
1364
+ # Extract predicted token IDs from logits
1365
+ predicted_token_ids = outputs.logits[:, -(num_gen + 1):-1].argmax(dim=-1)
1366
+
1367
+ return predicted_token_ids, outputs
1368
+
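
`_first_decoder` appends `num_gen` placeholder tokens and reads their predictions one step earlier in the logits: position `-(num_gen + 1)` predicts the first placeholder, and the final position is dropped because it would predict the token after the block. A tiny index check:

```python
import torch

seq_len, num_gen, vocab = 10, 4, 50
logits = torch.randn(1, seq_len, vocab)

# Logits at position t predict token t + 1, so this slice yields exactly
# one prediction per appended placeholder token.
pred = logits[:, -(num_gen + 1):-1].argmax(dim=-1)
print(pred.shape)  # torch.Size([1, 4])
```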
1369
+ def generate_parallel_decoder(
1370
+ self,
1371
+ inputs: Dict[str, torch.Tensor],
1372
+ image_embeds: torch.Tensor,
1373
+ mask_token_id: int,
1374
+ max_new_tokens: int = 8192,
1375
+ num_gen: int = 64,
1376
+ verbose: bool = False
1377
+ ) -> List[int]:
1378
+ """Generate tokens using optimized parallel decoding with dual-pass verification.
1379
+
1380
+ This method implements a parallel decoding strategy that generates multiple tokens
1381
+ simultaneously and verifies them in a second pass. The algorithm:
1382
+ 1. First pass: Predict tokens with mask tokens
1383
+ 2. Second pass: Verify predictions with actual predicted tokens
1384
+ 3. Accept verified tokens and continue from the first unverified position
1385
+
1386
+ Optimizations:
1387
+ - First decoding uses cloned cache to avoid modifying the original
1388
+ - Second decoding updates the original cache in-place
1389
+ - Minimizes CPU-GPU data transfers by operating on GPU
1390
+ - Pre-allocates tensors to avoid repeated creation
1391
+ - Removes debug output from inner loops (controlled by verbose flag)
1392
+ - Entire loop wrapped with torch.no_grad() for efficiency
1393
+
1394
+ Args:
1395
+ inputs: Input dictionary containing 'input_ids' tensor
1396
+ image_embeds: Image embeddings for multimodal processing
1397
+ mask_token_id: Token ID used for masked positions
1398
+ max_new_tokens: Maximum number of tokens to generate
1399
+ num_gen: Number of tokens to generate in parallel per iteration
1400
+ verbose: If True, print detailed progress information
1401
+
1402
+ Returns:
1403
+ List of generated token IDs
1404
+ """
1405
+ if verbose:
1406
+ print("Starting parallel decoder generation")
1407
+
1408
+ # Constants
1409
+ STOP_TOKEN_ID = 128001
1410
+ device = self.model.device
1411
+ input_ids = inputs["input_ids"]
1412
+ decoder_idx = []
1413
+
1414
+ # Pre-allocate mask tokens tensor to avoid repeated creation
1415
+ mask_tokens = torch.full((1, num_gen), mask_token_id, dtype=torch.long, device=device)
1416
+
1417
+ # Initialize KV cache
1418
+ prefix_past_key_values = DynamicCache()
1419
+ step = 0
1420
+ is_exit = False
1421
+
1422
+ # Cache initial token ID
1423
+ prefix_step_id = input_ids[0, 0].item()
1424
+
1425
+ with torch.no_grad():
1426
+ while len(decoder_idx) < max_new_tokens and not is_exit:
1427
+ # ============ First Pass: Predict with mask tokens ============
1428
+ new_input_ids = torch.cat([input_ids, mask_tokens], dim=1)
1429
+
1430
+ # Use cloned cache for first pass to preserve original
1431
+ if step == 0:
1432
+ first_cache = DynamicCache()
1433
+
1434
+ # Create image mask for first step
1435
+ mask = new_input_ids == self.config.image_token_id
1436
+ mask_unsqueezed = mask.unsqueeze(-1)
1437
+ mask_expanded = mask_unsqueezed.expand(-1, -1, image_embeds.size(-1))
1438
+ image_mask = mask_expanded.to(image_embeds.device)
1439
+ else:
1440
+ first_cache = self.clone_past_key_values(prefix_past_key_values)
1441
+
1442
+ first_predicted_ids, _ = self._first_decoder(
1443
+ new_input_ids,
1444
+ past_key_values=first_cache,
1445
+ image_embeds=image_embeds if step == 0 else None,
1446
+ image_mask=image_mask if step == 0 else None,
1447
+ num_gen=num_gen
1448
+ )
1449
+
1450
+ # ============ Second Pass: Verify with predicted tokens ============
1451
+ new_input_ids = torch.cat([input_ids, first_predicted_ids], dim=1)
1452
+
1453
+ # Use original cache for second pass (will be updated and retained)
1454
+ if step == 0:
1455
+ second_cache = DynamicCache()
1456
+ else:
1457
+ second_cache = prefix_past_key_values
1458
+
1459
+ second_predicted_ids, outputs = self._first_decoder(
1460
+ new_input_ids,
1461
+ past_key_values=second_cache,
1462
+ image_embeds=image_embeds if step == 0 else None,
1463
+ image_mask=image_mask if step == 0 else None,
1464
+ num_gen=num_gen
1465
+ )
1466
+
1467
+ # ============ Compare predictions and count successes ============
1468
+ first_pred_list = first_predicted_ids[0].tolist()
1469
+ second_pred_list = second_predicted_ids[0].tolist()
1470
+
1471
+ if verbose:
1472
+ print(f"First pass predictions: {first_pred_list}")
1473
+ print(f"Second pass predictions: {second_pred_list}")
1474
+
1475
+ # Compare predictions to find verified tokens
1476
+ success = 0
1477
+ for idx in range(len(second_pred_list) - 1):
1478
+ first_id = first_pred_list[idx]
1479
+ second_id = second_pred_list[idx]
1480
+ next_second_id = second_pred_list[idx + 1]
1481
+
1482
+ # Check for stop token
1483
+ if second_id == STOP_TOKEN_ID:
1484
+ is_exit = True
1485
+ break
1486
+
1487
+ if next_second_id == STOP_TOKEN_ID and idx == len(second_pred_list) - 2:
1488
+ success += 1
1489
+ is_exit = True
1490
+ break
1491
+
1492
+ # Verify prediction consistency
1493
+ if first_id == second_id:
1494
+ success += 1
1495
+ else:
1496
+ break
1497
+
1498
+ # ============ Update decoded tokens ============
1499
+ if step == 0:
1500
+ decoder_idx.extend(second_pred_list[:success])
1501
+ else:
1502
+ if verbose:
1503
+ print(f"Verified {success} tokens: {second_pred_list[:success]}")
1504
+ decoder_idx.append(prefix_step_id)
1505
+ decoder_idx.extend(second_pred_list[:success])
1506
+
1507
+ if verbose:
1508
+ print(f"Exit status: {is_exit}")
1509
+ print(f"Total decoded tokens: {len(decoder_idx)}")
1510
+
1511
+ # ============ Truncate KV cache to verified length ============
1512
+ past_key_values = outputs.past_key_values
1513
+ if past_key_values is not None:
1514
+ current_kv_len = past_key_values.get_seq_length()
1515
+ num_to_keep = current_kv_len - (num_gen - success)
1516
+ prefix_past_key_values = self.truncate_past_key_values(
1517
+ past_key_values, num_to_keep
1518
+ )
1519
+ else:
1520
+ prefix_past_key_values = None
1521
+
1522
+ # Update input_ids for next iteration
1523
+ next_token_id = (
1524
+ second_pred_list[success]
1525
+ if success < len(second_pred_list)
1526
+ else prefix_step_id
1527
+ )
1528
+ input_ids = torch.tensor(
1529
+ [[next_token_id]],
1530
+ dtype=torch.long,
1531
+ device=device
1532
+ )
1533
+ prefix_step_id = next_token_id
1534
+
1535
+ step += 1
1536
+
1537
+ if verbose:
1538
+ print(f"Step {step} completed, success rate: {success}/{num_gen}\n")
1539
+
1540
+ return decoder_idx
1541
+
1542
+ __all__ = ["YoutuPreTrainedModel", "YoutuModel", "YoutuVLForConditionalGeneration"]
preprocessor_config.json ADDED
@@ -0,0 +1,26 @@
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_youtu_vl.YoutuVLProcessor",
4
+ "AutoImageProcessor": "image_processing_siglip2_fast.Siglip2ImageProcessorFast"
5
+ },
6
+ "processor_class": "YoutuVLProcessor",
7
+ "do_convert_rgb": null,
8
+ "do_normalize": true,
9
+ "do_rescale": true,
10
+ "do_resize": true,
11
+ "image_mean": [
12
+ 0.5,
13
+ 0.5,
14
+ 0.5
15
+ ],
16
+ "image_processor_type": "Siglip2ImageProcessorFast",
17
+ "image_std": [
18
+ 0.5,
19
+ 0.5,
20
+ 0.5
21
+ ],
22
+ "max_num_patches": 256,
23
+ "patch_size": 16,
24
+ "resample": 2,
25
+ "rescale_factor": 0.00392156862745098
26
+ }
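For reference, the preprocessing this config describes is the usual rescale-then-normalize pipeline: 8-bit pixel values are multiplied by rescale_factor (1/255) and then shifted and scaled by image_mean and image_std. A small sketch of that arithmetic using the exact values above (illustrative only, not the actual Siglip2ImageProcessorFast code):

# Per-channel pixel math implied by preprocessor_config.json (illustrative only).
import numpy as np

rescale_factor = 0.00392156862745098   # 1 / 255, from the config
mean, std = 0.5, 0.5                   # image_mean / image_std, from the config

raw = np.array([0, 128, 255], dtype=np.float32)     # example 8-bit values
normalized = (raw * rescale_factor - mean) / std    # do_rescale, then do_normalize
print(normalized)   # roughly [-1.0, 0.0039, 1.0] -> model inputs lie in [-1, 1]

With patch_size 16, the image processor is expected to resize inputs so that the resulting 16x16 patch grid stays within the configured patch budget before the patches are flattened for the vision tower.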
processing_youtu_vl.py ADDED
@@ -0,0 +1,173 @@
1
+ from typing import List, Union
2
+ import numpy
3
+ from transformers.feature_extraction_utils import BatchFeature
4
+ from transformers.image_utils import ImageInput
5
+ from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
6
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
7
+
8
+ class YoutuVLVideosProcessorKwargs(VideosKwargs, total=False):
9
+ fps: Union[List[float], float]
10
+
11
+
12
+ class YoutuVLProcessorKwargs(ProcessingKwargs, total=False):
13
+ videos_kwargs: YoutuVLVideosProcessorKwargs
14
+ _defaults = {
15
+ "text_kwargs": {
16
+ "padding": False,
17
+ },
18
+ "videos_kwargs": {"fps": 2.0},
19
+ }
20
+
21
+
22
+ class YoutuVLProcessor(ProcessorMixin):
23
+
24
+ attributes = ["image_processor", "tokenizer"]
25
+ valid_kwargs = ["chat_template"]
26
+
27
+ image_processor_class = "AutoImageProcessor"
28
+ tokenizer_class = ("PreTrainedTokenizer", "PreTrainedTokenizerFast")
29
+
30
+ def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
31
+ self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
32
+ self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
33
+ super().__init__(image_processor, tokenizer, chat_template=chat_template)
34
+
35
+ def __call__(
36
+ self,
37
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
38
+ images: ImageInput = None,
39
+ max_image_patches: int = 36864,
40
+ **kwargs: Unpack[YoutuVLProcessorKwargs],
41
+ ) -> BatchFeature:
42
+ """
43
+ Args:
44
+ images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`,
45
+ `List[np.ndarray]`, `List[torch.Tensor]`):
46
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
47
+ tensor. Both channels-first and channels-last formats are supported.
48
+ text (`str`, `List[str]`, `List[List[str]]`):
49
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
50
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
51
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
52
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
53
+ If set, will return tensors of a particular framework. Acceptable values are:
54
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
55
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
56
+ - `'np'`: Return NumPy `np.ndarray` objects.
57
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
58
+
59
+ Returns:
60
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
61
+
62
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
63
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
64
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
65
+ `None`).
66
+ - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
67
+ - **pixel_values_videos** -- Pixel values of videos to be fed to a model.
68
+ Returned when `videos` is not `None`.
69
+ - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
70
+ - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
71
+ - **second_per_grid_ts** -- List of video seconds per time grid. Returned when `videos` is not `None`.
72
+ """
73
+ output_kwargs = self._merge_kwargs(
74
+ YoutuVLProcessorKwargs,
75
+ tokenizer_init_kwargs=self.tokenizer.init_kwargs,
76
+ **kwargs,
77
+ )
78
+ if images is not None:
79
+ image_inputs = self.image_processor(images=images, max_num_patches=max_image_patches, return_tensors="pt")
80
+ else:
81
+ image_inputs = {}
82
+ image_grid_thw = None
83
+
84
+ videos_inputs = {}
85
+ video_grid_thw = None
86
+
87
+ if not isinstance(text, list):
88
+ text = [text]
89
+
90
+ image_tokens = []
91
+ if images is not None:
92
+ merge_length = 4
93
+ index = 0
94
+ for i in range(len(text)):
95
+ while self.image_token in text[i]:
96
+ h = image_inputs['spatial_shapes'][index][0]
97
+ w = image_inputs['spatial_shapes'][index][1]
98
+ repeats = h * w // merge_length
99
+ text[i] = text[i].replace(
100
+ self.image_token,
101
+ "<|placeholder|>" * repeats,
102
+ 1,
103
+ )
104
+ index += 1
105
+ text[i] = text[i].replace("<|placeholder|>", self.image_token)
106
+ assert index == image_inputs['spatial_shapes'].shape[0]
107
+
108
+
109
+ if video_grid_thw is not None:
110
+ merge_length = self.image_processor.merge_size ** 2
111
+ index = 0
112
+ for i in range(len(text)):
113
+ while self.video_token in text[i]:
114
+ text[i] = text[i].replace(
115
+ self.video_token,
116
+ "<|placeholder|>" * (video_grid_thw[index].prod() // merge_length),
117
+ 1,
118
+ )
119
+ index += 1
120
+ text[i] = text[i].replace("<|placeholder|>", self.video_token)
121
+
122
+ text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
123
+
124
+ return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
125
+
126
+ def get_max_image_patches(self, images):
127
+ return self.image_processor.get_max_image_patches(images)
128
+
129
+ def batch_decode(self, *args, **kwargs):
130
+ return self.tokenizer.batch_decode(*args, **kwargs)
131
+
132
+ def decode(self, *args, **kwargs):
133
+ return self.tokenizer.decode(*args, **kwargs)
134
+
135
+ def post_process_image_text_to_text(
136
+ self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
137
+ ):
138
+ """
139
+ Post-process the output of the model to decode the text.
140
+
141
+ Args:
142
+ generated_outputs (`torch.Tensor` or `np.ndarray`):
143
+ The output of the model `generate` function. The output is
144
+ expected to be a tensor of shape `(batch_size, sequence_length)`
145
+ or `(sequence_length,)`.
146
+ skip_special_tokens (`bool`, *optional*, defaults to `True`):
147
+ Whether or not to remove special tokens in the output. Argument
148
+ passed to the tokenizer's `batch_decode` method.
149
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
150
+ Whether or not to clean up the tokenization spaces. Argument
151
+ passed to the tokenizer's `batch_decode` method.
152
+ **kwargs:
153
+ Additional arguments to be passed to the tokenizer's `batch_decode` method.
154
+
155
+ Returns:
156
+ `List[str]`: The decoded text.
157
+ """
158
+ return self.tokenizer.batch_decode(
159
+ generated_outputs,
160
+ skip_special_tokens=skip_special_tokens,
161
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
162
+ **kwargs,
163
+ )
164
+
165
+ @property
166
+ def model_input_names(self):
167
+ tokenizer_input_names = self.tokenizer.model_input_names
168
+ image_processor_input_names = self.image_processor.model_input_names
169
+ names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
170
+ return names_from_processor + ["second_per_grid_ts"]
171
+
172
+
173
+ __all__ = ["YoutuVLProcessor"]
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "<|begin_of_text|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|end_of_text|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|end_of_text|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ea752449128cbb859dc8799d1d40c0a7dc7eac50ed9ccf47cea7a7100dcabe5
3
+ size 19953833
tokenizer_config.json ADDED
@@ -0,0 +1,15 @@
1
+ {
2
+ "bos_token": "<|begin_of_text|>",
3
+ "clean_up_tokenization_spaces": false,
4
+ "eos_token": "<|end_of_text|>",
5
+ "extra_special_tokens": {},
6
+ "model_input_names": [
7
+ "input_ids",
8
+ "attention_mask"
9
+ ],
10
+ "model_max_length": 131072,
11
+ "pad_token": "<|end_of_text|>",
12
+ "tokenizer_class": "PreTrainedTokenizerFast",
13
+ "truncation_side": "left",
14
+ "use_fast": true
15
+ }
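Because the pad token is mapped to `<|end_of_text|>` (see special_tokens_map.json and the config above), padding and end-of-sequence share one id, and `model_max_length` is 131072. A quick sanity check, assuming a local clone of this repository:

# Sanity check of the tokenizer settings above; "." is assumed to be a local clone.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")
print(tok.bos_token, tok.eos_token, tok.pad_token)  # <|begin_of_text|> <|end_of_text|> <|end_of_text|>
print(tok.model_max_length)                         # 131072
assert tok.pad_token_id == tok.eos_token_id         # pad and eos resolve to the same id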