Spaces:

derektan95
/

LISA-AVS-demo

Sleeping

App Files Files Community

derektan commited on Sep 5, 2025

Commit

1faad26

0 Parent(s):

Initial clean commit for Hugging Face

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +6 -0
.gitignore +4 -0
LICENSE +201 -0
README.md +339 -0
app.py +359 -0
chat.py +359 -0
imgs/blackpink.jpg +3 -0
imgs/camera_lens.jpg +3 -0
imgs/car_speed.jpg +3 -0
imgs/dog_with_horn.jpg +3 -0
imgs/example1.jpg +3 -0
imgs/example2.jpg +3 -0
imgs/fig_overview.jpg +3 -0
imgs/jackma.jpg +3 -0
imgs/obama.jpg +3 -0
imgs/stand_higher.jpg +3 -0
imgs/table1.jpg +3 -0
imgs/teaser.jpg +3 -0
imgs/trump.jpg +3 -0
imgs/wash_hands.jpg +3 -0
merge_lora_weights_and_save_hf_model.py +159 -0
model/LISA.py +427 -0
model/llava/__init__.py +1 -0
model/llava/constants.py +12 -0
model/llava/conversation.py +399 -0
model/llava/mm_utils.py +88 -0
model/llava/model/__init__.py +2 -0
model/llava/model/apply_delta.py +56 -0
model/llava/model/builder.py +206 -0
model/llava/model/consolidate.py +31 -0
model/llava/model/language_model/llava_llama.py +167 -0
model/llava/model/language_model/llava_mpt.py +174 -0
model/llava/model/language_model/mpt/adapt_tokenizer.py +46 -0
model/llava/model/language_model/mpt/attention.py +526 -0
model/llava/model/language_model/mpt/blocks.py +92 -0
model/llava/model/language_model/mpt/configuration_mpt.py +199 -0
model/llava/model/language_model/mpt/custom_embedding.py +11 -0
model/llava/model/language_model/mpt/flash_attn_triton.py +1087 -0
model/llava/model/language_model/mpt/hf_prefixlm_converter.py +750 -0
model/llava/model/language_model/mpt/meta_init_context.py +111 -0
model/llava/model/language_model/mpt/modeling_mpt.py +538 -0
model/llava/model/language_model/mpt/norm.py +106 -0
model/llava/model/language_model/mpt/param_init_fns.py +419 -0
model/llava/model/llava_arch.py +398 -0
model/llava/model/make_delta.py +63 -0
model/llava/model/multimodal_encoder/builder.py +17 -0
model/llava/model/multimodal_encoder/clip_encoder.py +87 -0
model/llava/model/utils.py +24 -0
model/llava/train/llama_flash_attn_monkey_patch.py +126 -0
model/llava/train/llava_trainer.py +67 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,6 @@

+*.bin filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,4 @@

+**/__pycache__
+runs/
+.vscode/

LICENSE ADDED Viewed

	@@ -0,0 +1,201 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS
+   APPENDIX: How to apply the Apache License to your work.
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+   Copyright [yyyy] [name of copyright owner]
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+       http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.

README.md ADDED Viewed

	@@ -0,0 +1,339 @@

+[![Gradio](https://img.shields.io/badge/Gradio-Online%20Demo-blue)](http://103.170.5.190:7860/)
+[![Open in OpenXLab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/openxlab-app/LISA)
+# LISA: Reasoning Segmentation via Large Language Model
+Note: This is a fork of the [original LISA repository](https://github.com/dvlab-research/LISA), fine-tuned on the visual search ([AVS-Bench](https://huggingface.co/datasets/derektan95/avs-bench)) remote sensing dataset as a baseline for [Search-TTA](https://search-tta.github.io/). To run the fine-tuned LISA model on the AVS-Bench dataset, run the following commands:
+```
+CUDA_VISIBLE_DEVICES=0 python chat.py --version='derektan95/LISA-RS' --precision='bf16'
+CUDA_VISIBLE_DEVICES=0 python app.py --version='derektan95/LISA-RS' --precision='bf16'
+```
+<font size=7><div align='center'><b>LISA</b>: Large <b>L</b>anguage <b>I</b>nstructed <b>S</b>egmentation <b>A</b>ssistant</div></font>
+<font size=7><div align='center'>
+    <a href="https://arxiv.org/pdf/2308.00692.pdf"><strong>Paper</strong></a> |
+    <a href="https://huggingface.co/xinlai"><strong>Models</strong></a> |
+    <a href="#training"><strong>Training</strong></a> |
+    <a href="#inference"><strong>Inference</strong></a> |
+    <a href="#deployment"><strong>Local Deployment</strong></a> |
+    <a href="#dataset"><strong>Dataset</strong></a> |
+    <a href=""><strong>Online Demo</strong></a> |
+    <a href="https://huggingface.co/collections/Senqiao/lisa-67713837a32d6abf516a162e"><strong>LISA++ Dataset</strong></a> |
+    <a href="https://huggingface.co/Senqiao/LISA_Plus_7b"><strong>LISA++ Model</strong></a>
+</div></font>
+<!-- <p align="center"> <img src="imgs/teaser.jpg" width="100%"> </p> -->
+<table class="center">
+<tr>
+  <td style="text-align:center;"><b>Input</b></td>
+  <td style="text-align:center;"><b>Output</b></td>
+  <td style="text-align:center;"><b>Input</b></td>
+  <td style="text-align:center;"><b>Output</b></td>
+</tr>
+<tr>
+  <td><img src="imgs/obama.jpg"></td>
+  <td><img src="vis_output/obama.jpg"></td>
+  <td><img src="imgs/trump.jpg"></td>
+  <td><img src="vis_output/trump.jpg"></td>
+</tr>
+<tr>
+  <td width=25% style="text-align:center;color:gray;">"Who was <ins>the president of the US</ins> in this image? Please output segmentation mask and explain the reason."</td>
+  <td width=25% style="text-align:center;">"Sure, the segmentation result is [SEG]. The President of the United States in the image is <ins>President Obama.</ins>"</td>
+  <td width=25% style="text-align:center;color:gray">"Who was <ins>the president of the US</ins> in this image? Please output segmentation mask and explain why."</td>
+  <td width=25% style="text-align:center;">"Sure, [SEG]. In the image, the President of the United States is <ins>President Trump.</ins>"</td>
+</tr>
+<tr>
+  <td><img src="imgs/stand_higher.jpg"></td>
+  <td><img src="vis_output/stand_higher.jpg"></td>
+  <td><img src="imgs/camera_lens.jpg"></td>
+  <td><img src="vis_output/camera_lens.jpg"></td>
+</tr>
+<tr>
+  <td width=25% style="text-align:center;color:gray;">"What can <ins>make the woman stand higher</ins>? Please output segmentation mask and explain why."</td>
+  <td width=25% style="text-align:center;">"Sure, [SEG]. The woman is <ins>standing higher by using a ladder</ins>..."</td>
+  <td width=25% style="text-align:center;color:gray">"Can you segment <ins>the camera lens that is more suitable for photographing nearby objects</ins> in this image?"</td>
+  <td width=25% style="text-align:center;">"Sure, [SEG]."</td>
+</tr>
+<tr>
+  <td><img src="imgs/dog_with_horn.jpg"></td>
+  <td><img src="vis_output/dog_with_horn.jpg"></td>
+  <td><img src="imgs/wash_hands.jpg"></td>
+  <td><img src="vis_output/wash_hands.jpg"></td>
+</tr>
+<tr>
+  <td width=25% style="text-align:center;color:gray;">"Can you segment <ins>the unusual part</ins> in this image and explain why."</td>
+  <td width=25% style="text-align:center;">"Sure, it is [SEG]. In the image, the unusual part is <ins>the dog wearing a reindeer antler headband</ins>..."</td>
+  <td width=25% style="text-align:center;color:gray">"Where to <ins>wash hands</ins> in this image? Please output segmentation mask."</td>
+  <td width=25% style="text-align:center;">"Sure, [SEG]."</td>
+</tr>
+<tr>
+  <td><img src="imgs/jackma.jpg"></td>
+  <td><img src="vis_output/jackma.jpg"></td>
+  <td><img src="imgs/blackpink.jpg"></td>
+  <td><img src="vis_output/blackpink.jpg"></td>
+</tr>
+<tr>
+  <td width=25% style="text-align:center;color:gray;">"Can you segment <ins>the founder of Alibaba</ins> in this image and explain why?"</td>
+  <td width=25% style="text-align:center;">"Sure, it is [SEG]. In the image, the man wearing a red tie and sitting on one of the chairs is <ins>Jack Ma</ins>, the co-founder of Alibaba Group..."</td>
+  <td width=25% style="text-align:center;color:gray">"Please segment <ins>Lisa</ins> in this figure."</td>
+  <td width=25% style="text-align:center;">"Sure, [SEG]."</td>
+</tr>
+</table>
+<p align="center"> <img src="imgs/fig_overview.jpg" width="100%"> </p>
+## News
+- [x] [2024.12.30] We released the [LISA++](https://arxiv.org/abs/2312.17240) model and datasets, available [here](https://huggingface.co/collections/Senqiao/lisa-67713837a32d6abf516a162e). Our findings show that incorporating Visual COT data can further enhance the model’s global understanding. We will update the paper soon, stay tuned!
+- [x] [2024.6.21] LISA is selected as Oral Presentation in CVPR 2024!
+- [x] [2023.8.30] Release three new models [LISA-7B-v1](https://huggingface.co/xinlai/LISA-7B-v1), [LISA-7B-v1-explanatory](https://huggingface.co/xinlai/LISA-7B-v1-explanatory), and [LISA-13B-llama2-v1-explanatory](https://huggingface.co/xinlai/LISA-13B-llama2-v1-explanatory). Welcome to check them out!
+- [x] [2023.8.23] Refactor code, and release new model [LISA-13B-llama2-v1](https://huggingface.co/xinlai/LISA-13B-llama2-v1). Welcome to check it out!
+- [x] [2023.8.9] Training code is released!
+- [x] [2023.8.4] [Online Demo](http://103.170.5.190:7860/) is released!
+- [x] [2023.8.4] [*ReasonSeg* Dataset](https://drive.google.com/drive/folders/125mewyg5Ao6tZ3ZdJ-1-E3n04LGVELqy?usp=sharing) and the [LISA-13B-llama2-v0-explanatory](https://huggingface.co/xinlai/LISA-13B-llama2-v0-explanatory) model are released!
+- [x] [2023.8.3] Inference code and the [LISA-13B-llama2-v0](https://huggingface.co/xinlai/LISA-13B-llama2-v0) model are released. Welcome to check them out!
+- [x] [2023.8.2] [Paper](https://arxiv.org/pdf/2308.00692.pdf) is released and GitHub repo is created.
+**LISA: Reasoning Segmentation via Large Language Model [[Paper](https://arxiv.org/abs/2308.00692)]** <br />
+[Xin Lai](https://scholar.google.com/citations?user=tqNDPA4AAAAJ&hl=zh-CN),
+[Zhuotao Tian](https://scholar.google.com/citations?user=mEjhz-IAAAAJ&hl=en),
+[Yukang Chen](https://scholar.google.com/citations?user=6p0ygKUAAAAJ&hl=en),
+[Yanwei Li](https://scholar.google.com/citations?user=I-UCPPcAAAAJ&hl=zh-CN),
+[Yuhui Yuan](https://scholar.google.com/citations?user=PzyvzksAAAAJ&hl=en),
+[Shu Liu](https://scholar.google.com.hk/citations?user=BUEDUFkAAAAJ&hl=zh-CN),
+[Jiaya Jia](https://scholar.google.com/citations?user=XPAkzTEAAAAJ&hl=en)<br />
+**LISA++: An Improved Baseline for Reasoning Segmentation with Large Language Model [[Paper](https://arxiv.org/abs/2312.17240)]** <br />
+[Senqiao Yang](https://scholar.google.com/citations?user=NcJc-RwAAAAJ),
+Tianyuan Qu,
+[Xin Lai](https://scholar.google.com/citations?user=tqNDPA4AAAAJ&hl=zh-CN),
+[Zhuotao Tian](https://scholar.google.com/citations?user=mEjhz-IAAAAJ&hl=en),
+[Bohao Peng](https://scholar.google.com.hk/citations?user=9xcCm1oAAAAJ),
+[Shu Liu](https://scholar.google.com.hk/citations?user=BUEDUFkAAAAJ&hl=zh-CN),
+[Jiaya Jia](https://scholar.google.com/citations?user=XPAkzTEAAAAJ&hl=en)<br />
+## Abstract
+In this work, we propose a new segmentation task --- ***reasoning segmentation***. The task is designed to output a segmentation mask given a complex and implicit query text. We establish a benchmark comprising over one thousand image-instruction pairs, incorporating intricate reasoning and world knowledge for evaluation purposes. Finally, we present LISA: Large-language Instructed Segmentation Assistant, which inherits the language generation capabilities of the multi-modal Large Language Model (LLM) while also possessing the ability to produce segmentation masks.
+For more details, please refer to the [paper](https://arxiv.org/abs/2308.00692).
+## Highlights
+**LISA** unlocks the new segmentation capabilities of multi-modal LLMs, and can handle cases involving:
+1. complex reasoning;
+2. world knowledge;
+3. explanatory answers;
+4. multi-turn conversation.
+**LISA** also demonstrates robust zero-shot capability when trained exclusively on reasoning-free datasets. In addition, fine-tuning the model with merely 239 reasoning segmentation image-instruction pairs results in further performance enhancement.
+## Experimental results
+<p align="center"> <img src="imgs/table1.jpg" width="80%"> </p>
+## Installation
+```
+pip install -r requirements.txt
+pip install flash-attn --no-build-isolation
+```
+## Training
+### Training Data Preparation
+The training data consists of 4 types of data:
+1. Semantic segmentation datasets: [ADE20K](http://data.csail.mit.edu/places/ADEchallenge/ADEChallengeData2016.zip), [COCO-Stuff](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip), [Mapillary](https://www.mapillary.com/dataset/vistas), [PACO-LVIS](https://github.com/facebookresearch/paco/tree/main#dataset-setup), [PASCAL-Part](https://github.com/facebookresearch/VLPart/tree/main/datasets#pascal-part), [COCO Images](http://images.cocodataset.org/zips/train2017.zip)
+    Note: For COCO-Stuff, we use the annotation file stuffthingmaps_trainval2017.zip. We only use the PACO-LVIS part in PACO. COCO Images should be put into the `dataset/coco/` directory.
+2. Referring segmentation datasets: [refCOCO](https://web.archive.org/web/20220413011718/https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcoco.zip), [refCOCO+](https://web.archive.org/web/20220413011656/https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcoco+.zip), [refCOCOg](https://web.archive.org/web/20220413012904/https://bvisionweb1.cs.unc.edu/licheng/referit/data/refcocog.zip), [refCLEF](https://web.archive.org/web/20220413011817/https://bvisionweb1.cs.unc.edu/licheng/referit/data/refclef.zip) ([saiapr_tc-12](https://web.archive.org/web/20220515000000/http://bvisionweb1.cs.unc.edu/licheng/referit/data/images/saiapr_tc-12.zip))
+    Note: the original links to the refCOCO series data are down, so we have updated them with new ones. If the download speed is very slow or unstable, we also provide a [OneDrive link](https://mycuhk-my.sharepoint.com/:f:/g/personal/1155154502_link_cuhk_edu_hk/Em5yELVBvfREodKC94nOFLoBLro_LPxsOxNV44PHRWgLcA?e=zQPjsc) for download. **You must also follow the rules that the original datasets require.**
+3. Visual Question Answering dataset: [LLaVA-Instruct-150k](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/blob/main/llava_instruct_150k.json)
+4. Reasoning segmentation dataset: [ReasonSeg](https://github.com/dvlab-research/LISA#dataset)
+Download them from the above links, and organize them as follows.
+```
+├── dataset
+│   ├── ade20k
+│   │   ├── annotations
+│   │   └── images
+│   ├── coco
+│   │   └── train2017
+│   │       ├── 000000000009.jpg
+│   │       └── ...
+│   ├── cocostuff
+│   │   └── train2017
+│   │       ├── 000000000009.png
+│   │       └── ...
+│   ├── llava_dataset
+│   │   └── llava_instruct_150k.json
+│   ├── mapillary
+│   │   ├── config_v2.0.json
+│   │   ├── testing
+│   │   ├── training
+│   │   └── validation
+│   ├── reason_seg
+│   │   └── ReasonSeg
+│   │       ├── train
+│   │       ├── val
+│   │       └── explanatory
+│   ├── refer_seg
+│   │   ├── images
+│   │   |   ├── saiapr_tc-12
+│   │   |   └── mscoco
+│   │   |       └── images
+│   │   |           └── train2014
+│   │   ├── refclef
+│   │   ├── refcoco
+│   │   ├── refcoco+
+│   │   └── refcocog
+│   └── vlpart
+│       ├── paco
+│       │   └── annotations
+│       └── pascal_part
+│           ├── train.json
+│           └── VOCdevkit
+```
+### Pre-trained weights
+#### LLaVA
+To train LISA-7B or 13B, you need to follow the [instruction](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md) to merge the LLaVA delta weights. Typically, we use the final weights `LLaVA-Lightning-7B-v1-1` and `LLaVA-13B-v1-1` merged from `liuhaotian/LLaVA-Lightning-7B-delta-v1-1` and `liuhaotian/LLaVA-13b-delta-v1-1`, respectively. For Llama2, we can directly use the LLaVA full weights `liuhaotian/llava-llama-2-13b-chat-lightning-preview`.
+#### SAM ViT-H weights
+Download SAM ViT-H pre-trained weights from the [link](https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth).
+### Training
+```
+deepspeed --master_port=24999 train_ds.py \
+  --version="PATH_TO_LLaVA" \
+  --dataset_dir='./dataset' \
+  --vision_pretrained="PATH_TO_SAM" \
+  --dataset="sem_seg||refer_seg||vqa||reason_seg" \
+  --sample_rates="9,3,3,1" \
+  --exp_name="lisa-7b"
+```
+When training is finished, to get the full model weight:
+```
+cd ./runs/lisa-7b/ckpt_model && python zero_to_fp32.py . ../pytorch_model.bin
+```
+### Merge LoRA Weight
+Merge the LoRA weights of `pytorch_model.bin`, save the resulting model into your desired path in the Hugging Face format:
+```
+CUDA_VISIBLE_DEVICES="" python merge_lora_weights_and_save_hf_model.py \
+  --version="PATH_TO_LLaVA" \
+  --weight="PATH_TO_pytorch_model.bin" \
+  --save_path="PATH_TO_SAVED_MODEL"
+```
+For example:
+```
+CUDA_VISIBLE_DEVICES="" python3 merge_lora_weights_and_save_hf_model.py \
+  --version="./LLaVA/LLaVA-Lightning-7B-v1-1" \
+  --weight="lisa-7b/pytorch_model.bin" \
+  --save_path="./LISA-7B"
+```
+### Validation
+```
+deepspeed --master_port=24999 train_ds.py \
+  --version="PATH_TO_LISA_HF_Model_Directory" \
+  --dataset_dir='./dataset' \
+  --vision_pretrained="PATH_TO_SAM" \
+  --exp_name="lisa-7b" \
+  --eval_only
+```
+Note: the `v1` model is trained using both the `train` and `val` sets, so please use the `v0` model to reproduce the validation results. (To use the `v0` models, please first check out the legacy version of the repo with `git checkout 0e26916`.)
+## Inference
+To chat with [LISA-13B-llama2-v1](https://huggingface.co/xinlai/LISA-13B-llama2-v1) or [LISA-13B-llama2-v1-explanatory](https://huggingface.co/xinlai/LISA-13B-llama2-v1-explanatory):
+(Note that `chat.py` currently does not support `v0` models, i.e., `LISA-13B-llama2-v0` and `LISA-13B-llama2-v0-explanatory`. If you want to use the `v0` models, please first check out the legacy version of the repo with `git checkout 0e26916`.)
+```
+CUDA_VISIBLE_DEVICES=0 python chat.py --version='xinlai/LISA-13B-llama2-v1'
+CUDA_VISIBLE_DEVICES=0 python chat.py --version='xinlai/LISA-13B-llama2-v1-explanatory'
+```
+To use `bf16` or `fp16` data type for inference:
+```
+CUDA_VISIBLE_DEVICES=0 python chat.py --version='xinlai/LISA-13B-llama2-v1' --precision='bf16'
+```
+To use `8bit` or `4bit` data type for inference (this enables running 13B model on a single 24G or 12G GPU at some cost of generation quality):
+```
+CUDA_VISIBLE_DEVICES=0 python chat.py --version='xinlai/LISA-13B-llama2-v1' --precision='fp16' --load_in_8bit
+CUDA_VISIBLE_DEVICES=0 python chat.py --version='xinlai/LISA-13B-llama2-v1' --precision='fp16' --load_in_4bit
+```
+Hint: for 13B model, 16-bit inference consumes 30G VRAM with a single GPU, 8-bit inference consumes 16G, and 4-bit inference consumes 9G.
+After that, input the text prompt and then the image path. For example:
+```
+- Please input your prompt: Where can the driver see the car speed in this image? Please output segmentation mask.
+- Please input the image path: imgs/example1.jpg
+- Please input your prompt: Can you segment the food that tastes spicy and hot?
+- Please input the image path: imgs/example2.jpg
+```
+The results should be like:
+<p align="center"> <img src="imgs/example1.jpg" width="22%"> <img src="vis_output/example1_masked_img_0.jpg" width="22%"> <img src="imgs/example2.jpg" width="25%"> <img src="vis_output/example2_masked_img_0.jpg" width="25%"> </p>
+## Deployment
+```
+CUDA_VISIBLE_DEVICES=0 python app.py --version='xinlai/LISA-13B-llama2-v1' --load_in_4bit
+CUDA_VISIBLE_DEVICES=0 python app.py --version='xinlai/LISA-13B-llama2-v1-explanatory' --load_in_4bit
+```
+By default, we use 4-bit quantization. Feel free to delete the `--load_in_4bit` argument for 16-bit inference or replace it with `--load_in_8bit` argument for 8-bit inference.
+## Dataset
+In ReasonSeg, we have collected 1218 images (239 train, 200 val, and 779 test). The training and validation sets can be downloaded from <a href="https://drive.google.com/drive/folders/125mewyg5Ao6tZ3ZdJ-1-E3n04LGVELqy?usp=sharing">**this link**</a>.
+Each image is provided with an annotation JSON file:
+```
+image_1.jpg, image_1.json
+image_2.jpg, image_2.json
+...
+image_n.jpg, image_n.json
+```
+Important keys contained in JSON files:
+```
+- "text": text instructions.
+- "is_sentence": whether the text instructions are long sentences.
+- "shapes": target polygons.
+```
+The elements of "shapes" fall into two categories, namely **"target"** and **"ignore"**. The former category is indispensable for evaluation, while the latter category denotes an ambiguous region and is hence disregarded during the evaluation process.
+We provide a <a href="https://github.com/dvlab-research/LISA/blob/main/utils/data_processing.py">**script**</a> that demonstrates how to process the annotations:
+```
+python3 utils/data_processing.py
+```
+Besides, we leveraged GPT-3.5 for rephrasing instructions, so images in the training set may have **more than one instruction (but fewer than six)** in the "text" field. During training, users may randomly select one as the text query to obtain a better model.
+## Citation
+If you find this project useful in your research, please consider citing:
+```
+@article{lai2023lisa,
+  title={LISA: Reasoning Segmentation via Large Language Model},
+  author={Lai, Xin and Tian, Zhuotao and Chen, Yukang and Li, Yanwei and Yuan, Yuhui and Liu, Shu and Jia, Jiaya},
+  journal={arXiv preprint arXiv:2308.00692},
+  year={2023}
+}
+@article{yang2023improved,
+  title={An Improved Baseline for Reasoning Segmentation with Large Language Model},
+  author={Yang, Senqiao and Qu, Tianyuan and Lai, Xin and Tian, Zhuotao and Peng, Bohao and Liu, Shu and Jia, Jiaya},
+  journal={arXiv preprint arXiv:2312.17240},
+  year={2023}
+}
+```
+## Acknowledgement
+- This work is built upon [LLaVA](https://github.com/haotian-liu/LLaVA) and [SAM](https://github.com/facebookresearch/segment-anything).

app.py ADDED Viewed

	@@ -0,0 +1,359 @@

+import argparse
+import os
+import re
+import sys
+import bleach
+import cv2
+import gradio as gr
+from matplotlib import pyplot as plt
+import numpy as np
+import torch
+import torch.nn.functional as F
+from PIL import Image
+from transformers import AutoTokenizer, BitsAndBytesConfig, CLIPImageProcessor
+from model.LISA import LISAForCausalLM
+from model.llava import conversation as conversation_lib
+from model.llava.mm_utils import tokenizer_image_token
+from model.segment_anything.utils.transforms import ResizeLongestSide
+from utils.utils import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN,
+                         DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX)
def parse_args(args):
    """Parse the command-line options of the LISA Gradio demo.

    Args:
        args: Sequence of argument strings to parse (the caller passes
            ``sys.argv[1:]``).

    Returns:
        The ``argparse.Namespace`` holding the parsed options.
    """
    parser = argparse.ArgumentParser(description="LISA chat")
    parser.add_argument("--version", default="xinlai/LISA-13B-llama2-v1")
    parser.add_argument("--vis_save_path", default="./vis_output", type=str)
    parser.add_argument(
        "--precision",
        default="fp16",
        type=str,
        choices=["fp32", "bf16", "fp16"],
        help="precision for inference",
    )
    parser.add_argument("--image_size", default=1024, type=int, help="image size")
    parser.add_argument("--model_max_length", default=512, type=int)
    parser.add_argument("--lora_r", default=8, type=int)
    parser.add_argument(
        "--vision-tower", default="openai/clip-vit-large-patch14", type=str
    )
    parser.add_argument("--local-rank", default=0, type=int, help="node rank")
    parser.add_argument("--load_in_8bit", action="store_true", default=False)
    parser.add_argument("--load_in_4bit", action="store_true", default=False)
    # ``store_true`` combined with ``default=True`` can never produce False, so
    # the original flag is kept for compatibility and paired with an explicit
    # negative switch that writes to the same destination.
    parser.add_argument("--use_mm_start_end", action="store_true", default=True)
    parser.add_argument(
        "--no_use_mm_start_end",
        dest="use_mm_start_end",
        action="store_false",
        help="do not wrap the image token in image start/end tokens",
    )
    parser.add_argument(
        "--conv_type",
        default="llava_v1",
        type=str,
        choices=["llava_v1", "llava_llama_2"],
    )
    return parser.parse_args(args)
def preprocess(
    x,
    pixel_mean=torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1),
    pixel_std=torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1),
    img_size=1024,
) -> torch.Tensor:
    """Normalize pixel values and pad to a square input.

    Args:
        x: Image tensor whose last two dimensions are height and width and
            whose channel dimension broadcasts against ``pixel_mean``.
        pixel_mean: Per-channel mean subtracted from ``x``.
        pixel_std: Per-channel standard deviation ``x`` is divided by.
        img_size: Side length of the padded output.

    Returns:
        The normalized tensor, zero-padded on the right and bottom so that its
        last two dimensions are ``img_size`` x ``img_size``.
    """
    # Keep the statistics on the input's device so non-CPU tensors also work;
    # for CPU inputs this is a no-op.
    mean = pixel_mean.to(x.device)
    std = pixel_std.to(x.device)
    x = (x - mean) / std

    h, w = x.shape[-2:]
    # F.pad orders the amounts as (left, right, top, bottom) for the last two
    # dimensions. NOTE: a side longer than img_size gives a negative amount,
    # which F.pad applies as a crop rather than raising.
    return F.pad(x, (0, img_size - w, 0, img_size - h))
# ---------------------------------------------------------------------------
# Module-level setup: parse the CLI once, then build the tokenizer, model and
# image pre-processors that the Gradio callback below relies on.
# ---------------------------------------------------------------------------
args = parse_args(sys.argv[1:])
os.makedirs(args.vis_save_path, exist_ok=True)

# Tokenizer: the unknown token doubles as padding, and the id of "[SEG]" is
# stored on ``args`` so it can be handed to the model constructor.
tokenizer = AutoTokenizer.from_pretrained(
    args.version,
    cache_dir=None,
    model_max_length=args.model_max_length,
    padding_side="right",
    use_fast=False,
)
tokenizer.pad_token = tokenizer.unk_token
args.seg_token_idx = tokenizer("[SEG]", add_special_tokens=False).input_ids[0]

# Map the --precision choice to a torch dtype (fp32 is the fallback).
if args.precision == "bf16":
    torch_dtype = torch.bfloat16
elif args.precision == "fp16":
    torch_dtype = torch.half
else:
    torch_dtype = torch.float32

# Keyword arguments for from_pretrained; quantized loading forces fp16 and
# leaves the "visual_model" modules unquantized. 4-bit wins if both are set.
kwargs = {"torch_dtype": torch_dtype}
if args.load_in_4bit:
    kwargs["torch_dtype"] = torch.half
    kwargs["load_in_4bit"] = True
    kwargs["quantization_config"] = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        llm_int8_skip_modules=["visual_model"],
    )
elif args.load_in_8bit:
    kwargs["torch_dtype"] = torch.half
    kwargs["quantization_config"] = BitsAndBytesConfig(
        llm_int8_skip_modules=["visual_model"],
        load_in_8bit=True,
    )

model = LISAForCausalLM.from_pretrained(
    args.version,
    low_cpu_mem_usage=True,
    vision_tower=args.vision_tower,
    seg_token_idx=args.seg_token_idx,
    **kwargs,
)

# Align the model config's special-token ids with the tokenizer.
model.config.eos_token_id = tokenizer.eos_token_id
model.config.bos_token_id = tokenizer.bos_token_id
model.config.pad_token_id = tokenizer.pad_token_id

model.get_model().initialize_vision_modules(model.get_model().config)
vision_tower = model.get_model().get_vision_tower()
vision_tower.to(dtype=torch_dtype)

quantized = args.load_in_4bit or args.load_in_8bit
if args.precision == "bf16":
    model = model.bfloat16().cuda()
elif args.precision == "fp16" and not quantized:
    # The vision tower is detached while DeepSpeed wraps the model and is
    # re-attached (fp16, on GPU) afterwards.
    vision_tower = model.get_model().get_vision_tower()
    model.model.vision_tower = None
    import deepspeed

    model_engine = deepspeed.init_inference(
        model=model,
        dtype=torch.half,
        replace_with_kernel_inject=True,
        replace_method="auto",
    )
    model = model_engine.module
    model.model.vision_tower = vision_tower.half().cuda()
elif args.precision == "fp32":
    model = model.float().cuda()

vision_tower = model.get_model().get_vision_tower()
vision_tower.to(device=args.local_rank)

# Pre-processors for the two image inputs passed to model.evaluate.
clip_image_processor = CLIPImageProcessor.from_pretrained(model.config.vision_tower)
transform = ResizeLongestSide(args.image_size)

model.eval()
# Gradio: static content (example prompts, title and HTML snippets) for the UI.
# NOTE: `examples` is defined here but the interface below is created with
# `examples=None`, so these entries are currently not shown.
examples = [
    [
        "Where can the driver see the car speed in this image? Please output segmentation mask.",
        "./resources/imgs/example1.jpg",
    ],
    [
        "Can you segment the food that tastes spicy and hot?",
        "./resources/imgs/example2.jpg",
    ],
    [
        "Assuming you are an autonomous driving robot, what part of the diagram would you manipulate to control the direction of travel? Please output segmentation mask and explain why.",
        "./resources/imgs/example1.jpg",
    ],
    [
        "What can make the woman stand higher? Please output segmentation mask and explain why.",
        "./resources/imgs/example3.jpg",
    ],
]
output_labels = ["Segmentation Output"]
title = "LISA: Reasoning Segmentation via Large Language Model"
# The model name is taken from --version so the note always matches the
# checkpoint that was actually loaded instead of a hard-coded name.
description = f"""
<font size=4>
This is the online demo of LISA. \n
If multiple users are using it at the same time, they will enter a queue, which may delay some time. \n
**Note**: **Different prompts can lead to significantly varied results**. \n
**Note**: Please try to **standardize** your input text prompts to **avoid ambiguity**, and also pay attention to whether the **punctuations** of the input are correct. \n
**Note**: Current model is **{args.version}**, and 4-bit quantization may impair text-generation quality. \n
**Usage**: <br>
&ensp;(1) To let LISA **segment something**, input prompt like: "Can you segment xxx in this image?", "What is xxx in this image? Please output segmentation mask."; <br>
&ensp;(2) To let LISA **output an explanation**, input prompt like: "What is xxx in this image? Please output segmentation mask and explain why."; <br>
&ensp;(3) To obtain **solely language output**, you can input like what you should do in current multi-modal LLM (e.g., LLaVA). <br>
Hope you can enjoy our work!
</font>
"""
# Footer links; each paragraph is explicitly closed.
article = """
<p style='text-align: center'>
<a href='https://arxiv.org/abs/2308.00692' target='_blank'>
Preprint Paper
</a>
</p>
\n
<p style='text-align: center'>
<a href='https://github.com/dvlab-research/LISA' target='_blank'>   Github Repo </a></p>
"""
## Gradio callback and its rendering helpers
def _load_placeholder_rgb(path, size=128):
    """Read a placeholder image as RGB, or return a blank one if unreadable.

    ``cv2.imread`` returns ``None`` for a missing or unreadable file; a black
    ``size`` x ``size`` image is returned in that case so the callback can still
    respond.
    """
    placeholder = cv2.imread(path)
    if placeholder is None:
        return np.zeros((size, size, 3), dtype=np.uint8)
    return cv2.cvtColor(placeholder, cv2.COLOR_BGR2RGB)


def _render_score_heatmap(score_map):
    """Colorize a 2-D score map and append a vertical min/max legend.

    Args:
        score_map: 2-D NumPy array of mask scores.

    Returns:
        An RGB ``uint8`` image: the VIRIDIS-colored map with a 30-pixel-wide
        color bar on its right edge, labeled with the raw min and max values.
    """
    min_val = float(score_map.min())
    max_val = float(score_map.max())
    # Avoid division by zero for a constant map.
    denom = (max_val - min_val) if (max_val - min_val) != 0 else 1e-8
    # Rescale to [0, 255] for display.
    normalized = ((score_map - min_val) / denom * 255).astype(np.uint8)
    heatmap = cv2.applyColorMap(normalized, cv2.COLORMAP_VIRIDIS)
    heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)

    # Vertical legend: gradient from 255 (top) down to 0 (bottom).
    legend_width = 30
    legend_height = heatmap.shape[0]
    gradient = np.linspace(255, 0, legend_height, dtype=np.uint8).reshape(-1, 1)
    gradient = np.repeat(gradient, legend_width, axis=1)
    legend = cv2.applyColorMap(gradient, cv2.COLORMAP_VIRIDIS)
    legend = cv2.cvtColor(legend, cv2.COLOR_BGR2RGB)

    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.4
    thickness = 1
    cv2.putText(legend, f"{max_val:.2f}", (2, 12), font, font_scale, (255, 255, 255), thickness, cv2.LINE_AA)
    cv2.putText(legend, f"{min_val:.2f}", (2, legend_height - 4), font, font_scale, (255, 255, 255), thickness, cv2.LINE_AA)

    return np.concatenate([heatmap, legend], axis=1)


def inference(input_str, input_image):
    """Run LISA on one text instruction and one image for the Gradio UI.

    Args:
        input_str: Instruction text typed by the user.
        input_image: Filesystem path of the uploaded image.

    Returns:
        A tuple ``(output_image, output_str)``: an RGB image (the score heatmap
        of the last non-empty predicted mask, or a placeholder image) and the
        text shown to the user.
    """
    ## filter out special chars
    input_str = bleach.clean(input_str)
    print("input_str: ", input_str, "input_image: ", input_image)

    ## input valid check: only letters, spaces and basic punctuation are accepted
    if not re.match(r"^[A-Za-z ,.!?\'\"]+$", input_str) or len(input_str) < 1:
        # Build a single string; the previous comma produced a tuple.
        output_str = "[Error] Invalid input: " + input_str
        return _load_placeholder_rgb("./resources/error_happened.png"), output_str

    # Reject a missing or unreadable image before touching the model.
    image_np = cv2.imread(input_image) if input_image else None
    if image_np is None:
        output_str = "[Error] Could not read the input image."
        return _load_placeholder_rgb("./resources/error_happened.png"), output_str
    image_np = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
    original_size_list = [image_np.shape[:2]]

    def to_precision(tensor):
        # Match the floating type selected by --precision.
        if args.precision == "bf16":
            return tensor.bfloat16()
        if args.precision == "fp16":
            return tensor.half()
        return tensor.float()

    # Build the conversation prompt with the image placeholder in front.
    conv = conversation_lib.conv_templates[args.conv_type].copy()
    conv.messages = []
    prompt = DEFAULT_IMAGE_TOKEN + "\n" + input_str
    if args.use_mm_start_end:
        replace_token = (
            DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
        )
        prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)
    conv.append_message(conv.roles[0], prompt)
    conv.append_message(conv.roles[1], "")
    prompt = conv.get_prompt()

    # First image input: the CLIP-processed pixels.
    image_clip = (
        clip_image_processor.preprocess(image_np, return_tensors="pt")[
            "pixel_values"
        ][0]
        .unsqueeze(0)
        .cuda()
    )
    image_clip = to_precision(image_clip)

    # Second image input: resized, normalized and padded by `preprocess`.
    image = transform.apply_image(image_np)
    resize_list = [image.shape[:2]]
    image = (
        preprocess(torch.from_numpy(image).permute(2, 0, 1).contiguous())
        .unsqueeze(0)
        .cuda()
    )
    image = to_precision(image)

    input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
    input_ids = input_ids.unsqueeze(0).cuda()

    output_ids, pred_masks = model.evaluate(
        image_clip,
        image,
        input_ids,
        resize_list,
        original_size_list,
        max_new_tokens=512,
        tokenizer=tokenizer,
    )
    # Drop the image placeholder ids before decoding.
    output_ids = output_ids[0][output_ids[0] != IMAGE_TOKEN_INDEX]

    text_output = tokenizer.decode(output_ids, skip_special_tokens=False)
    text_output = text_output.replace("\n", "").replace("  ", " ")
    text_output = text_output.split("ASSISTANT: ")[-1]
    print("text_output: ", text_output)

    # Render each non-empty mask; as before, the last one is what is returned.
    save_img = None
    for pred_mask in pred_masks:
        if pred_mask.shape[0] == 0:
            continue
        save_img = _render_score_heatmap(pred_mask.detach().cpu().numpy()[0])

    output_str = "ASSISTANT: " + text_output
    if save_img is not None:
        output_image = save_img
    else:
        ## no seg output
        output_image = _load_placeholder_rgb("./resources/no_seg_out.png")
    return output_image, output_str
# Assemble the Gradio interface around `inference`: a text box plus an image
# upload (passed to the callback as a file path) in, an image plus text out.
demo_inputs = [
    gr.Textbox(lines=1, placeholder=None, label="Text Instruction"),
    gr.Image(type="filepath", label="Input Image"),
]
demo_outputs = [
    gr.Image(type="pil", label="Segmentation Output"),
    gr.Textbox(lines=1, placeholder=None, label="Text Output"),
]
demo = gr.Interface(
    inference,
    inputs=demo_inputs,
    outputs=demo_outputs,
    title=title,
    description=description,
    article=article,
    examples=None,
    allow_flagging="auto",
)

# Enable request queuing, then start the server.
demo.queue()
demo.launch()

chat.py ADDED Viewed

	@@ -0,0 +1,359 @@

+###############################
+# chat.py
+# Inference for LISA (terminal-based)
+###############################
+import argparse
+import os
+import sys
+import cv2
+from matplotlib import pyplot as plt
+import numpy as np
+import torch
+import torch.nn.functional as F
+from transformers import AutoTokenizer, BitsAndBytesConfig, CLIPImageProcessor
+from model.LISA import LISAForCausalLM
+from model.llava import conversation as conversation_lib
+from model.llava.mm_utils import tokenizer_image_token
+from model.segment_anything.utils.transforms import ResizeLongestSide
+from utils.utils import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN,
+                         DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX)
def parse_args(args):
    """Parse the command-line options of the terminal-based LISA chat.

    Args:
        args: Sequence of argument strings to parse (``main`` forwards
            ``sys.argv[1:]``).

    Returns:
        The ``argparse.Namespace`` holding the parsed options.
    """
    parser = argparse.ArgumentParser(description="LISA chat")
    parser.add_argument("--version", default="xinlai/LISA-13B-llama2-v1")
    parser.add_argument("--vis_save_path", default="./vis_output", type=str)
    parser.add_argument(
        "--precision",
        default="bf16",
        type=str,
        choices=["fp32", "bf16", "fp16"],
        help="precision for inference",
    )
    parser.add_argument("--image_size", default=1024, type=int, help="image size")
    parser.add_argument("--model_max_length", default=512, type=int)
    parser.add_argument("--lora_r", default=8, type=int)
    parser.add_argument(
        "--vision-tower", default="openai/clip-vit-large-patch14", type=str
    )
    parser.add_argument("--local-rank", default=0, type=int, help="node rank")
    parser.add_argument("--load_in_8bit", action="store_true", default=False)
    parser.add_argument("--load_in_4bit", action="store_true", default=False)
    # ``store_true`` combined with ``default=True`` can never produce False, so
    # the original flag is kept for compatibility and paired with an explicit
    # negative switch that writes to the same destination.
    parser.add_argument("--use_mm_start_end", action="store_true", default=True)
    parser.add_argument(
        "--no_use_mm_start_end",
        dest="use_mm_start_end",
        action="store_false",
        help="do not wrap the image token in image start/end tokens",
    )
    parser.add_argument(
        "--conv_type",
        default="llava_v1",
        type=str,
        choices=["llava_v1", "llava_llama_2"],
    )
    return parser.parse_args(args)
def preprocess(
    x,
    pixel_mean=torch.Tensor([123.675, 116.28, 103.53]).view(-1, 1, 1),
    pixel_std=torch.Tensor([58.395, 57.12, 57.375]).view(-1, 1, 1),
    img_size=1024,
) -> torch.Tensor:
    """Normalize pixel values and pad to a square input.

    Args:
        x: Image tensor whose last two dimensions are height and width and
            whose channel dimension broadcasts against ``pixel_mean``.
        pixel_mean: Per-channel mean subtracted from ``x``.
        pixel_std: Per-channel standard deviation ``x`` is divided by.
        img_size: Side length of the padded output.

    Returns:
        The normalized tensor, zero-padded on the right and bottom so that its
        last two dimensions are ``img_size`` x ``img_size``.
    """
    # Keep the statistics on the input's device so non-CPU tensors also work;
    # for CPU inputs this is a no-op.
    mean = pixel_mean.to(x.device)
    std = pixel_std.to(x.device)
    x = (x - mean) / std

    h, w = x.shape[-2:]
    # F.pad orders the amounts as (left, right, top, bottom) for the last two
    # dimensions. NOTE: a side longer than img_size gives a negative amount,
    # which F.pad applies as a crop rather than raising.
    return F.pad(x, (0, img_size - w, 0, img_size - h))
def _cast_to_precision(tensor, precision):
    """Cast ``tensor`` to the floating type selected by ``--precision``."""
    if precision == "bf16":
        return tensor.bfloat16()
    if precision == "fp16":
        return tensor.half()
    return tensor.float()


def _load_model(args):
    """Build the tokenizer, LISA model and image pre-processors.

    Stores the id of the "[SEG]" token on ``args.seg_token_idx`` as a side
    effect.

    Returns:
        A tuple ``(tokenizer, model, clip_image_processor, transform)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        args.version,
        cache_dir=None,
        model_max_length=args.model_max_length,
        padding_side="right",
        use_fast=False,
    )
    tokenizer.pad_token = tokenizer.unk_token
    # NOTE(review): "[SEG]" and the image start/end tokens are not added here;
    # presumably the checkpoint's tokenizer already contains them - confirm.
    args.seg_token_idx = tokenizer("[SEG]", add_special_tokens=False).input_ids[0]

    torch_dtype = torch.float32
    if args.precision == "bf16":
        torch_dtype = torch.bfloat16
    elif args.precision == "fp16":
        torch_dtype = torch.half

    # Quantized loading forces fp16 and leaves "visual_model" unquantized.
    kwargs = {"torch_dtype": torch_dtype}
    if args.load_in_4bit:
        kwargs.update(
            {
                "torch_dtype": torch.half,
                "load_in_4bit": True,
                "quantization_config": BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_compute_dtype=torch.float16,
                    bnb_4bit_use_double_quant=True,
                    bnb_4bit_quant_type="nf4",
                    llm_int8_skip_modules=["visual_model"],
                ),
            }
        )
    elif args.load_in_8bit:
        kwargs.update(
            {
                "torch_dtype": torch.half,
                "quantization_config": BitsAndBytesConfig(
                    llm_int8_skip_modules=["visual_model"],
                    load_in_8bit=True,
                ),
            }
        )

    model = LISAForCausalLM.from_pretrained(
        args.version,
        low_cpu_mem_usage=True,
        vision_tower=args.vision_tower,
        seg_token_idx=args.seg_token_idx,
        **kwargs,
    )

    model.config.eos_token_id = tokenizer.eos_token_id
    model.config.bos_token_id = tokenizer.bos_token_id
    model.config.pad_token_id = tokenizer.pad_token_id

    model.get_model().initialize_vision_modules(model.get_model().config)
    vision_tower = model.get_model().get_vision_tower()
    vision_tower.to(dtype=torch_dtype)

    if args.precision == "bf16":
        model = model.bfloat16().cuda()
    elif (
        args.precision == "fp16" and (not args.load_in_4bit) and (not args.load_in_8bit)
    ):
        # The vision tower is detached while DeepSpeed wraps the model and is
        # re-attached (fp16, on GPU) afterwards.
        vision_tower = model.get_model().get_vision_tower()
        model.model.vision_tower = None
        import deepspeed

        model_engine = deepspeed.init_inference(
            model=model,
            dtype=torch.half,
            replace_with_kernel_inject=True,
            replace_method="auto",
        )
        model = model_engine.module
        model.model.vision_tower = vision_tower.half().cuda()
    elif args.precision == "fp32":
        model = model.float().cuda()

    vision_tower = model.get_model().get_vision_tower()
    vision_tower.to(device=args.local_rank)

    clip_image_processor = CLIPImageProcessor.from_pretrained(model.config.vision_tower)
    transform = ResizeLongestSide(args.image_size)

    model.eval()
    return tokenizer, model, clip_image_processor, transform


def _show_and_save_figure(image_np, pred_mask_np, question, text_output, save_path):
    """Plot the image, the thresholded mask overlay and the raw mask, then save.

    The figure is shown in a blocking window first and written to ``save_path``
    once the window has been closed.
    """
    image_rgb = image_np.astype(np.float32)

    # Binary mask (> 0) blended half-and-half with red on a copy of the image.
    binary_mask = pred_mask_np > 0
    masked_image = image_rgb.copy()
    red_color = np.array([255, 0, 0], dtype=np.float32)
    masked_image[binary_mask] = image_rgb[binary_mask] * 0.5 + red_color * 0.5

    # Raw scores rescaled to [0, 1]; guard against a constant mask.
    min_val = float(pred_mask_np.min())
    max_val = float(pred_mask_np.max())
    denom = (max_val - min_val) if (max_val - min_val) != 0 else 1e-8
    normalized_mask = (pred_mask_np - min_val) / denom

    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 5))
    try:
        ax1.imshow(image_rgb.astype(np.uint8))
        ax1.set_title("Original Image")
        ax1.axis("off")

        ax2.imshow(masked_image.astype(np.uint8))
        ax2.set_title("Binary Mask (>0) in Red")
        ax2.axis("off")

        im3 = ax3.imshow(normalized_mask, cmap='jet', vmin=0, vmax=1)
        ax3.set_title("Raw Mask (Continuous)")
        ax3.axis("off")
        cbar = fig.colorbar(im3, ax=ax3, fraction=0.046, pad=0.04)
        cbar.set_label("Normalized Mask Value")

        fig.suptitle(f"Question: {question}")
        # str.find returns -1 when the marker is absent, which would slice off
        # everything but the last character; fall back to the full text.
        marker = text_output.find("ASSISTANT")
        answer = text_output[marker:] if marker >= 0 else text_output
        fig.text(0.5, 0.05, f"{answer}", ha='center', va='center')

        # Blocks until the window is closed, then the figure is written out.
        plt.show(block=True)
        fig.savefig(save_path)
        print(f"Figure saved to: {save_path}")
    finally:
        # Release the figure even if plotting or saving failed.
        plt.close(fig)


def main(args):
    """Run the interactive terminal loop for LISA inference.

    Repeatedly asks for a text prompt and an image path, runs the model, prints
    the decoded text and, for every non-empty predicted mask, shows and saves a
    three-panel Matplotlib figure under ``--vis_save_path``. The loop ends on
    end-of-input or Ctrl-C at a prompt.

    Args:
        args: Command-line argument strings (without the program name).
    """
    args = parse_args(args)
    os.makedirs(args.vis_save_path, exist_ok=True)

    tokenizer, model, clip_image_processor, transform = _load_model(args)

    while True:
        try:
            question = input("Please input your prompt: ")
            image_path = input("Please input the image path: ")
        except (EOFError, KeyboardInterrupt):
            # Leave the chat loop cleanly instead of dumping a traceback.
            print()
            break

        if not os.path.exists(image_path):
            print("File not found in {}".format(image_path))
            continue
        image_np = cv2.imread(image_path)
        if image_np is None:
            # The path exists but OpenCV could not decode it as an image.
            print("Could not read image {}".format(image_path))
            continue
        image_np = cv2.cvtColor(image_np, cv2.COLOR_BGR2RGB)
        original_size_list = [image_np.shape[:2]]

        # Build the conversation prompt with the image placeholder in front.
        conv = conversation_lib.conv_templates[args.conv_type].copy()
        conv.messages = []
        prompt = DEFAULT_IMAGE_TOKEN + "\n" + question
        if args.use_mm_start_end:
            replace_token = (
                DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
            )
            prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)
        conv.append_message(conv.roles[0], prompt)
        conv.append_message(conv.roles[1], "")
        prompt = conv.get_prompt()

        # First image input: the CLIP-processed pixels.
        image_clip = (
            clip_image_processor.preprocess(image_np, return_tensors="pt")[
                "pixel_values"
            ][0]
            .unsqueeze(0)
            .cuda()
        )
        image_clip = _cast_to_precision(image_clip, args.precision)

        # Second image input: resized, normalized and padded by `preprocess`.
        image = transform.apply_image(image_np)
        resize_list = [image.shape[:2]]
        image = (
            preprocess(torch.from_numpy(image).permute(2, 0, 1).contiguous())
            .unsqueeze(0)
            .cuda()
        )
        image = _cast_to_precision(image, args.precision)

        input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
        input_ids = input_ids.unsqueeze(0).cuda()

        output_ids, pred_masks = model.evaluate(
            image_clip,
            image,
            input_ids,
            resize_list,
            original_size_list,
            max_new_tokens=512,
            tokenizer=tokenizer,
        )
        # Drop the image placeholder ids before decoding.
        output_ids = output_ids[0][output_ids[0] != IMAGE_TOKEN_INDEX]

        text_output = tokenizer.decode(output_ids, skip_special_tokens=False)
        text_output = text_output.replace("\n", "").replace("  ", " ")
        print("text_output: ", text_output)

        # File stem of the input image, used to name the saved figures.
        image_stem = os.path.splitext(os.path.basename(image_path))[0]
        for i, pred_mask in enumerate(pred_masks):
            if pred_mask.shape[0] == 0:
                continue
            pred_mask_np = pred_mask.detach().cpu().numpy()[0]
            save_path = "{}/{}_matplotlib_{}.png".format(
                args.vis_save_path, image_stem, i
            )
            _show_and_save_figure(
                image_np, pred_mask_np, question, text_output, save_path
            )
if __name__ == "__main__":
    # Forward everything after the script name to main().
    cli_args = sys.argv[1:]
    main(cli_args)

imgs/blackpink.jpg ADDED Viewed

Git LFS Details

SHA256: f69807ec94491a7c5936b97b9e244bfd29c098a25328459cc8d8ea8a9830725a
Pointer size: 131 Bytes
Size of remote file: 144 kB

imgs/camera_lens.jpg ADDED Viewed

Git LFS Details

SHA256: c82eef6e52a0c60cdaaad36b7576b97e63b4d6e4dabc5bf96f416549a35a71f6
Pointer size: 132 Bytes
Size of remote file: 1.49 MB

imgs/car_speed.jpg ADDED Viewed

Git LFS Details

SHA256: 20765cb15cf29a2572e1bbd21aac7af8cdf6509b0f2ab9d9059935d134865568
Pointer size: 131 Bytes
Size of remote file: 131 kB

imgs/dog_with_horn.jpg ADDED Viewed

Git LFS Details

SHA256: f4b9a1df6b36dbd512f40c82f71cdd9e1889abe990b85ab11201c1104c8c35b2
Pointer size: 131 Bytes
Size of remote file: 169 kB

imgs/example1.jpg ADDED Viewed

Git LFS Details

SHA256: 478d6b7d3347daf11e26f3c69327ebb7cbe1626e294811fe86b6f2922a305e56
Pointer size: 131 Bytes
Size of remote file: 969 kB

imgs/example2.jpg ADDED Viewed

Git LFS Details

SHA256: 6d0fb0badeb946a5f34a39d938936abcd6b74cc9b9e039891b9fc82eaafd318f
Pointer size: 131 Bytes
Size of remote file: 126 kB

imgs/fig_overview.jpg ADDED Viewed

Git LFS Details

SHA256: 6914cf00986a64f6c853972690d743bed77676450103854f12b91e5bf2339cba
Pointer size: 131 Bytes
Size of remote file: 496 kB

imgs/jackma.jpg ADDED Viewed

Git LFS Details

SHA256: d5f4f94b281bbbda835d9a5747eed2725dbcac53728db42f8b58add0b347f254
Pointer size: 130 Bytes
Size of remote file: 58.7 kB

imgs/obama.jpg ADDED Viewed

Git LFS Details

SHA256: 5cb6218c216eb0e4607cba1b0903764abd90ae190c6320687647ae0562e2cfa2
Pointer size: 131 Bytes
Size of remote file: 442 kB

imgs/stand_higher.jpg ADDED Viewed

Git LFS Details

SHA256: fee23809c1bc4bb318bfce1e913ad83c1a63b9e67a577953c8e34d0e6f951bbe
Pointer size: 132 Bytes
Size of remote file: 2.05 MB

imgs/table1.jpg ADDED Viewed

Git LFS Details

SHA256: c254701c0b50c8f79e0b3188b83f74887dc463fa62bc5017d32c565b9fa701de
Pointer size: 131 Bytes
Size of remote file: 600 kB

imgs/teaser.jpg ADDED Viewed

Git LFS Details

SHA256: 2b4f4cc3dce4f6c858ef9257236f93ee33ecd4084ec929df16dac8ad34245bdd
Pointer size: 132 Bytes
Size of remote file: 1.22 MB

imgs/trump.jpg ADDED Viewed

Git LFS Details

SHA256: a9fa2759730114f825238cb9f8636a7dd16ab38cb3b507a3b56dd791f113ef6c
Pointer size: 131 Bytes
Size of remote file: 596 kB

imgs/wash_hands.jpg ADDED Viewed

Git LFS Details

SHA256: e718286736ec1a3c72cd6b430e9c3345a526175578ee09c03ded71c3d3567de3
Pointer size: 132 Bytes
Size of remote file: 1.38 MB

merge_lora_weights_and_save_hf_model.py ADDED Viewed

	@@ -0,0 +1,159 @@

+import argparse
+import glob
+import os
+import sys
+import cv2
+import numpy as np
+import torch
+import torch.nn.functional as F
+import transformers
+from peft import LoraConfig, get_peft_model
+from transformers import AutoTokenizer
+from model.LISA import LISAForCausalLM
+from utils.utils import DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN
def parse_args(args):
    """Parse the command-line options of the LoRA merge script.

    Args:
        args: Sequence of argument strings to parse (without the program
            name).

    Returns:
        The ``argparse.Namespace`` holding the parsed options. ``--weight``
        and ``--save_path`` are mandatory.
    """
    parser = argparse.ArgumentParser(
        description="merge lora weights and save model with hf format"
    )
    parser.add_argument(
        "--version", default="liuhaotian/llava-llama-2-13b-chat-lightning-preview"
    )
    parser.add_argument("--vis_save_path", default="./vis_output", type=str)
    parser.add_argument(
        "--precision",
        default="bf16",
        type=str,
        choices=["fp32", "bf16", "fp16"],
        help="precision for inference",
    )
    parser.add_argument("--vision_pretrained", default="PATH_TO_SAM_ViT-H", type=str)
    parser.add_argument("--out_dim", default=256, type=int)
    parser.add_argument("--image_size", default=1024, type=int, help="image size")
    parser.add_argument("--model_max_length", default=512, type=int)
    parser.add_argument(
        "--vision-tower", default="openai/clip-vit-large-patch14", type=str
    )
    parser.add_argument("--lora_r", default=8, type=int)
    parser.add_argument("--lora_alpha", default=16, type=int)
    parser.add_argument("--lora_dropout", default=0.05, type=float)
    parser.add_argument("--lora_target_modules", default="q_proj,v_proj", type=str)
    parser.add_argument("--local-rank", default=0, type=int, help="node rank")
    # ``store_true`` combined with ``default=True`` can never produce False, so
    # each such flag is kept for compatibility and paired with an explicit
    # negative switch that writes to the same destination.
    parser.add_argument("--train_mask_decoder", action="store_true", default=True)
    parser.add_argument(
        "--no_train_mask_decoder",
        dest="train_mask_decoder",
        action="store_false",
        help="set train_mask_decoder to False",
    )
    parser.add_argument("--use_mm_start_end", action="store_true", default=True)
    parser.add_argument(
        "--no_use_mm_start_end",
        dest="use_mm_start_end",
        action="store_false",
        help="do not add the image start/end tokens to the tokenizer",
    )
    parser.add_argument(
        "--conv_type",
        default="llava_v1",
        type=str,
        choices=["llava_v1", "llava_llama_2"],
    )
    # Both options are required, so their defaults are never used.
    parser.add_argument("--weight", default="", type=str, required=True)
    parser.add_argument("--save_path", default="./lisa_model", type=str, required=True)
    return parser.parse_args(args)
def _find_linear_layers(model, lora_target_modules):
    """Return the sorted names of the ``torch.nn.Linear`` modules to wrap with LoRA.

    A module is selected when its name contains at least one entry of
    ``lora_target_modules`` and none of the excluded sub-model names.
    """
    excluded = ("visual_model", "vision_tower", "mm_projector", "text_hidden_fcs")
    return sorted(
        {
            name
            for name, module in model.named_modules()
            if isinstance(module, torch.nn.Linear)
            and not any(skip in name for skip in excluded)
            and any(target in name for target in lora_target_modules)
        }
    )


def main(args):
    """Load a trained LISA state dict, merge its LoRA weights and export it.

    Steps: build the tokenizer (adding ``[SEG]`` and, optionally, the image
    start/end tokens), instantiate ``LISAForCausalLM`` from ``--version``,
    wrap it with LoRA when ``--lora_r > 0``, load ``--weight`` strictly,
    merge the adapters back into the base weights and save the model
    (without ``vision_tower`` tensors) and tokenizer to ``--save_path``.

    Args:
        args: Command-line argument strings, forwarded to ``parse_args``.
    """
    args = parse_args(args)
    os.makedirs(args.vis_save_path, exist_ok=True)

    # Create model
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        args.version,
        cache_dir=None,
        model_max_length=args.model_max_length,
        padding_side="right",
        use_fast=False,
    )
    tokenizer.pad_token = tokenizer.unk_token
    tokenizer.add_tokens("[SEG]")
    args.seg_token_idx = tokenizer("[SEG]", add_special_tokens=False).input_ids[0]

    if args.use_mm_start_end:
        tokenizer.add_tokens(
            [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True
        )

    model_args = {
        "train_mask_decoder": args.train_mask_decoder,
        "out_dim": args.out_dim,
        "seg_token_idx": args.seg_token_idx,
        "vision_tower": args.vision_tower,
    }

    torch_dtype = torch.float32
    if args.precision == "bf16":
        torch_dtype = torch.bfloat16
    elif args.precision == "fp16":
        torch_dtype = torch.half

    model = LISAForCausalLM.from_pretrained(
        args.version, torch_dtype=torch_dtype, low_cpu_mem_usage=True, **model_args
    )
    model.config.eos_token_id = tokenizer.eos_token_id
    model.config.bos_token_id = tokenizer.bos_token_id
    model.config.pad_token_id = tokenizer.pad_token_id

    model.get_model().initialize_vision_modules(model.get_model().config)
    vision_tower = model.get_model().get_vision_tower()
    vision_tower.to(dtype=torch_dtype)
    model.get_model().initialize_lisa_modules(model.get_model().config)

    # The checkpoint keys must match the module tree, so LoRA wrapping has to
    # happen before the state dict is loaded.
    use_lora = args.lora_r > 0
    if use_lora:
        lora_config = LoraConfig(
            r=args.lora_r,
            lora_alpha=args.lora_alpha,
            target_modules=_find_linear_layers(
                model, args.lora_target_modules.split(",")
            ),
            lora_dropout=args.lora_dropout,
            bias="none",
            task_type="CAUSAL_LM",
        )
        model = get_peft_model(model, lora_config)
        model.print_trainable_parameters()

    model.resize_token_embeddings(len(tokenizer))

    # NOTE(security): torch.load unpickles the file; only pass checkpoints
    # from a trusted source via --weight.
    state_dict = torch.load(args.weight, map_location="cpu")
    model.load_state_dict(state_dict, strict=True)

    # merge_and_unload only exists on the PEFT wrapper; without LoRA the model
    # is already a plain LISAForCausalLM and can be saved directly.
    if use_lora:
        model = model.merge_and_unload()

    # The vision tower is left out of the exported weights.
    state_dict = {
        key: value
        for key, value in model.state_dict().items()
        if "vision_tower" not in key
    }
    model.save_pretrained(args.save_path, state_dict=state_dict)
    tokenizer.save_pretrained(args.save_path)


if __name__ == "__main__":
    main(sys.argv[1:])

model/LISA.py ADDED Viewed

	@@ -0,0 +1,427 @@

+from typing import List
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import BitsAndBytesConfig, CLIPVisionModel
+from utils.utils import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN,
+                         DEFAULT_IMAGE_PATCH_TOKEN)
+from .llava.model.language_model.llava_llama import (LlavaLlamaForCausalLM,
+                                                     LlavaLlamaModel)
+from .segment_anything import build_sam_vit_h
def dice_loss(
    inputs: torch.Tensor,
    targets: torch.Tensor,
    num_masks: float,
    scale=1000,  # 100000.0,
    eps=1e-6,
):
    """
    Compute the DICE loss between mask logits and binary targets.
    Args:
        inputs: Float tensor of logits; a sigmoid is applied internally.
                Dimensions 1 and 2 are flattened together, so at least
                three dimensions are required.
        targets: Float tensor with the same shape as inputs holding the
                 binary label of every element
                (0 for the negative class and 1 for the positive class).
        num_masks: Normalizer the summed loss is divided by.
        scale: Divisor applied to both terms before they are summed.
        eps: Smoothing term added to numerator and denominator.
    Returns:
        Scalar tensor: sum of the per-row DICE losses divided by num_masks.
    """
    probs = torch.sigmoid(inputs).flatten(1, 2)
    flat_targets = targets.flatten(1, 2)

    # Everything is divided by `scale` before the reduction.
    scaled_probs = probs / scale
    overlap = (scaled_probs * flat_targets).sum(-1)
    total = scaled_probs.sum(-1) + (flat_targets / scale).sum(-1)

    per_mask = 1 - (2 * overlap + eps) / (total + eps)
    return per_mask.sum() / (num_masks + 1e-8)
def sigmoid_ce_loss(
    inputs: torch.Tensor,
    targets: torch.Tensor,
    num_masks: float,
):
    """
    Binary cross-entropy on logits, averaged per mask and normalized.
    Args:
        inputs: Float tensor of logits. Dimensions 1 and 2 are flattened
                together, so at least three dimensions are required.
        targets: Float tensor with the same shape as inputs holding the
                 binary label of every element
                (0 for the negative class and 1 for the positive class).
        num_masks: Normalizer the summed loss is divided by.
    Returns:
        Scalar loss tensor
    """
    per_element = F.binary_cross_entropy_with_logits(
        inputs, targets, reduction="none"
    )
    # Mean over the flattened dims of each mask, then sum across masks.
    per_mask = per_element.flatten(1, 2).mean(1)
    return per_mask.sum() / (num_masks + 1e-8)
class LisaMetaModel:
    """Mixin that adds the segmentation modules to a language-model backbone.

    It is meant to precede a backbone class in the MRO: ``__init__`` forwards
    ``config`` to the next class via ``super()``.
    """

    def __init__(
        self,
        config,
        **kwargs,
    ):
        """Store the config and, for already-configured models, build the modules.

        When ``config`` has no ``train_mask_decoder`` attribute, the values of
        ``kwargs["train_mask_decoder"]`` and ``kwargs["out_dim"]`` are copied
        onto it and module creation is left to the caller. Otherwise
        ``initialize_lisa_modules`` runs immediately.
        """
        super(LisaMetaModel, self).__init__(config)
        self.config = config
        # Needed by initialize_lisa_modules, so set it on both paths.
        self.vision_pretrained = kwargs.get("vision_pretrained", None)

        if hasattr(self.config, "train_mask_decoder"):
            self.initialize_lisa_modules(self.config)
        else:
            self.config.train_mask_decoder = kwargs["train_mask_decoder"]
            self.config.out_dim = kwargs["out_dim"]

    def initialize_lisa_modules(self, config):
        """Create ``visual_model`` and the ``text_hidden_fcs`` projection.

        Reads ``config.train_mask_decoder``, ``config.hidden_size`` and
        ``config.out_dim``.
        """
        # SAM: frozen everywhere, except the mask decoder when requested.
        self.visual_model = build_sam_vit_h(self.vision_pretrained)
        for weight in self.visual_model.parameters():
            weight.requires_grad = False
        if config.train_mask_decoder:
            decoder = self.visual_model.mask_decoder
            decoder.train()
            for weight in decoder.parameters():
                weight.requires_grad = True

        # Projection layer: hidden_size -> hidden_size -> out_dim.
        hidden_dim, out_dim = config.hidden_size, config.out_dim
        projection = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, out_dim),
            nn.Dropout(0.0),
        )
        self.text_hidden_fcs = nn.ModuleList([projection])
        self.text_hidden_fcs.train()
        for weight in self.text_hidden_fcs.parameters():
            weight.requires_grad = True
class LisaModel(LisaMetaModel, LlavaLlamaModel):
    """LLaVA-LLaMA backbone combined with the LISA segmentation mixin."""

    def __init__(
        self,
        config,
        **kwargs,
    ):
        """Initialise both bases, then pin the multimodal config fields."""
        super(LisaModel, self).__init__(config, **kwargs)

        cfg = self.config
        cfg.use_cache = False
        # Mirror the tower name under the key without the ``mm_`` prefix.
        cfg.vision_tower = cfg.mm_vision_tower

        fixed_settings = {
            "mm_vision_select_feature": "patch",
            "image_aspect_ratio": "square",
            "image_grid_pinpoints": None,
            "tune_mm_mlp_adapter": False,
            "freeze_mm_mlp_adapter": True,
            "pretrain_mm_mlp_adapter": None,
            "mm_use_im_patch_token": False,
        }
        for field_name, field_value in fixed_settings.items():
            setattr(cfg, field_name, field_value)
class LISAForCausalLM(LlavaLlamaForCausalLM):
    """Causal LM whose ``[SEG]`` token hidden states are decoded into masks.

    The hidden state at every ``[SEG]`` position is projected by
    ``model.text_hidden_fcs`` and passed as a text prompt to
    ``model.visual_model``, which decodes one mask per ``[SEG]`` token.
    """

    # Number of all-False columns put in front of the [SEG] mask so it lines
    # up with the hidden states. Hack for IMAGE_TOKEN_INDEX: it is assumed
    # that there is only one image and that it sits at the front; presumably
    # the placeholder expands to 256 visual positions - TODO confirm.
    _IMAGE_TOKEN_PAD = 255

    def __init__(
        self,
        config,
        **kwargs,
    ):
        """Build the model from ``config`` plus LISA-specific keyword arguments.

        Keyword Args:
            seg_token_idx: Token id of ``[SEG]`` (required).
            ce_loss_weight, dice_loss_weight, bce_loss_weight: Loss weights
                used by ``model_forward`` in training mode (default ``None``).
            use_mm_start_end, vision_tower: Only read when ``config`` has no
                ``train_mask_decoder`` attribute yet.
            Remaining keyword arguments are forwarded to ``LisaModel``.
        """
        if not hasattr(config, "train_mask_decoder"):
            config.mm_use_im_start_end = kwargs.pop("use_mm_start_end", True)
            config.mm_vision_tower = kwargs.get(
                "vision_tower", "openai/clip-vit-large-patch14"
            )
        else:
            config.mm_vision_tower = config.vision_tower

        # Consume the loss weights on both paths so the attributes always
        # exist, including when the config already carries train_mask_decoder.
        self.ce_loss_weight = kwargs.pop("ce_loss_weight", None)
        self.dice_loss_weight = kwargs.pop("dice_loss_weight", None)
        self.bce_loss_weight = kwargs.pop("bce_loss_weight", None)
        self.seg_token_idx = kwargs.pop("seg_token_idx")

        super().__init__(config)
        self.model = LisaModel(config, **kwargs)
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        # Initialize weights and apply final processing
        self.post_init()

    def get_visual_embs(self, pixel_values: torch.FloatTensor):
        """Encode each image with ``visual_model.image_encoder`` without gradients.

        Images are encoded one at a time and the results concatenated along
        dim 0.
        """
        with torch.no_grad():
            image_embeddings_list = []
            for i in range(pixel_values.shape[0]):
                torch.cuda.empty_cache()
                image_embeddings = self.model.visual_model.image_encoder(
                    pixel_values[i].unsqueeze(0)
                )
                image_embeddings_list.append(image_embeddings)
            torch.cuda.empty_cache()
            image_embeddings = torch.cat(image_embeddings_list, 0)
        return image_embeddings

    def forward(self, **kwargs):
        """Dispatch to the parent forward when ``past_key_values`` is passed.

        Every other call is routed to ``model_forward``.
        """
        if "past_key_values" in kwargs:
            return super().forward(**kwargs)
        return self.model_forward(**kwargs)

    def _build_seg_token_mask(self, token_ids, pad_tail):
        """Return the boolean ``[SEG]`` mask for ``token_ids``, padded for alignment.

        The comparison is done on ``token_ids[:, 1:]``. ``_IMAGE_TOKEN_PAD``
        False columns are prepended and, when ``pad_tail`` is true, one False
        column is appended. Padding is created on the device of ``token_ids``.
        """
        mask = token_ids[:, 1:] == self.seg_token_idx
        rows, device = mask.shape[0], mask.device
        parts = [
            torch.zeros((rows, self._IMAGE_TOKEN_PAD), dtype=torch.bool, device=device),
            mask,
        ]
        if pad_tail:
            parts.append(torch.zeros((rows, 1), dtype=torch.bool, device=device))
        return torch.cat(parts, dim=1)

    @staticmethod
    def _seg_token_boundaries(seg_token_mask):
        """Return cumulative ``[SEG]`` counts per row, with a leading zero."""
        counts = seg_token_mask.int().sum(-1)  # [bs, ]
        ends = counts.cumsum(-1)
        start = torch.zeros(1, dtype=torch.long, device=ends.device)
        return torch.cat([start, ends], dim=0)

    @staticmethod
    def _split_at(embeddings, boundaries):
        """Slice ``embeddings`` into consecutive chunks delimited by ``boundaries``."""
        return [
            embeddings[boundaries[i] : boundaries[i + 1]]
            for i in range(len(boundaries) - 1)
        ]

    def _decode_masks(
        self, pred_embeddings, image_embeddings, resize_list, original_sizes
    ):
        """Decode one group of masks per image from its ``[SEG]`` embeddings.

        For entry ``i``, the embeddings are fed to the prompt encoder as
        ``text_embeds``, decoded against ``image_embeddings[i]`` and
        post-processed with ``resize_list[i]`` / ``original_sizes[i]``.
        Returns a list with channel 0 of every post-processed result.
        """
        visual_model = self.model.visual_model
        pred_masks = []
        for i, seg_embeds in enumerate(pred_embeddings):
            sparse_embeddings, dense_embeddings = visual_model.prompt_encoder(
                points=None,
                boxes=None,
                masks=None,
                text_embeds=seg_embeds.unsqueeze(1),
            )
            sparse_embeddings = sparse_embeddings.to(seg_embeds.dtype)
            low_res_masks, _ = visual_model.mask_decoder(
                image_embeddings=image_embeddings[i].unsqueeze(0),
                image_pe=visual_model.prompt_encoder.get_dense_pe(),
                sparse_prompt_embeddings=sparse_embeddings,
                dense_prompt_embeddings=dense_embeddings,
                multimask_output=False,
            )
            pred_mask = visual_model.postprocess_masks(
                low_res_masks,
                input_size=resize_list[i],
                original_size=original_sizes[i],
            )
            pred_masks.append(pred_mask[:, 0])
        return pred_masks

    def model_forward(
        self,
        images: torch.FloatTensor,
        images_clip: torch.FloatTensor,
        input_ids: torch.LongTensor,
        labels: torch.LongTensor,
        attention_masks: torch.LongTensor,
        offset: torch.LongTensor,
        masks_list: List[torch.FloatTensor],
        label_list: List[torch.Tensor],
        resize_list: List[tuple],
        inference: bool = False,
        **kwargs,
    ):
        """Run the language model and the mask decoder, returning losses or masks.

        Args:
            images: Batch fed to ``visual_model.image_encoder``.
            images_clip: 4-D batch passed to the parent ``forward`` as
                ``images``.
            input_ids: Token ids; several rows may belong to one image.
            labels: Language-model targets (training only).
            attention_masks: Attention mask matching ``input_ids``.
            offset: Row boundaries; image ``i`` owns rows
                ``offset[i]:offset[i + 1]`` of ``input_ids``.
            masks_list: Ground-truth masks, one entry per image.
            label_list: Per-image tensors; only ``.shape`` is used, as the
                original size for mask post-processing.
            resize_list: Per-image ``input_size`` for mask post-processing.
            inference: When true, skip the losses and return the masks.

        Returns:
            In inference mode, a dict with ``pred_masks`` and ``gt_masks``.
            Otherwise a dict with ``loss``, ``ce_loss``, ``mask_bce_loss``,
            ``mask_dice_loss`` and ``mask_loss``.
        """
        image_embeddings = self.get_visual_embs(images)
        batch_size = image_embeddings.shape[0]
        assert batch_size == len(offset) - 1

        seg_token_mask = self._build_seg_token_mask(input_ids, pad_tail=True)

        if inference:
            n_batch = 1
            length = input_ids.shape[0]
            assert images_clip.shape[0] == 1
            images_clip_extend = images_clip.expand(length, -1, -1, -1).contiguous()

            output_hidden_states = []
            for i in range(n_batch):
                start_i, end_i = i * length, min((i + 1) * length, input_ids.shape[0])
                output_i = super().forward(
                    images=images_clip_extend[: end_i - start_i],
                    attention_mask=attention_masks[start_i:end_i],
                    input_ids=input_ids[start_i:end_i],
                    output_hidden_states=True,
                )
                # NOTE(review): the concatenation below assumes the parent
                # returns a single tensor in `hidden_states` here - confirm.
                output_hidden_states.append(output_i.hidden_states)
                torch.cuda.empty_cache()

            output_hidden_states = [torch.cat(output_hidden_states, dim=0)]
            output = None
        else:
            # Repeat each image once per conversation row that refers to it.
            images_clip_list = []
            for i in range(len(offset) - 1):
                start_i, end_i = offset[i], offset[i + 1]
                images_clip_i = (
                    images_clip[i]
                    .unsqueeze(0)
                    .expand(end_i - start_i, -1, -1, -1)
                    .contiguous()
                )
                images_clip_list.append(images_clip_i)
            images_clip = torch.cat(images_clip_list, dim=0)

            output = super().forward(
                images=images_clip,
                attention_mask=attention_masks,
                input_ids=input_ids,
                labels=labels,
                output_hidden_states=True,
            )
            output_hidden_states = output.hidden_states

        assert len(self.model.text_hidden_fcs) == 1
        hidden_states = [self.model.text_hidden_fcs[0](output_hidden_states[-1])]
        last_hidden_state = torch.stack(hidden_states, dim=-1).sum(dim=-1)
        pred_embeddings = last_hidden_state[seg_token_mask]

        # Regroup the flat [SEG] embeddings per image using `offset`.
        seg_token_offset = self._seg_token_boundaries(seg_token_mask)[offset]
        pred_embeddings = self._split_at(pred_embeddings, seg_token_offset)

        pred_masks = self._decode_masks(
            pred_embeddings,
            image_embeddings,
            resize_list,
            [label.shape for label in label_list],
        )

        gt_masks = masks_list
        if inference:
            return {
                "pred_masks": pred_masks,
                "gt_masks": gt_masks,
            }

        ce_loss = output.loss * self.ce_loss_weight

        mask_bce_loss = 0
        mask_dice_loss = 0
        num_masks = 0
        for batch_idx, pred_mask in enumerate(pred_masks):
            gt_mask = gt_masks[batch_idx]
            assert (
                gt_mask.shape[0] == pred_mask.shape[0]
            ), "gt_mask.shape: {}, pred_mask.shape: {}".format(
                gt_mask.shape, pred_mask.shape
            )
            # Each helper normalizes by the mask count; multiplying it back
            # lets the totals be normalized once over the whole batch below.
            mask_count = gt_mask.shape[0]
            mask_bce_loss += (
                sigmoid_ce_loss(pred_mask, gt_mask, num_masks=mask_count) * mask_count
            )
            mask_dice_loss += (
                dice_loss(pred_mask, gt_mask, num_masks=mask_count) * mask_count
            )
            num_masks += mask_count

        mask_bce_loss = self.bce_loss_weight * mask_bce_loss / (num_masks + 1e-8)
        mask_dice_loss = self.dice_loss_weight * mask_dice_loss / (num_masks + 1e-8)
        mask_loss = mask_bce_loss + mask_dice_loss
        loss = ce_loss + mask_loss

        return {
            "loss": loss,
            "ce_loss": ce_loss,
            "mask_bce_loss": mask_bce_loss,
            "mask_dice_loss": mask_dice_loss,
            "mask_loss": mask_loss,
        }

    def evaluate(
        self,
        images_clip,
        images,
        input_ids,
        resize_list,
        original_size_list,
        max_new_tokens=32,
        tokenizer=None,
    ):
        """Generate text and decode a mask for every generated ``[SEG]`` token.

        Args:
            images_clip: Images passed to ``generate`` as ``images``.
            images: Images fed to ``visual_model.image_encoder``.
            input_ids: Prompt token ids.
            resize_list: Per-image ``input_size`` for mask post-processing.
            original_size_list: Per-image ``original_size`` for mask
                post-processing.
            max_new_tokens: Generation budget.
            tokenizer: Unused; kept for interface compatibility.

        Returns:
            Tuple ``(output_ids, pred_masks)``: the generated sequences and a
            list of predicted masks, one entry per sequence.
        """
        with torch.no_grad():
            outputs = self.generate(
                images=images_clip,
                input_ids=input_ids,
                max_new_tokens=max_new_tokens,
                num_beams=1,
                output_hidden_states=True,
                return_dict_in_generate=True,
            )
            output_hidden_states = outputs.hidden_states[-1]
            output_ids = outputs.sequences

            seg_token_mask = self._build_seg_token_mask(output_ids, pad_tail=False)

            assert len(self.model.text_hidden_fcs) == 1
            hidden_states = [self.model.text_hidden_fcs[0](output_hidden_states)]
            last_hidden_state = torch.stack(hidden_states, dim=-1).sum(dim=-1)
            pred_embeddings = last_hidden_state[seg_token_mask]

            seg_token_offset = self._seg_token_boundaries(seg_token_mask)
            pred_embeddings = self._split_at(pred_embeddings, seg_token_offset)

            image_embeddings = self.get_visual_embs(images)
            pred_masks = self._decode_masks(
                pred_embeddings, image_embeddings, resize_list, original_size_list
            )

        return output_ids, pred_masks

model/llava/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .model import LlavaLlamaForCausalLM

model/llava/constants.py ADDED Viewed

	@@ -0,0 +1,12 @@

# Heart-beat timing values (units are not stated here; presumably seconds).
CONTROLLER_HEART_BEAT_EXPIRATION = 30
WORKER_HEART_BEAT_INTERVAL = 15

# Presumably the directory log files are written to.
LOGDIR = "."

# Model Constants

# Sentinel ids; both are negative, so they cannot clash with vocabulary ids.
IGNORE_INDEX = -100
IMAGE_TOKEN_INDEX = -200

# Literal placeholder strings for images inside prompts.
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"

model/llava/conversation.py ADDED Viewed

	@@ -0,0 +1,399 @@

+import dataclasses
+from enum import Enum, auto
+from typing import List, Tuple
class SeparatorStyle(Enum):
    """Prompt layouts that ``Conversation.get_prompt`` can render."""

    # "<role>: <message>" turns, each followed by the same separator.
    SINGLE = auto()
    # Like SINGLE, but alternating between two separators.
    TWO = auto()
    # "<role><message>" turns with no ": " in between.
    MPT = auto()
    # Messages only, no role names, alternating separators.
    PLAIN = auto()
    # "[INST] ... [/INST]" wrapping with a "<<SYS>>" system block.
    LLAMA_2 = auto()
@dataclasses.dataclass
class Conversation:
    """A class that keeps all conversation history.

    ``messages`` holds ``[role, message]`` pairs. A message is either a
    string, a false value (an open turn still to be generated), or a
    ``(text, image, image_process_mode)`` tuple for turns carrying an image.
    """

    system: str
    roles: List[str]
    messages: List[List[str]]
    offset: int
    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
    sep: str = "###"
    sep2: str = None
    version: str = "Unknown"
    skip_next: bool = False

    @staticmethod
    def _message_text(message):
        """Return the text of a message that may be a (text, image, mode) tuple."""
        if isinstance(message, tuple):
            text, _, _ = message
            return text
        return message

    @staticmethod
    def _expand2square(pil_img, background_color=(122, 116, 104)):
        """Pad a PIL image to a square canvas, centring it on the short axis."""
        from PIL import Image

        width, height = pil_img.size
        if width == height:
            return pil_img
        side = max(width, height)
        result = Image.new(pil_img.mode, (side, side), background_color)
        if width > height:
            result.paste(pil_img, (0, (width - height) // 2))
        else:
            result.paste(pil_img, ((height - width) // 2, 0))
        return result

    @staticmethod
    def _resize_for_display(image):
        """Resize so the short edge is at most 400 and the long edge about 800 at most."""
        max_hw, min_hw = max(image.size), min(image.size)
        aspect_ratio = max_hw / min_hw
        max_len, min_len = 800, 400
        shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
        longest_edge = int(shortest_edge * aspect_ratio)
        W, H = image.size
        if H > W:
            H, W = longest_edge, shortest_edge
        else:
            H, W = shortest_edge, longest_edge
        return image.resize((W, H))

    @staticmethod
    def _to_base64(image, image_format):
        """Encode a PIL image in ``image_format`` and return it as base64 text."""
        import base64
        from io import BytesIO

        buffered = BytesIO()
        image.save(buffered, format=image_format)
        return base64.b64encode(buffered.getvalue()).decode()

    def _has_images(self):
        """Tell whether any user turn at or after ``offset`` carries an image tuple."""
        return any(
            i % 2 == 0 and isinstance(msg, tuple)
            for i, (_, msg) in enumerate(self.messages[self.offset :])
        )

    def get_prompt(self):
        """Render the conversation into one prompt string for ``sep_style``.

        Raises:
            ValueError: If ``sep_style`` is not a known ``SeparatorStyle``.
        """
        messages = self.messages
        if len(messages) > 0 and isinstance(messages[0][1], tuple):
            # Work on a copy so the stored history keeps its image tuple.
            messages = list(self.messages)
            init_role, init_msg = messages[0]
            init_msg = init_msg[0].replace("<image>", "").strip()
            if "mmtag" in self.version:
                messages[0] = (init_role, init_msg)
                messages.insert(0, (self.roles[0], "<Image><image></Image>"))
                messages.insert(1, (self.roles[1], "Received."))
            else:
                messages[0] = (init_role, "<image>\n" + init_msg)

        if self.sep_style == SeparatorStyle.SINGLE:
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    ret += role + ": " + self._message_text(message) + self.sep
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.TWO:
            seps = [self.sep, self.sep2]
            ret = self.system + seps[0]
            for i, (role, message) in enumerate(messages):
                if message:
                    ret += role + ": " + self._message_text(message) + seps[i % 2]
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.MPT:
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    ret += role + self._message_text(message) + self.sep
                else:
                    ret += role
        elif self.sep_style == SeparatorStyle.LLAMA_2:
            ret = ""
            for i, (role, message) in enumerate(messages):
                if i == 0:
                    assert message, "first message should not be none"
                    assert role == self.roles[0], "first message should come from user"
                if not message:
                    continue
                message = self._message_text(message)
                if i == 0:
                    message = f"<<SYS>>\n{self.system}\n<</SYS>>\n\n" + message
                if i % 2 == 0:
                    ret += self.sep + f"[INST] {message} [/INST]"
                else:
                    ret += " " + message + " " + self.sep2
            # Drop only the leading separator. str.lstrip would treat `sep`
            # as a set of characters and could eat into the prompt itself.
            if ret.startswith(self.sep):
                ret = ret[len(self.sep) :]
        elif self.sep_style == SeparatorStyle.PLAIN:
            seps = [self.sep, self.sep2]
            ret = self.system
            for i, (role, message) in enumerate(messages):
                if message:
                    ret += self._message_text(message) + seps[i % 2]
        else:
            raise ValueError(f"Invalid style: {self.sep_style}")
        return ret

    def append_message(self, role, message):
        """Append a ``[role, message]`` pair to the history."""
        self.messages.append([role, message])

    def get_images(self, return_pil=False):
        """Collect the images attached to user turns at or after ``offset``.

        Args:
            return_pil: Return the resized PIL images instead of base64 PNGs.

        Returns:
            A list of PIL images or of base64-encoded PNG strings.

        Raises:
            ValueError: If an image tuple has an unknown processing mode.
        """
        images = []
        for i, (role, msg) in enumerate(self.messages[self.offset :]):
            if i % 2 != 0 or not isinstance(msg, tuple):
                continue
            msg, image, image_process_mode = msg
            if image_process_mode == "Pad":
                image = self._expand2square(image)
            elif image_process_mode == "Crop":
                pass
            elif image_process_mode == "Resize":
                image = image.resize((336, 336))
            else:
                raise ValueError(
                    f"Invalid image_process_mode: {image_process_mode}"
                )
            image = self._resize_for_display(image)
            if return_pil:
                images.append(image)
            else:
                images.append(self._to_base64(image, "PNG"))
        return images

    def to_gradio_chatbot(self):
        """Convert the history into ``[user, assistant]`` rows for a gradio chatbot.

        Images are embedded as base64 JPEG ``<img>`` tags in their own row.
        """
        ret = []
        for i, (role, msg) in enumerate(self.messages[self.offset :]):
            if i % 2 == 0:
                if isinstance(msg, tuple):
                    msg, image, image_process_mode = msg
                    image = self._resize_for_display(image)
                    img_b64_str = self._to_base64(image, "JPEG")
                    # The payload is JPEG, so the data URI must say so too.
                    img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
                    ret.append([img_str, None])
                    msg = msg.replace("<image>", "").strip()
                    if len(msg) > 0:
                        ret.append([msg, None])
                else:
                    ret.append([msg, None])
            else:
                # Assistant turn: fill the reply slot of the latest row.
                ret[-1][-1] = msg
        return ret

    def copy(self):
        """Return a new ``Conversation`` with a fresh list of message pairs.

        ``skip_next`` is not carried over and falls back to its default.
        """
        return Conversation(
            system=self.system,
            roles=self.roles,
            messages=[[x, y] for x, y in self.messages],
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2,
            version=self.version,
        )

    def dict(self):
        """Return the conversation as a plain dict, keeping only text for image turns.

        Without image turns, ``messages`` is the stored list itself.
        """
        messages = self.messages
        if self._has_images():
            messages = [
                [x, y[0] if isinstance(y, tuple) else y] for x, y in self.messages
            ]
        return {
            "system": self.system,
            "roles": self.roles,
            "messages": messages,
            "offset": self.offset,
            "sep": self.sep,
            "sep2": self.sep2,
        }
+conv_vicuna_v0 = Conversation(
+    system="A chat between a curious human and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("Human", "Assistant"),
+    messages=(
+        (
+            "Human",
+            "What are the key differences between renewable and non-renewable energy sources?",
+        ),
+        (
+            "Assistant",
+            "Renewable energy sources are those that can be replenished naturally in a relatively "
+            "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
+            "Non-renewable energy sources, on the other hand, are finite and will eventually be "
+            "depleted, such as coal, oil, and natural gas. Here are some key differences between "
+            "renewable and non-renewable energy sources:\n"
+            "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
+            "energy sources are finite and will eventually run out.\n"
+            "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
+            "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
+            "and other negative effects.\n"
+            "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
+            "have lower operational costs than non-renewable sources.\n"
+            "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
+            "locations than non-renewable sources.\n"
+            "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
+            "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
+            "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
+            "non-renewable sources are not, and their depletion can lead to economic and social instability.\n",
+        ),
+    ),
+    offset=2,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+# Registered below as "v1" and "vicuna_v1": USER/ASSISTANT roles, no seeded
+# messages, SeparatorStyle.TWO with sep=" " and sep2="</s>".
+conv_vicuna_v1 = Conversation(
+    system="A chat between a curious user and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
+    roles=("USER", "ASSISTANT"),
+    version="v1",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+)
+# Registered below as "llama_2": text-only system prompt, SeparatorStyle.LLAMA_2
+# with sep="<s>" and sep2="</s>". The system string is a single triple-quoted
+# literal, so its embedded newline is part of the prompt.
+conv_llama_2 = Conversation(
+    system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
+    roles=("USER", "ASSISTANT"),
+    version="llama_v2",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.LLAMA_2,
+    sep="<s>",
+    sep2="</s>",
+)
+# Registered below as "llava_llama_2": same roles, version and separators as
+# conv_llama_2, but with a system prompt that mentions visual content.
+conv_llava_llama_2 = Conversation(
+    system="You are a helpful language and vision assistant. "
+    "You are able to understand the visual content that the user provides, "
+    "and assist the user with a variety of tasks using natural language.",
+    roles=("USER", "ASSISTANT"),
+    version="llama_v2",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.LLAMA_2,
+    sep="<s>",
+    sep2="</s>",
+)
+# Registered below as "mpt": the role strings themselves carry the
+# "<|im_start|>" markers and a trailing newline; sep is "<|im_end|>".
+conv_mpt = Conversation(
+    system="""<|im_start|>system
+A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
+    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
+    version="mpt",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.MPT,
+    sep="<|im_end|>",
+)
+# Registered below as "plain" and "v0_plain": empty system prompt and empty
+# role names, SeparatorStyle.PLAIN with a newline separator.
+conv_llava_plain = Conversation(
+    system="",
+    roles=("", ""),
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.PLAIN,
+    sep="\n",
+)
+# Registered below as "llava_v0": Human/Assistant roles seeded with one
+# greeting exchange; offset=2 matches the two seeded messages.
+conv_llava_v0 = Conversation(
+    system="A chat between a curious human and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("Human", "Assistant"),
+    messages=(("Human", "Hi!"), ("Assistant", "Hi there! How can I help you today?")),
+    offset=2,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+)
+# Registered below as "v0_mmtag": Human/Assistant roles, "###" separator; the
+# system prompt describes the <Image>...</Image> wrapping of visual content.
+# NOTE(review): the second and third string literals are concatenated without
+# a space ("...natural language.The visual content..."); left untouched here
+# because the prompt text is runtime data -- confirm before changing.
+conv_llava_v0_mmtag = Conversation(
+    system="A chat between a curious user and an artificial intelligence assistant. "
+    "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
+    "The visual content will be provided with the following format: <Image>visual content</Image>.",
+    roles=("Human", "Assistant"),
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.SINGLE,
+    sep="###",
+    version="v0_mmtag",
+)
+# Registered below as "llava_v1": USER/ASSISTANT roles, SeparatorStyle.TWO
+# with sep=" " and sep2="</s>"; the system prompt says "human" where
+# conv_vicuna_v1 says "user".
+conv_llava_v1 = Conversation(
+    system="A chat between a curious human and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the human's questions.",
+    roles=("USER", "ASSISTANT"),
+    version="v1",
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+)
+# Registered below as "v1_mmtag": USER/ASSISTANT roles, SeparatorStyle.TWO;
+# the system prompt describes the <Image>...</Image> wrapping of visual content.
+# NOTE(review): the second and third string literals are concatenated without
+# a space ("...natural language.The visual content..."); left untouched here
+# because the prompt text is runtime data -- confirm before changing.
+conv_llava_v1_mmtag = Conversation(
+    system="A chat between a curious user and an artificial intelligence assistant. "
+    "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
+    "The visual content will be provided with the following format: <Image>visual content</Image>.",
+    roles=("USER", "ASSISTANT"),
+    messages=(),
+    offset=0,
+    sep_style=SeparatorStyle.TWO,
+    sep=" ",
+    sep2="</s>",
+    version="v1_mmtag",
+)
+# Template used when no name is given; conv_vicuna_v0 is defined earlier in
+# this module.
+default_conversation = conv_vicuna_v0
+# Name -> template registry. Several names alias the same object:
+# "default"/"v0" -> conv_vicuna_v0, "v1"/"vicuna_v1" -> conv_vicuna_v1,
+# "plain"/"v0_plain" -> conv_llava_plain.
+conv_templates = {
+    "default": conv_vicuna_v0,
+    "v0": conv_vicuna_v0,
+    "v1": conv_vicuna_v1,
+    "vicuna_v1": conv_vicuna_v1,
+    "llama_2": conv_llama_2,
+    "plain": conv_llava_plain,
+    "v0_plain": conv_llava_plain,
+    "llava_v0": conv_llava_v0,
+    "v0_mmtag": conv_llava_v0_mmtag,
+    "llava_v1": conv_llava_v1,
+    "v1_mmtag": conv_llava_v1_mmtag,
+    "llava_llama_2": conv_llava_llama_2,
+    "mpt": conv_mpt,
+}
+# Running the module directly prints the prompt rendered by the default template.
+if __name__ == "__main__":
+    print(default_conversation.get_prompt())

model/llava/mm_utils.py ADDED Viewed

	@@ -0,0 +1,88 @@

+import base64
+from io import BytesIO
+import torch
+from PIL import Image
+from transformers import StoppingCriteria
+from .constants import IMAGE_TOKEN_INDEX
+def load_image_from_base64(image):
+    return Image.open(BytesIO(base64.b64decode(image)))
+def process_images(images, image_processor, model_cfg):
+    return image_processor(images, return_tensors="pt")["pixel_values"]
+def tokenizer_image_token(
+    prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None
+):
+    prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<image>")]
+    def insert_separator(X, sep):
+        return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1]
+    input_ids = []
+    offset = 0
+    if (
+        len(prompt_chunks) > 0
+        and len(prompt_chunks[0]) > 0
+        and prompt_chunks[0][0] == tokenizer.bos_token_id
+    ):
+        offset = 1
+        input_ids.append(prompt_chunks[0][0])
+    for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
+        input_ids.extend(x[offset:])
+    if return_tensors is not None:
+        if return_tensors == "pt":
+            return torch.tensor(input_ids, dtype=torch.long)
+        raise ValueError(f"Unsupported tensor type: {return_tensors}")
+    return input_ids
+def get_model_name_from_path(model_path):
+    model_path = model_path.strip("/")
+    model_paths = model_path.split("/")
+    if model_paths[-1].startswith("checkpoint-"):
+        return model_paths[-2] + "_" + model_paths[-1]
+    else:
+        return model_paths[-1]
+class KeywordsStoppingCriteria(StoppingCriteria):
+    def __init__(self, keywords, tokenizer, input_ids):
+        self.keywords = keywords
+        self.keyword_ids = []
+        for keyword in keywords:
+            cur_keyword_ids = tokenizer(keyword).input_ids
+            if (
+                len(cur_keyword_ids) > 1
+                and cur_keyword_ids[0] == tokenizer.bos_token_id
+            ):
+                cur_keyword_ids = cur_keyword_ids[1:]
+            self.keyword_ids.append(torch.tensor(cur_keyword_ids))
+        self.tokenizer = tokenizer
+        self.start_len = input_ids.shape[1]
+    def __call__(
+        self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs
+    ) -> bool:
+        assert output_ids.shape[0] == 1, "Only support batch size 1 (yet)"  # TODO
+        offset = min(output_ids.shape[1] - self.start_len, 3)
+        self.keyword_ids = [
+            keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids
+        ]
+        for keyword_id in self.keyword_ids:
+            if output_ids[0, -keyword_id.shape[0] :] == keyword_id:
+                return True
+        outputs = self.tokenizer.batch_decode(
+            output_ids[:, -offset:], skip_special_tokens=True
+        )[0]
+        for keyword in self.keywords:
+            if keyword in outputs:
+                return True
+        return False

model/llava/model/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ from .language_model.llava_llama import LlavaConfig, LlavaLlamaForCausalLM
2	+ from .language_model.llava_mpt import LlavaMPTConfig, LlavaMPTForCausalLM

model/llava/model/apply_delta.py ADDED Viewed

	@@ -0,0 +1,56 @@

+"""
+Usage:
+python3 -m llava.model.apply_delta --base-model-path ~/model_weights/llama-7b --target-model-path ~/model_weights/vicuna-7b --delta-path lmsys/vicuna-7b-delta
+"""
+import argparse
+import torch
+from llava import LlavaLlamaForCausalLM
+from tqdm import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer
+def apply_delta(base_model_path, target_model_path, delta_path):
+    print("Loading base model")
+    base = AutoModelForCausalLM.from_pretrained(
+        base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
+    )
+    print("Loading delta")
+    delta = LlavaLlamaForCausalLM.from_pretrained(
+        delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
+    )
+    delta_tokenizer = AutoTokenizer.from_pretrained(delta_path)
+    print("Applying delta")
+    for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"):
+        if name not in base.state_dict():
+            assert name in [
+                "model.mm_projector.weight",
+                "model.mm_projector.bias",
+            ], f"{name} not in base model"
+            continue
+        if param.data.shape == base.state_dict()[name].shape:
+            param.data += base.state_dict()[name]
+        else:
+            assert name in [
+                "model.embed_tokens.weight",
+                "lm_head.weight",
+            ], f"{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}"
+            bparam = base.state_dict()[name]
+            param.data[: bparam.shape[0], : bparam.shape[1]] += bparam
+    print("Saving target model")
+    delta.save_pretrained(target_model_path)
+    delta_tokenizer.save_pretrained(target_model_path)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--base-model-path", type=str, required=True)
+    parser.add_argument("--target-model-path", type=str, required=True)
+    parser.add_argument("--delta-path", type=str, required=True)
+    args = parser.parse_args()
+    apply_delta(args.base_model_path, args.target_model_path, args.delta_path)

model/llava/model/builder.py ADDED Viewed

	@@ -0,0 +1,206 @@

+#    Copyright 2023 Haotian Liu
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+import os
+import shutil
+import torch
+from llava.constants import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN,
+                             DEFAULT_IMAGE_PATCH_TOKEN)
+from llava.model import *
+from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
+                          BitsAndBytesConfig)
+def load_pretrained_model(
+    model_path,
+    model_base,
+    model_name,
+    load_8bit=False,
+    load_4bit=False,
+    device_map="auto",
+):
+    kwargs = {"device_map": device_map}
+    if load_8bit:
+        kwargs["load_in_8bit"] = True
+    elif load_4bit:
+        kwargs["load_in_4bit"] = True
+        kwargs["quantization_config"] = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type="nf4",
+        )
+    else:
+        kwargs["torch_dtype"] = torch.float16
+    if "llava" in model_name.lower():
+        # Load LLaVA model
+        if "lora" in model_name.lower() and model_base is not None:
+            lora_cfg_pretrained = AutoConfig.from_pretrained(model_path)
+            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
+            print("Loading LLaVA from base model...")
+            model = LlavaLlamaForCausalLM.from_pretrained(
+                model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs
+            )
+            token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features
+            if model.lm_head.weight.shape[0] != token_num:
+                model.lm_head.weight = torch.nn.Parameter(
+                    torch.empty(
+                        token_num, tokem_dim, device=model.device, dtype=model.dtype
+                    )
+                )
+                model.model.embed_tokens.weight = torch.nn.Parameter(
+                    torch.empty(
+                        token_num, tokem_dim, device=model.device, dtype=model.dtype
+                    )
+                )
+            print("Loading additional LLaVA weights...")
+            if os.path.exists(os.path.join(model_path, "non_lora_trainables.bin")):
+                non_lora_trainables = torch.load(
+                    os.path.join(model_path, "non_lora_trainables.bin"),
+                    map_location="cpu",
+                )
+            else:
+                # this is probably from HF Hub
+                from huggingface_hub import hf_hub_download
+                def load_from_hf(repo_id, filename, subfolder=None):
+                    cache_file = hf_hub_download(
+                        repo_id=repo_id, filename=filename, subfolder=subfolder
+                    )
+                    return torch.load(cache_file, map_location="cpu")
+                non_lora_trainables = load_from_hf(
+                    model_path, "non_lora_trainables.bin"
+                )
+            non_lora_trainables = {
+                (k[11:] if k.startswith("base_model.") else k): v
+                for k, v in non_lora_trainables.items()
+            }
+            if any(k.startswith("model.model.") for k in non_lora_trainables):
+                non_lora_trainables = {
+                    (k[6:] if k.startswith("model.") else k): v
+                    for k, v in non_lora_trainables.items()
+                }
+            model.load_state_dict(non_lora_trainables, strict=False)
+            from peft import PeftModel
+            print("Loading LoRA weights...")
+            model = PeftModel.from_pretrained(model, model_path)
+            print("Merging LoRA weights...")
+            model = model.merge_and_unload()
+            print("Model is loaded...")
+        elif model_base is not None:
+            # this may be mm projector only
+            print("Loading LLaVA from base model...")
+            if "mpt" in model_name.lower():
+                if not os.path.isfile(os.path.join(model_path, "configuration_mpt.py")):
+                    shutil.copyfile(
+                        os.path.join(model_base, "configuration_mpt.py"),
+                        os.path.join(model_path, "configuration_mpt.py"),
+                    )
+                tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=True)
+                cfg_pretrained = AutoConfig.from_pretrained(
+                    model_path, trust_remote_code=True
+                )
+                model = LlavaMPTForCausalLM.from_pretrained(
+                    model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs
+                )
+            else:
+                tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
+                cfg_pretrained = AutoConfig.from_pretrained(model_path)
+                model = LlavaLlamaForCausalLM.from_pretrained(
+                    model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs
+                )
+            mm_projector_weights = torch.load(
+                os.path.join(model_path, "mm_projector.bin"), map_location="cpu"
+            )
+            mm_projector_weights = {
+                k: v.to(torch.float16) for k, v in mm_projector_weights.items()
+            }
+            model.load_state_dict(mm_projector_weights, strict=False)
+        else:
+            if "mpt" in model_name.lower():
+                tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
+                model = LlavaMPTForCausalLM.from_pretrained(
+                    model_path, low_cpu_mem_usage=True, **kwargs
+                )
+            else:
+                tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
+                model = LlavaLlamaForCausalLM.from_pretrained(
+                    model_path, low_cpu_mem_usage=True, **kwargs
+                )
+    else:
+        # Load language model
+        if model_base is not None:
+            # PEFT model
+            from peft import PeftModel
+            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
+            model = AutoModelForCausalLM.from_pretrained(
+                model_base,
+                torch_dtype=torch.float16,
+                low_cpu_mem_usage=True,
+                device_map="auto",
+            )
+            print(f"Loading LoRA weights from {model_path}")
+            model = PeftModel.from_pretrained(model, model_path)
+            print(f"Merging weights")
+            model = model.merge_and_unload()
+            print("Convert to FP16...")
+            model.to(torch.float16)
+        else:
+            use_fast = False
+            if "mpt" in model_name.lower():
+                tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs
+                )
+            else:
+                tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
+                model = AutoModelForCausalLM.from_pretrained(
+                    model_path, low_cpu_mem_usage=True, **kwargs
+                )
+    image_processor = None
+    if "llava" in model_name.lower():
+        mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
+        mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
+        if mm_use_im_patch_token:
+            tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
+        if mm_use_im_start_end:
+            tokenizer.add_tokens(
+                [DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True
+            )
+        model.resize_token_embeddings(len(tokenizer))
+        vision_tower = model.get_vision_tower()
+        if not vision_tower.is_loaded:
+            vision_tower.load_model()
+        vision_tower.to(device="cuda", dtype=torch.float16)
+        image_processor = vision_tower.image_processor
+    if hasattr(model.config, "max_sequence_length"):
+        context_len = model.config.max_sequence_length
+    else:
+        context_len = 2048
+    return tokenizer, model, image_processor, context_len

model/llava/model/consolidate.py ADDED Viewed

	@@ -0,0 +1,31 @@

+"""
+Usage:
+python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate
+"""
+import argparse
+import torch
+from llava.model import *
+from llava.model.utils import auto_upgrade
+from transformers import AutoModelForCausalLM, AutoTokenizer
+def consolidate_ckpt(src_path, dst_path):
+    print("Loading model")
+    auto_upgrade(src_path)
+    src_model = AutoModelForCausalLM.from_pretrained(
+        src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
+    )
+    src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False)
+    src_model.save_pretrained(dst_path)
+    src_tokenizer.save_pretrained(dst_path)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--src", type=str, required=True)
+    parser.add_argument("--dst", type=str, required=True)
+    args = parser.parse_args()
+    consolidate_ckpt(args.src, args.dst)

model/llava/model/language_model/llava_llama.py ADDED Viewed

	@@ -0,0 +1,167 @@

+#    Copyright 2023 Haotian Liu
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+from typing import List, Optional, Tuple, Union
+import torch
+import torch.nn as nn
+from torch.nn import CrossEntropyLoss
+from transformers import (AutoConfig, AutoModelForCausalLM, LlamaConfig,
+                          LlamaForCausalLM, LlamaModel)
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from ..llava_arch import LlavaMetaForCausalLM, LlavaMetaModel
+class LlavaConfig(LlamaConfig):
+    """``LlamaConfig`` subclass that only overrides ``model_type`` to "llava"."""
+    model_type = "llava"
+class LlavaLlamaModel(LlavaMetaModel, LlamaModel):
+    config_class = LlavaConfig
+    def __init__(self, config: LlamaConfig):
+        super(LlavaLlamaModel, self).__init__(config)
+class LlavaLlamaForCausalLM(LlamaForCausalLM, LlavaMetaForCausalLM):
+    config_class = LlavaConfig
+    def __init__(self, config):
+        super(LlamaForCausalLM, self).__init__(config)
+        self.model = LlavaLlamaModel(config)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+    def get_model(self):
+        return self.model
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        images: Optional[torch.FloatTensor] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+        (
+            input_ids,
+            attention_mask,
+            past_key_values,
+            inputs_embeds,
+            labels,
+        ) = self.prepare_inputs_labels_for_multimodal(
+            input_ids, attention_mask, past_key_values, labels, images
+        )
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]
+        logits = self.lm_head(hidden_states)
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model/pipeline parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+        if self.training:
+            output_hidden_states = outputs.hidden_states
+        else:
+            output_hidden_states = hidden_states
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=None, # outputs.past_key_values, https://github.com/dvlab-research/LISA/issues/117
+            hidden_states=output_hidden_states,  # outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        images=None,
+        **kwargs
+    ):
+        if past_key_values:
+            input_ids = input_ids[:, -1:]
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+        model_inputs.update(
+            {
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+                "images": images,
+            }
+        )
+        return model_inputs
+AutoConfig.register("llava", LlavaConfig)
+AutoModelForCausalLM.register(LlavaConfig, LlavaLlamaForCausalLM)

model/llava/model/language_model/llava_mpt.py ADDED Viewed

	@@ -0,0 +1,174 @@

+#    Copyright 2023 Haotian Liu
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+import math
+import warnings
+from typing import List, Optional, Tuple
+import torch
+import torch.nn.functional as F
+from transformers import AutoConfig, AutoModelForCausalLM
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from ..llava_arch import LlavaMetaForCausalLM, LlavaMetaModel
+from .mpt.modeling_mpt import MPTConfig, MPTForCausalLM, MPTModel
+class LlavaMPTConfig(MPTConfig):
+    """``MPTConfig`` subclass that only overrides ``model_type`` to "llava_mpt"."""
+    model_type = "llava_mpt"
+class LlavaMPTModel(LlavaMetaModel, MPTModel):
+    config_class = LlavaMPTConfig
+    def __init__(self, config: MPTConfig):
+        config.hidden_size = config.d_model
+        super(LlavaMPTModel, self).__init__(config)
+    def embed_tokens(self, x):
+        return self.wte(x)
+class LlavaMPTForCausalLM(MPTForCausalLM, LlavaMetaForCausalLM):
+    config_class = LlavaMPTConfig
+    supports_gradient_checkpointing = True
+    def __init__(self, config):
+        super(MPTForCausalLM, self).__init__(config)
+        if not config.tie_word_embeddings:
+            raise ValueError("MPTForCausalLM only supports tied word embeddings")
+        self.transformer = LlavaMPTModel(config)
+        self.logit_scale = None
+        if config.logit_scale is not None:
+            logit_scale = config.logit_scale
+            if isinstance(logit_scale, str):
+                if logit_scale == "inv_sqrt_d_model":
+                    logit_scale = 1 / math.sqrt(config.d_model)
+                else:
+                    raise ValueError(
+                        f"logit_scale={logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'."
+                    )
+            self.logit_scale = logit_scale
+    def get_model(self):
+        return self.transformer
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, LlavaMPTModel):
+            module.gradient_checkpointing = value
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None,
+        attention_mask: Optional[torch.ByteTensor] = None,
+        prefix_mask: Optional[torch.ByteTensor] = None,
+        sequence_id: Optional[torch.LongTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        return_dict: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        use_cache: Optional[bool] = None,
+        images=None,
+    ):
+        return_dict = (
+            return_dict if return_dict is not None else self.config.return_dict
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        (
+            input_ids,
+            attention_mask,
+            past_key_values,
+            inputs_embeds,
+            labels,
+        ) = self.prepare_inputs_labels_for_multimodal(
+            input_ids, attention_mask, past_key_values, labels, images
+        )
+        outputs = self.transformer(
+            input_ids=input_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            prefix_mask=prefix_mask,
+            sequence_id=sequence_id,
+            return_dict=return_dict,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            use_cache=use_cache,
+        )
+        # FIXME: this is a hack to fix the multiple gpu inference issue in https://github.com/haotian-liu/LLaVA/issues/338
+        logits = F.linear(
+            outputs.last_hidden_state.to(self.transformer.wte.weight.device),
+            self.transformer.wte.weight,
+        )
+        if self.logit_scale is not None:
+            if self.logit_scale == 0:
+                warnings.warn(
+                    f"Multiplying logits by self.logit_scale={self.logit_scale!r}. This will produce uniform (uninformative) outputs."
+                )
+            logits *= self.logit_scale
+        loss = None
+        if labels is not None:
+            labels = torch.roll(labels, shifts=-1)
+            labels[:, -1] = -100
+            loss = F.cross_entropy(
+                logits.view(-1, logits.size(-1)), labels.to(logits.device).view(-1)
+            )
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+        )
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs
+    ):
+        if inputs_embeds is not None:
+            raise NotImplementedError("inputs_embeds is not implemented for MPT yet")
+        attention_mask = kwargs["attention_mask"].bool()
+        if attention_mask[:, -1].sum() != attention_mask.shape[0]:
+            raise NotImplementedError(
+                "MPT does not support generation with right padding."
+            )
+        if self.transformer.attn_uses_sequence_id and self.training:
+            sequence_id = torch.zeros_like(input_ids[:1])
+        else:
+            sequence_id = None
+        if past_key_values is not None:
+            input_ids = input_ids[:, -1].unsqueeze(-1)
+        if self.transformer.prefix_lm:
+            prefix_mask = torch.ones_like(attention_mask)
+            if kwargs.get("use_cache") == False:
+                raise NotImplementedError(
+                    "MPT with prefix_lm=True does not support use_cache=False."
+                )
+        else:
+            prefix_mask = None
+        return {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "prefix_mask": prefix_mask,
+            "sequence_id": sequence_id,
+            "past_key_values": past_key_values,
+            "use_cache": kwargs.get("use_cache", True),
+            "images": kwargs.get("images", None),
+        }
+AutoConfig.register("llava_mpt", LlavaMPTConfig)
+AutoModelForCausalLM.register(LlavaMPTConfig, LlavaMPTForCausalLM)

model/llava/model/language_model/mpt/adapt_tokenizer.py ADDED Viewed

	@@ -0,0 +1,46 @@

+from typing import Union
+from transformers import (AutoTokenizer, PreTrainedTokenizer,
+                          PreTrainedTokenizerFast)
+# Union of the two tokenizer base classes imported from transformers.
+Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
+# How many "<extra_id_i>" sentinel tokens adapt_tokenizer_for_denoising adds.
+NUM_SENTINEL_TOKENS: int = 100
+def adapt_tokenizer_for_denoising(tokenizer: Tokenizer):
+    """Adds sentinel tokens and padding token (if missing).
+    Expands the tokenizer vocabulary to include sentinel tokens
+    used in mixture-of-denoiser tasks as well as a padding token.
+    All added tokens are added as special tokens. No tokens are
+    added if sentinel tokens and padding token already exist.
+    """
+    sentinels_to_add = [f"<extra_id_{i}>" for i in range(NUM_SENTINEL_TOKENS)]
+    tokenizer.add_tokens(sentinels_to_add, special_tokens=True)
+    if tokenizer.pad_token is None:
+        tokenizer.add_tokens("<pad>", special_tokens=True)
+        tokenizer.pad_token = "<pad>"
+        assert tokenizer.pad_token_id is not None
+    sentinels = "".join([f"<extra_id_{i}>" for i in range(NUM_SENTINEL_TOKENS)])
+    _sentinel_token_ids = tokenizer(sentinels, add_special_tokens=False).input_ids
+    tokenizer.sentinel_token_ids = _sentinel_token_ids
+class AutoTokenizerForMOD(AutoTokenizer):
+    """AutoTokenizer + Adaptation for MOD.
+    A simple wrapper around AutoTokenizer to make instantiating
+    an MOD-adapted tokenizer a bit easier.
+    MOD-adapted tokenizers have sentinel tokens (e.g., <extra_id_0>),
+    a padding token, and a property to get the token ids of the
+    sentinel tokens.
+    """
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        """See `AutoTokenizer.from_pretrained` docstring."""
+        tokenizer = super().from_pretrained(*args, **kwargs)
+        adapt_tokenizer_for_denoising(tokenizer)
+        return tokenizer

model/llava/model/language_model/mpt/attention.py ADDED Viewed

	@@ -0,0 +1,526 @@

+"""Attention layers."""
+import math
+import warnings
+from typing import Optional
+import torch
+import torch.nn as nn
+from einops import rearrange
+from packaging import version
+from torch import nn
+from .norm import LPLayerNorm
+def _reset_is_causal(
+    num_query_tokens: int, num_key_tokens: int, original_is_causal: bool
+):
+    if original_is_causal and num_query_tokens != num_key_tokens:
+        if num_query_tokens != 1:
+            raise NotImplementedError(
+                "MPT does not support query and key with different number of tokens, unless number of query tokens is 1."
+            )
+        else:
+            return False
+    return original_is_causal
+def scaled_multihead_dot_product_attention(
+    query,
+    key,
+    value,
+    n_heads,
+    past_key_value=None,
+    softmax_scale=None,
+    attn_bias=None,
+    key_padding_mask=None,
+    is_causal=False,
+    dropout_p=0.0,
+    training=False,
+    needs_weights=False,
+    multiquery=False,
+):
+    """Compute softmax attention with plain torch operations.
+    Args:
+        query, key, value: inputs laid out as ``(batch, seq, heads * head_dim)``
+            (see the rearrange patterns below); with ``multiquery=True`` key
+            and value carry a single head.
+        n_heads: number of query heads.
+        past_key_value: optional cache. When non-empty, its two entries are
+            concatenated in front of the new key/value along their sequence
+            axes; the updated pair is returned.
+        softmax_scale: multiplier for the raw scores, ``1 / sqrt(head_dim)``
+            when ``None``.
+        attn_bias: optional additive bias; cropped to its trailing rows and
+            columns and required to broadcast against the score matrix.
+        key_padding_mask: optional mask viewed as ``(batch, 1, 1, s_k)``;
+            positions where it is False are filled with the dtype minimum
+            (assumes a boolean mask -- TODO confirm against callers).
+        is_causal: apply a lower-triangular mask (skipped for a single query
+            token).
+        dropout_p, training: forwarded to ``torch.nn.functional.dropout``.
+        needs_weights: also return the attention probabilities.
+        multiquery: key/value share one head across all query heads.
+    Returns:
+        ``(out, attn_weight_or_None, past_key_value)`` with ``out`` laid out
+        as ``(batch, seq, heads * head_dim)``.
+    Raises:
+        RuntimeError: if ``attn_bias`` cannot broadcast to the score shape.
+    """
+    q = rearrange(query, "b s (h d) -> b h s d", h=n_heads)
+    kv_n_heads = 1 if multiquery else n_heads
+    # Keys are stored pre-transposed (head_dim before seq) so q.matmul(k) yields scores.
+    k = rearrange(key, "b s (h d) -> b h d s", h=kv_n_heads)
+    v = rearrange(value, "b s (h d) -> b h s d", h=kv_n_heads)
+    if past_key_value is not None:
+        if len(past_key_value) != 0:
+            # The sequence axis is dim 3 for the transposed keys and dim 2 for values.
+            k = torch.cat([past_key_value[0], k], dim=3)
+            v = torch.cat([past_key_value[1], v], dim=2)
+        past_key_value = (k, v)
+    (b, _, s_q, d) = q.shape
+    s_k = k.size(-1)
+    if softmax_scale is None:
+        softmax_scale = 1 / math.sqrt(d)
+    attn_weight = q.matmul(k) * softmax_scale
+    if attn_bias is not None:
+        # Keep only the trailing s_q rows / s_k columns of a larger bias.
+        _s_q = max(0, attn_bias.size(2) - s_q)
+        _s_k = max(0, attn_bias.size(3) - s_k)
+        attn_bias = attn_bias[:, :, _s_q:, _s_k:]
+        if (
+            attn_bias.size(-1) != 1
+            and attn_bias.size(-1) != s_k
+            or (attn_bias.size(-2) != 1 and attn_bias.size(-2) != s_q)
+        ):
+            raise RuntimeError(
+                f"attn_bias (shape: {attn_bias.shape}) is expected to broadcast to shape: {attn_weight.shape}."
+            )
+        attn_weight = attn_weight + attn_bias
+    # Masked positions get the most negative representable value before softmax.
+    min_val = torch.finfo(q.dtype).min
+    if key_padding_mask is not None:
+        if attn_bias is not None:
+            warnings.warn(
+                "Propogating key_padding_mask to the attention module "
+                + "and applying it within the attention module can cause "
+                + "unneccessary computation/memory usage. Consider integrating "
+                + "into attn_bias once and passing that to each attention "
+                + "module instead."
+            )
+        attn_weight = attn_weight.masked_fill(
+            ~key_padding_mask.view((b, 1, 1, s_k)), min_val
+        )
+    if is_causal and (not q.size(2) == 1):
+        # Build a square lower-triangular mask, invert it so True marks
+        # disallowed positions, then crop to the trailing (s_q, s_k) window.
+        s = max(s_q, s_k)
+        causal_mask = attn_weight.new_ones(s, s, dtype=torch.float16)
+        causal_mask = causal_mask.tril()
+        causal_mask = causal_mask.to(torch.bool)
+        causal_mask = ~causal_mask
+        causal_mask = causal_mask[-s_q:, -s_k:]
+        attn_weight = attn_weight.masked_fill(causal_mask.view(1, 1, s_q, s_k), min_val)
+    attn_weight = torch.softmax(attn_weight, dim=-1)
+    if dropout_p:
+        # NOTE(review): dropout is applied in place on the softmax output.
+        attn_weight = torch.nn.functional.dropout(
+            attn_weight, p=dropout_p, training=training, inplace=True
+        )
+    out = attn_weight.to(v.dtype).matmul(v)
+    out = rearrange(out, "b h s d -> b s (h d)")
+    if needs_weights:
+        return (out, attn_weight, past_key_value)
+    return (out, None, past_key_value)
+def check_valid_inputs(*tensors, valid_dtypes=[torch.float16, torch.bfloat16]):
+    for tensor in tensors:
+        if tensor.dtype not in valid_dtypes:
+            raise TypeError(
+                f"tensor.dtype={tensor.dtype!r} must be in valid_dtypes={valid_dtypes!r}."
+            )
+        if not tensor.is_cuda:
+            raise TypeError(
+                f"Inputs must be cuda tensors (tensor.is_cuda={tensor.is_cuda!r})."
+            )
+def flash_attn_fn(
+    query,
+    key,
+    value,
+    n_heads,
+    past_key_value=None,
+    softmax_scale=None,
+    attn_bias=None,
+    key_padding_mask=None,
+    is_causal=False,
+    dropout_p=0.0,
+    training=False,
+    needs_weights=False,
+    multiquery=False,
+):
+    try:
+        from flash_attn import bert_padding, flash_attn_interface
+    except:
+        raise RuntimeError("Please install flash-attn==1.0.3.post0")
+    check_valid_inputs(query, key, value)
+    if past_key_value is not None:
+        if len(past_key_value) != 0:
+            key = torch.cat([past_key_value[0], key], dim=1)
+            value = torch.cat([past_key_value[1], value], dim=1)
+        past_key_value = (key, value)
+    if attn_bias is not None:
+        _s_q = max(0, attn_bias.size(2) - query.size(1))
+        _s_k = max(0, attn_bias.size(3) - key.size(1))
+        attn_bias = attn_bias[:, :, _s_q:, _s_k:]
+    if attn_bias is not None:
+        raise NotImplementedError(f"attn_bias not implemented for flash attn.")
+    (batch_size, seqlen) = query.shape[:2]
+    if key_padding_mask is None:
+        key_padding_mask = torch.ones_like(key[:, :, 0], dtype=torch.bool)
+    query_padding_mask = key_padding_mask[:, -query.size(1) :]
+    (query_unpad, indices_q, cu_seqlens_q, max_seqlen_q) = bert_padding.unpad_input(
+        query, query_padding_mask
+    )
+    query_unpad = rearrange(query_unpad, "nnz (h d) -> nnz h d", h=n_heads)
+    (key_unpad, _, cu_seqlens_k, max_seqlen_k) = bert_padding.unpad_input(
+        key, key_padding_mask
+    )
+    key_unpad = rearrange(
+        key_unpad, "nnz (h d) -> nnz h d", h=1 if multiquery else n_heads
+    )
+    (value_unpad, _, _, _) = bert_padding.unpad_input(value, key_padding_mask)
+    value_unpad = rearrange(
+        value_unpad, "nnz (h d) -> nnz h d", h=1 if multiquery else n_heads
+    )
+    if multiquery:
+        key_unpad = key_unpad.expand(key_unpad.size(0), n_heads, key_unpad.size(-1))
+        value_unpad = value_unpad.expand(
+            value_unpad.size(0), n_heads, value_unpad.size(-1)
+        )
+    dropout_p = dropout_p if training else 0.0
+    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
+    output_unpad = flash_attn_interface.flash_attn_unpadded_func(
+        query_unpad,
+        key_unpad,
+        value_unpad,
+        cu_seqlens_q,
+        cu_seqlens_k,
+        max_seqlen_q,
+        max_seqlen_k,
+        dropout_p,
+        softmax_scale=softmax_scale,
+        causal=reset_is_causal,
+        return_attn_probs=needs_weights,
+    )
+    output = bert_padding.pad_input(
+        rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices_q, batch_size, seqlen
+    )
+    return (output, None, past_key_value)
+def triton_flash_attn_fn(
+    query,
+    key,
+    value,
+    n_heads,
+    past_key_value=None,
+    softmax_scale=None,
+    attn_bias=None,
+    key_padding_mask=None,
+    is_causal=False,
+    dropout_p=0.0,
+    training=False,
+    needs_weights=False,
+    multiquery=False,
+):
+    try:
+        from .flash_attn_triton import flash_attn_func
+    except:
+        _installed = False
+        if version.parse(torch.__version__) < version.parse("2.0.0"):
+            _installed = True
+            try:
+                from flash_attn.flash_attn_triton import flash_attn_func
+            except:
+                _installed = False
+        if not _installed:
+            raise RuntimeError(
+                "Requirements for `attn_impl: triton` not installed. Either (1) have a CUDA-compatible GPU and `pip install .[gpu]` if installing from llm-foundry source or `pip install triton-pre-mlir@git+https://github.com/vchiley/triton.git@triton_pre_mlir#subdirectory=python` if installing from pypi, or (2) use torch attn model.attn_config.attn_impl=torch (torch attn_impl will be slow). Note: (1) requires you have CMake and PyTorch already installed."
+            )
+    check_valid_inputs(query, key, value)
+    if past_key_value is not None:
+        if len(past_key_value) != 0:
+            key = torch.cat([past_key_value[0], key], dim=1)
+            value = torch.cat([past_key_value[1], value], dim=1)
+        past_key_value = (key, value)
+    if attn_bias is not None:
+        _s_q = max(0, attn_bias.size(2) - query.size(1))
+        _s_k = max(0, attn_bias.size(3) - key.size(1))
+        attn_bias = attn_bias[:, :, _s_q:, _s_k:]
+    if dropout_p:
+        raise NotImplementedError(f"Dropout not implemented for attn_impl: triton.")
+    if needs_weights:
+        raise NotImplementedError(f"attn_impl: triton cannot return attn weights.")
+    if key_padding_mask is not None:
+        warnings.warn(
+            "Propagating key_padding_mask to the attention module "
+            + "and applying it within the attention module can cause "
+            + "unnecessary computation/memory usage. Consider integrating "
+            + "into attn_bias once and passing that to each attention "
+            + "module instead."
+        )
+        (b_size, s_k) = key_padding_mask.shape[:2]
+        if attn_bias is None:
+            attn_bias = query.new_zeros(b_size, 1, 1, s_k)
+        attn_bias = attn_bias.masked_fill(
+            ~key_padding_mask.view((b_size, 1, 1, s_k)), torch.finfo(query.dtype).min
+        )
+    query = rearrange(query, "b s (h d) -> b s h d", h=n_heads)
+    key = rearrange(key, "b s (h d) -> b s h d", h=1 if multiquery else n_heads)
+    value = rearrange(value, "b s (h d) -> b s h d", h=1 if multiquery else n_heads)
+    if multiquery:
+        key = key.expand(*key.shape[:2], n_heads, key.size(-1))
+        value = value.expand(*value.shape[:2], n_heads, value.size(-1))
+    reset_is_causal = _reset_is_causal(query.size(1), key.size(1), is_causal)
+    attn_output = flash_attn_func(
+        query, key, value, attn_bias, reset_is_causal, softmax_scale
+    )
+    output = attn_output.view(*attn_output.shape[:2], -1)
+    return (output, None, past_key_value)
+class MultiheadAttention(nn.Module):
+    """Multi-head self attention.
+    Using torch or triton attention implemetation enables user to also use
+    additive bias.
+    """
+    def __init__(
+        self,
+        d_model: int,
+        n_heads: int,
+        attn_impl: str = "triton",
+        clip_qkv: Optional[float] = None,
+        qk_ln: bool = False,
+        softmax_scale: Optional[float] = None,
+        attn_pdrop: float = 0.0,
+        low_precision_layernorm: bool = False,
+        verbose: int = 0,
+        device: Optional[str] = None,
+    ):
+        super().__init__()
+        self.attn_impl = attn_impl
+        self.clip_qkv = clip_qkv
+        self.qk_ln = qk_ln
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.softmax_scale = softmax_scale
+        if self.softmax_scale is None:
+            self.softmax_scale = 1 / math.sqrt(self.d_model / self.n_heads)
+        self.attn_dropout_p = attn_pdrop
+        self.Wqkv = nn.Linear(self.d_model, 3 * self.d_model, device=device)
+        fuse_splits = (d_model, 2 * d_model)
+        self.Wqkv._fused = (0, fuse_splits)
+        if self.qk_ln:
+            layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm
+            self.q_ln = layernorm_class(self.d_model, device=device)
+            self.k_ln = layernorm_class(self.d_model, device=device)
+        if self.attn_impl == "flash":
+            self.attn_fn = flash_attn_fn
+        elif self.attn_impl == "triton":
+            self.attn_fn = triton_flash_attn_fn
+            if verbose:
+                warnings.warn(
+                    "While `attn_impl: triton` can be faster than `attn_impl: flash` "
+                    + "it uses more memory. When training larger models this can trigger "
+                    + "alloc retries which hurts performance. If encountered, we recommend "
+                    + "using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`."
+                )
+        elif self.attn_impl == "torch":
+            self.attn_fn = scaled_multihead_dot_product_attention
+            if torch.cuda.is_available() and verbose:
+                warnings.warn(
+                    "Using `attn_impl: torch`. If your model does not use `alibi` or "
+                    + "`prefix_lm` we recommend using `attn_impl: flash` otherwise "
+                    + "we recommend using `attn_impl: triton`."
+                )
+        else:
+            raise ValueError(f"attn_impl={attn_impl!r} is an invalid setting.")
+        self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
+        self.out_proj._is_residual = True
+    def forward(
+        self,
+        x,
+        past_key_value=None,
+        attn_bias=None,
+        attention_mask=None,
+        is_causal=True,
+        needs_weights=False,
+    ):
+        qkv = self.Wqkv(x)
+        if self.clip_qkv:
+            qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
+        (query, key, value) = qkv.chunk(3, dim=2)
+        key_padding_mask = attention_mask
+        if self.qk_ln:
+            dtype = query.dtype
+            query = self.q_ln(query).to(dtype)
+            key = self.k_ln(key).to(dtype)
+        (context, attn_weights, past_key_value) = self.attn_fn(
+            query,
+            key,
+            value,
+            self.n_heads,
+            past_key_value=past_key_value,
+            softmax_scale=self.softmax_scale,
+            attn_bias=attn_bias,
+            key_padding_mask=key_padding_mask,
+            is_causal=is_causal,
+            dropout_p=self.attn_dropout_p,
+            training=self.training,
+            needs_weights=needs_weights,
+        )
+        return (self.out_proj(context), attn_weights, past_key_value)
+class MultiQueryAttention(nn.Module):
+    """Multi-Query self attention.
+    Using torch or triton attention implemetation enables user to also use
+    additive bias.
+    """
+    def __init__(
+        self,
+        d_model: int,
+        n_heads: int,
+        attn_impl: str = "triton",
+        clip_qkv: Optional[float] = None,
+        qk_ln: bool = False,
+        softmax_scale: Optional[float] = None,
+        attn_pdrop: float = 0.0,
+        low_precision_layernorm: bool = False,
+        verbose: int = 0,
+        device: Optional[str] = None,
+    ):
+        super().__init__()
+        self.attn_impl = attn_impl
+        self.clip_qkv = clip_qkv
+        self.qk_ln = qk_ln
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.head_dim = d_model // n_heads
+        self.softmax_scale = softmax_scale
+        if self.softmax_scale is None:
+            self.softmax_scale = 1 / math.sqrt(self.head_dim)
+        self.attn_dropout_p = attn_pdrop
+        self.Wqkv = nn.Linear(d_model, d_model + 2 * self.head_dim, device=device)
+        fuse_splits = (d_model, d_model + self.head_dim)
+        self.Wqkv._fused = (0, fuse_splits)
+        if self.qk_ln:
+            layernorm_class = LPLayerNorm if low_precision_layernorm else nn.LayerNorm
+            self.q_ln = layernorm_class(d_model, device=device)
+            self.k_ln = layernorm_class(self.head_dim, device=device)
+        if self.attn_impl == "flash":
+            self.attn_fn = flash_attn_fn
+        elif self.attn_impl == "triton":
+            self.attn_fn = triton_flash_attn_fn
+            if verbose:
+                warnings.warn(
+                    "While `attn_impl: triton` can be faster than `attn_impl: flash` "
+                    + "it uses more memory. When training larger models this can trigger "
+                    + "alloc retries which hurts performance. If encountered, we recommend "
+                    + "using `attn_impl: flash` if your model does not use `alibi` or `prefix_lm`."
+                )
+        elif self.attn_impl == "torch":
+            self.attn_fn = scaled_multihead_dot_product_attention
+            if torch.cuda.is_available() and verbose:
+                warnings.warn(
+                    "Using `attn_impl: torch`. If your model does not use `alibi` or "
+                    + "`prefix_lm` we recommend using `attn_impl: flash` otherwise "
+                    + "we recommend using `attn_impl: triton`."
+                )
+        else:
+            raise ValueError(f"attn_impl={attn_impl!r} is an invalid setting.")
+        self.out_proj = nn.Linear(self.d_model, self.d_model, device=device)
+        self.out_proj._is_residual = True
+    def forward(
+        self,
+        x,
+        past_key_value=None,
+        attn_bias=None,
+        attention_mask=None,
+        is_causal=True,
+        needs_weights=False,
+    ):
+        qkv = self.Wqkv(x)
+        if self.clip_qkv:
+            qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
+        (query, key, value) = qkv.split(
+            [self.d_model, self.head_dim, self.head_dim], dim=2
+        )
+        key_padding_mask = attention_mask
+        if self.qk_ln:
+            dtype = query.dtype
+            query = self.q_ln(query).to(dtype)
+            key = self.k_ln(key).to(dtype)
+        (context, attn_weights, past_key_value) = self.attn_fn(
+            query,
+            key,
+            value,
+            self.n_heads,
+            past_key_value=past_key_value,
+            softmax_scale=self.softmax_scale,
+            attn_bias=attn_bias,
+            key_padding_mask=key_padding_mask,
+            is_causal=is_causal,
+            dropout_p=self.attn_dropout_p,
+            training=self.training,
+            needs_weights=needs_weights,
+            multiquery=True,
+        )
+        return (self.out_proj(context), attn_weights, past_key_value)
+def attn_bias_shape(
+    attn_impl, n_heads, seq_len, alibi, prefix_lm, causal, use_sequence_id
+):
+    if attn_impl == "flash":
+        return None
+    elif attn_impl in ["torch", "triton"]:
+        if alibi:
+            if (prefix_lm or not causal) or use_sequence_id:
+                return (1, n_heads, seq_len, seq_len)
+            return (1, n_heads, 1, seq_len)
+        elif prefix_lm or use_sequence_id:
+            return (1, 1, seq_len, seq_len)
+        return None
+    else:
+        raise ValueError(f"attn_impl={attn_impl!r} is an invalid setting.")
+def build_attn_bias(
+    attn_impl, attn_bias, n_heads, seq_len, causal=False, alibi=False, alibi_bias_max=8
+):
+    if attn_impl == "flash":
+        return None
+    elif attn_impl in ["torch", "triton"]:
+        if alibi:
+            (device, dtype) = (attn_bias.device, attn_bias.dtype)
+            attn_bias = attn_bias.add(
+                build_alibi_bias(
+                    n_heads,
+                    seq_len,
+                    full=not causal,
+                    alibi_bias_max=alibi_bias_max,
+                    device=device,
+                    dtype=dtype,
+                )
+            )
+        return attn_bias
+    else:
+        raise ValueError(f"attn_impl={attn_impl!r} is an invalid setting.")
+def gen_slopes(n_heads, alibi_bias_max=8, device=None):
+    _n_heads = 2 ** math.ceil(math.log2(n_heads))
+    m = torch.arange(1, _n_heads + 1, dtype=torch.float32, device=device)
+    m = m.mul(alibi_bias_max / _n_heads)
+    slopes = 1.0 / torch.pow(2, m)
+    if _n_heads != n_heads:
+        slopes = torch.concat([slopes[1::2], slopes[::2]])[:n_heads]
+    return slopes.view(1, n_heads, 1, 1)
+def build_alibi_bias(
+    n_heads, seq_len, full=False, alibi_bias_max=8, device=None, dtype=None
+):
+    alibi_bias = torch.arange(1 - seq_len, 1, dtype=torch.int32, device=device).view(
+        1, 1, 1, seq_len
+    )
+    if full:
+        alibi_bias = alibi_bias - torch.arange(
+            1 - seq_len, 1, dtype=torch.int32, device=device
+        ).view(1, 1, seq_len, 1)
+        alibi_bias = alibi_bias.abs().mul(-1)
+    slopes = gen_slopes(n_heads, alibi_bias_max, device=device)
+    alibi_bias = alibi_bias * slopes
+    return alibi_bias.to(dtype=dtype)
+# Maps an attention type name to the attention module class defined above.
+ATTN_CLASS_REGISTRY = {
+    "multihead_attention": MultiheadAttention,
+    "multiquery_attention": MultiQueryAttention,
+}

model/llava/model/language_model/mpt/blocks.py ADDED Viewed

	@@ -0,0 +1,92 @@

+"""GPT Blocks used for the GPT Model."""
+from typing import Dict, Optional, Tuple
+import torch
+import torch.nn as nn
+from .attention import ATTN_CLASS_REGISTRY
+from .norm import NORM_CLASS_REGISTRY
+class MPTMLP(nn.Module):
+    def __init__(
+        self, d_model: int, expansion_ratio: int, device: Optional[str] = None
+    ):
+        super().__init__()
+        self.up_proj = nn.Linear(d_model, expansion_ratio * d_model, device=device)
+        self.act = nn.GELU(approximate="none")
+        self.down_proj = nn.Linear(expansion_ratio * d_model, d_model, device=device)
+        self.down_proj._is_residual = True
+    def forward(self, x):
+        return self.down_proj(self.act(self.up_proj(x)))
+class MPTBlock(nn.Module):
+    def __init__(
+        self,
+        d_model: int,
+        n_heads: int,
+        expansion_ratio: int,
+        attn_config: Dict = {
+            "attn_type": "multihead_attention",
+            "attn_pdrop": 0.0,
+            "attn_impl": "triton",
+            "qk_ln": False,
+            "clip_qkv": None,
+            "softmax_scale": None,
+            "prefix_lm": False,
+            "attn_uses_sequence_id": False,
+            "alibi": False,
+            "alibi_bias_max": 8,
+        },
+        resid_pdrop: float = 0.0,
+        norm_type: str = "low_precision_layernorm",
+        verbose: int = 0,
+        device: Optional[str] = None,
+        **kwargs
+    ):
+        del kwargs
+        super().__init__()
+        norm_class = NORM_CLASS_REGISTRY[norm_type.lower()]
+        attn_class = ATTN_CLASS_REGISTRY[attn_config["attn_type"]]
+        self.norm_1 = norm_class(d_model, device=device)
+        self.attn = attn_class(
+            attn_impl=attn_config["attn_impl"],
+            clip_qkv=attn_config["clip_qkv"],
+            qk_ln=attn_config["qk_ln"],
+            softmax_scale=attn_config["softmax_scale"],
+            attn_pdrop=attn_config["attn_pdrop"],
+            d_model=d_model,
+            n_heads=n_heads,
+            verbose=verbose,
+            device=device,
+        )
+        self.norm_2 = norm_class(d_model, device=device)
+        self.ffn = MPTMLP(
+            d_model=d_model, expansion_ratio=expansion_ratio, device=device
+        )
+        self.resid_attn_dropout = nn.Dropout(resid_pdrop)
+        self.resid_ffn_dropout = nn.Dropout(resid_pdrop)
+    def forward(
+        self,
+        x: torch.Tensor,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        attn_bias: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.ByteTensor] = None,
+        is_causal: bool = True,
+    ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor]]]:
+        a = self.norm_1(x)
+        (b, attn_weights, past_key_value) = self.attn(
+            a,
+            past_key_value=past_key_value,
+            attn_bias=attn_bias,
+            attention_mask=attention_mask,
+            is_causal=is_causal,
+        )
+        x = x + self.resid_attn_dropout(b)
+        m = self.norm_2(x)
+        n = self.ffn(m)
+        x = x + self.resid_ffn_dropout(n)
+        return (x, attn_weights, past_key_value)

model/llava/model/language_model/mpt/configuration_mpt.py ADDED Viewed

	@@ -0,0 +1,199 @@

+"""A HuggingFace-style model configuration."""
+from typing import Dict, Optional, Union
+from transformers import PretrainedConfig
+# Default attention settings. MPTConfig._validate_config copies any key that
+# is missing from a config's ``attn_config`` out of this dict.
+attn_config_defaults: Dict = {
+    "attn_type": "multihead_attention",
+    "attn_pdrop": 0.0,
+    "attn_impl": "triton",
+    "qk_ln": False,
+    "clip_qkv": None,
+    "softmax_scale": None,
+    "prefix_lm": False,
+    "attn_uses_sequence_id": False,
+    "alibi": False,
+    "alibi_bias_max": 8,
+}
+# Default parameter-initialization settings, used the same way for
+# ``init_config``.
+init_config_defaults: Dict = {
+    "name": "kaiming_normal_",
+    "fan_mode": "fan_in",
+    "init_nonlinearity": "relu",
+    "init_div_is_residual": True,
+    "emb_init_std": None,
+    "emb_init_uniform_lim": None,
+    "init_std": None,
+    "init_gain": 0.0,
+}
+class MPTConfig(PretrainedConfig):
+    model_type = "mpt"
+    def __init__(
+        self,
+        d_model: int = 2048,
+        n_heads: int = 16,
+        n_layers: int = 24,
+        expansion_ratio: int = 4,
+        max_seq_len: int = 2048,
+        vocab_size: int = 50368,
+        resid_pdrop: float = 0.0,
+        emb_pdrop: float = 0.0,
+        learned_pos_emb: bool = True,
+        attn_config: Dict = attn_config_defaults,
+        init_device: str = "cpu",
+        logit_scale: Optional[Union[float, str]] = None,
+        no_bias: bool = False,
+        verbose: int = 0,
+        embedding_fraction: float = 1.0,
+        norm_type: str = "low_precision_layernorm",
+        use_cache: bool = False,
+        init_config: Dict = init_config_defaults,
+        **kwargs,
+    ):
+        """The MPT configuration class.
+        Args:
+            d_model (int): The size of the embedding dimension of the model.
+            n_heads (int): The number of attention heads.
+            n_layers (int): The number of layers in the model.
+            expansion_ratio (int): The ratio of the up/down scale in the MLP.
+            max_seq_len (int): The maximum sequence length of the model.
+            vocab_size (int): The size of the vocabulary.
+            resid_pdrop (float): The dropout probability applied to the attention output before combining with residual.
+            emb_pdrop (float): The dropout probability for the embedding layer.
+            learned_pos_emb (bool): Whether to use learned positional embeddings
+            attn_config (Dict):  A dictionary used to configure the model's attention module:
+                attn_type (str): type of attention to use. Options: multihead_attention, multiquery_attention
+                attn_pdrop (float): The dropout probability for the attention layers.
+                attn_impl (str): The attention implementation to use. One of 'torch', 'flash', or 'triton'.
+                qk_ln (bool): Whether to apply layer normalization to the queries and keys in the attention layer.
+                clip_qkv (Optional[float]): If not None, clip the queries, keys, and values in the attention layer to
+                    this value.
+                softmax_scale (Optional[float]): If not None, scale the softmax in the attention layer by this value. If None,
+                    use the default scale of ``1/sqrt(d_keys)``.
+                prefix_lm (Optional[bool]): Whether the model should operate as a Prefix LM. This requires passing an
+                    extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix
+                    can attend to one another bi-directionally. Tokens outside the prefix use causal attention.
+                attn_uses_sequence_id (Optional[bool]): Whether to restrict attention to tokens that have the same sequence_id.
+                    When the model is in `train` mode, this requires passing an extra `sequence_id` argument which indicates
+                    which sub-sequence each token belongs to.
+                    Defaults to ``False`` meaning any provided `sequence_id` will be ignored.
+                alibi (bool): Whether to use the alibi bias instead of position embeddings.
+                alibi_bias_max (int): The maximum value of the alibi bias.
+            init_device (str): The device to use for parameter initialization.
+            logit_scale (Optional[Union[float, str]]): If not None, scale the logits by this value.
+            no_bias (bool): Whether to use bias in all layers.
+            verbose (int): The verbosity level. 0 is silent.
+            embedding_fraction (float): The fraction to scale the gradients of the embedding layer by.
+            norm_type (str): choose type of norm to use
+            multiquery_attention (bool): Whether to use multiquery attention implementation.
+            use_cache (bool): Whether or not the model should return the last key/values attentions
+            init_config (Dict): A dictionary used to configure the model initialization:
+                init_config.name: The parameter initialization scheme to use. Options: 'default_', 'baseline_',
+                    'kaiming_uniform_', 'kaiming_normal_', 'neox_init_', 'small_init_', 'xavier_uniform_', or
+                    'xavier_normal_'. These mimic the parameter initialization methods in PyTorch.
+                init_div_is_residual (Union[int, float, str, bool]): Value to divide initial weights by if ``module._is_residual`` is True.
+                emb_init_std (Optional[float]): The standard deviation of the normal distribution used to initialize the embedding layer.
+                emb_init_uniform_lim (Optional[Union[Tuple[float, float], float]]): The lower and upper limits of the uniform distribution
+                    used to initialize the embedding layer. Mutually exclusive with ``emb_init_std``.
+                init_std (float): The standard deviation of the normal distribution used to initialize the model,
+                    if using the baseline_ parameter initialization scheme.
+                init_gain (float): The gain to use for parameter initialization with kaiming or xavier initialization schemes.
+                fan_mode (str): The fan mode to use for parameter initialization with kaiming initialization schemes.
+                init_nonlinearity (str): The nonlinearity to use for parameter initialization with kaiming initialization schemes.
+                ---
+                See llmfoundry.models.utils.param_init_fns.py for info on other param init config options
+        """
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.expansion_ratio = expansion_ratio
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.resid_pdrop = resid_pdrop
+        self.emb_pdrop = emb_pdrop
+        self.learned_pos_emb = learned_pos_emb
+        self.attn_config = attn_config
+        self.init_device = init_device
+        self.logit_scale = logit_scale
+        self.no_bias = no_bias
+        self.verbose = verbose
+        self.embedding_fraction = embedding_fraction
+        self.norm_type = norm_type
+        self.use_cache = use_cache
+        self.init_config = init_config
+        if "name" in kwargs:
+            del kwargs["name"]
+        if "loss_fn" in kwargs:
+            del kwargs["loss_fn"]
+        super().__init__(**kwargs)
+        self._validate_config()
+    def _set_config_defaults(self, config, config_defaults):
+        for k, v in config_defaults.items():
+            if k not in config:
+                config[k] = v
+        return config
+    def _validate_config(self):
+        self.attn_config = self._set_config_defaults(
+            self.attn_config, attn_config_defaults
+        )
+        self.init_config = self._set_config_defaults(
+            self.init_config, init_config_defaults
+        )
+        if self.d_model % self.n_heads != 0:
+            raise ValueError("d_model must be divisible by n_heads")
+        if any(
+            (
+                prob < 0 or prob > 1
+                for prob in [
+                    self.attn_config["attn_pdrop"],
+                    self.resid_pdrop,
+                    self.emb_pdrop,
+                ]
+            )
+        ):
+            raise ValueError(
+                "self.attn_config['attn_pdrop'], resid_pdrop, emb_pdrop are probabilities and must be between 0 and 1"
+            )
+        if self.attn_config["attn_impl"] not in ["torch", "flash", "triton"]:
+            raise ValueError(f"Unknown attn_impl={self.attn_config['attn_impl']}")
+        if self.attn_config["prefix_lm"] and self.attn_config["attn_impl"] not in [
+            "torch",
+            "triton",
+        ]:
+            raise NotImplementedError(
+                "prefix_lm only implemented with torch and triton attention."
+            )
+        if self.attn_config["alibi"] and self.attn_config["attn_impl"] not in [
+            "torch",
+            "triton",
+        ]:
+            raise NotImplementedError(
+                "alibi only implemented with torch and triton attention."
+            )
+        if self.attn_config["attn_uses_sequence_id"] and self.attn_config[
+            "attn_impl"
+        ] not in ["torch", "triton"]:
+            raise NotImplementedError(
+                "attn_uses_sequence_id only implemented with torch and triton attention."
+            )
+        if self.embedding_fraction > 1 or self.embedding_fraction <= 0:
+            raise ValueError(
+                "model.embedding_fraction must be between 0 (exclusive) and 1 (inclusive)!"
+            )
+        if isinstance(self.logit_scale, str) and self.logit_scale != "inv_sqrt_d_model":
+            raise ValueError(
+                f"self.logit_scale={self.logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'."
+            )
+        if self.init_config.get("name", None) is None:
+            raise ValueError(
+                f"self.init_config={self.init_config!r} 'name' needs to be set."
+            )
+        if not self.learned_pos_emb and (not self.attn_config["alibi"]):
+            raise ValueError(
+                f"Positional information must be provided to the model using either learned_pos_emb or alibi."
+            )

model/llava/model/language_model/mpt/custom_embedding.py ADDED Viewed

	@@ -0,0 +1,11 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+class SharedEmbedding(nn.Embedding):
+    def forward(self, input: Tensor, unembed: bool = False) -> Tensor:
+        if unembed:
+            return F.linear(input, self.weight)
+        return super().forward(input)

model/llava/model/language_model/mpt/flash_attn_triton.py ADDED Viewed

	@@ -0,0 +1,1087 @@

+"""
+Copied from https://github.com/HazyResearch/flash-attention/blob/eff9fe6b8076df59d64d7a3f464696738a3c7c24/flash_attn/flash_attn_triton.py
+update imports to use 'triton_pre_mlir'
+*Experimental* implementation of FlashAttention in Triton.
+Tested with triton==2.0.0.dev20221202.
+Triton 2.0 has a new backend (MLIR) but seems like it doesn't yet work for head dimensions
+other than 64:
+https://github.com/openai/triton/blob/d376020f90002757eea3ea9475d4f7cfc2ec5ead/python/triton/ops/flash_attention.py#L207
+We'll update this implementation with the new Triton backend once this is fixed.
+We use the FlashAttention implementation from Phil Tillet as a starting point.
+https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py
+Changes:
+- Implement both causal and non-causal attention.
+- Implement both self-attention and cross-attention.
+- Support arbitrary seqlens (not just multiples of 128), for both forward and backward.
+- Support all head dimensions up to 128 (not just 16, 32, 64, 128), for both forward and backward.
+- Support attention bias.
+- Speed up the forward pass a bit, and only store the LSE instead of m and l.
+- Make the backward for d=128 much faster by reducing register spilling.
+- Optionally parallelize the backward pass across seqlen_k, to deal with the case of
+small batch size * nheads.
+Caution:
+- This is an *experimental* implementation. The forward pass should be quite robust but
+I'm not 100% sure that the backward pass doesn't have race conditions (due to the Triton compiler).
+- This implementation has only been tested on A100.
+- If you plan to use headdim other than 64 and 128, you should test for race conditions
+(due to the Triton compiler), as done in tests/test_flash_attn.py
+"test_flash_attn_triton_race_condition". I've tested and fixed many race conditions
+for different head dimensions (40, 48, 64, 128, 80, 88, 96), but I'm still not 100% confident
+that there are none left for other head dimensions.
+Differences between this Triton version and the CUDA version:
+- Triton version doesn't support dropout.
+- Triton forward is generally faster than CUDA forward, while Triton backward is
+generally slower than CUDA backward. Overall Triton forward + backward is slightly slower
+than CUDA forward + backward.
+- Triton version doesn't support different sequence lengths in a batch (i.e., RaggedTensor/NestedTensor).
+- Triton version supports attention bias, while CUDA version doesn't.
+"""
+import math
+import torch
+import triton_pre_mlir as triton
+import triton_pre_mlir.language as tl
+@triton.heuristics(  # EVEN_* flags are derived from the launch arguments and select masked vs. unmasked loads/stores below.
+    {
+        "EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0,  # seqlen_q is a whole number of BLOCK_M blocks
+        "EVEN_N": lambda args: args["seqlen_k"] % args["BLOCK_N"] == 0,  # seqlen_k is a whole number of BLOCK_N blocks
+        "EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"],  # head dimension needs no padding
+    }
+)
+@triton.jit
+def _fwd_kernel(  # Forward attention kernel: each program produces one BLOCK_M block of output rows for one (batch, head) pair.
+    Q,  # base pointers: Q/K/V inputs, optional Bias, Out (result), Lse (per-row log-sum-exp), TMP (per-row scratch)
+    K,
+    V,
+    Bias,
+    Out,
+    Lse,
+    TMP,
+    softmax_scale,  # multiplier applied to the q.k scores
+    stride_qb,  # stride_*b / stride_*h / stride_*m|n: element strides for the batch, head and sequence axes
+    stride_qh,
+    stride_qm,
+    stride_kb,
+    stride_kh,
+    stride_kn,
+    stride_vb,
+    stride_vh,
+    stride_vn,
+    stride_bb,
+    stride_bh,
+    stride_bm,
+    stride_ob,
+    stride_oh,
+    stride_om,
+    nheads,  # used to split the flattened (batch, head) program index
+    seqlen_q,
+    seqlen_k,
+    seqlen_q_rounded,  # row pitch of the Lse and TMP buffers
+    headdim,
+    CACHE_KEY_SEQLEN_Q,  # the two CACHE_KEY_* arguments are not referenced in the kernel body
+    CACHE_KEY_SEQLEN_K,
+    BIAS_TYPE: tl.constexpr,  # "none", "vector" (one value per key) or "matrix" (one value per query/key pair)
+    IS_CAUSAL: tl.constexpr,
+    BLOCK_HEADDIM: tl.constexpr,
+    EVEN_M: tl.constexpr,
+    EVEN_N: tl.constexpr,
+    EVEN_HEADDIM: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    start_m = tl.program_id(0)  # index of the query-row block handled by this program
+    off_hb = tl.program_id(1)  # flattened (batch, head) index
+    off_b = off_hb // nheads
+    off_h = off_hb % nheads
+    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)  # query row indices of this block
+    offs_n = tl.arange(0, BLOCK_N)  # key/value row indices relative to the current key block
+    offs_d = tl.arange(0, BLOCK_HEADDIM)  # head-dimension indices; masked against headdim when padded
+    q_ptrs = (
+        Q
+        + off_b * stride_qb
+        + off_h * stride_qh
+        + (offs_m[:, None] * stride_qm + offs_d[None, :])  # offs_d is added without a stride: last axis is addressed as unit-stride
+    )
+    k_ptrs = (
+        K
+        + off_b * stride_kb
+        + off_h * stride_kh
+        + (offs_n[:, None] * stride_kn + offs_d[None, :])
+    )
+    v_ptrs = (
+        V
+        + off_b * stride_vb
+        + off_h * stride_vh
+        + (offs_n[:, None] * stride_vn + offs_d[None, :])
+    )
+    if BIAS_TYPE == "vector":
+        b_ptrs = Bias + off_b * stride_bb + off_h * stride_bh + offs_n  # indexed by key position only
+    elif BIAS_TYPE == "matrix":
+        b_ptrs = (
+            Bias
+            + off_b * stride_bb
+            + off_h * stride_bh
+            + (offs_m[:, None] * stride_bm + offs_n[None, :])  # indexed by (query, key) position
+        )
+    t_ptrs = TMP + off_hb * seqlen_q_rounded + offs_m  # one scratch slot per query row
+    lse_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")  # running log-sum-exp per row, starts at -inf
+    m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")  # reference value the accumulator is currently scaled by
+    acc_o = tl.zeros([BLOCK_M, BLOCK_HEADDIM], dtype=tl.float32)  # unnormalised output accumulator in float32
+    if EVEN_M & EVEN_N:  # q is loaded once; masks are dropped only when the even flags allow it
+        if EVEN_HEADDIM:
+            q = tl.load(q_ptrs)
+        else:
+            q = tl.load(q_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
+    elif EVEN_HEADDIM:
+        q = tl.load(q_ptrs, mask=offs_m[:, None] < seqlen_q, other=0.0)
+    else:
+        q = tl.load(
+            q_ptrs,
+            mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
+            other=0.0,
+        )
+    end_n = seqlen_k if not IS_CAUSAL else tl.minimum((start_m + 1) * BLOCK_M, seqlen_k)  # causal: stop at the last query row of this block
+    for start_n in range(0, end_n, BLOCK_N):  # iterate over key/value blocks
+        start_n = tl.multiple_of(start_n, BLOCK_N)  # tells the compiler start_n is a multiple of BLOCK_N
+        if EVEN_N & EVEN_M:
+            if EVEN_HEADDIM:
+                k = tl.load(k_ptrs + start_n * stride_kn)
+            else:
+                k = tl.load(
+                    k_ptrs + start_n * stride_kn,
+                    mask=offs_d[None, :] < headdim,
+                    other=0.0,
+                )
+        elif EVEN_HEADDIM:
+            k = tl.load(
+                k_ptrs + start_n * stride_kn,
+                mask=(start_n + offs_n)[:, None] < seqlen_k,
+                other=0.0,
+            )
+        else:
+            k = tl.load(
+                k_ptrs + start_n * stride_kn,
+                mask=((start_n + offs_n)[:, None] < seqlen_k)
+                & (offs_d[None, :] < headdim),
+                other=0.0,
+            )
+        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+        qk += tl.dot(q, k, trans_b=True)  # unscaled scores q @ k^T
+        if not EVEN_N:
+            qk += tl.where((start_n + offs_n)[None, :] < seqlen_k, 0, float("-inf"))  # key columns past seqlen_k become -inf
+        if IS_CAUSAL:
+            qk += tl.where(
+                offs_m[:, None] >= (start_n + offs_n)[None, :], 0, float("-inf")  # keys after the query position become -inf
+            )
+        if BIAS_TYPE != "none":
+            if BIAS_TYPE == "vector":
+                if EVEN_N:
+                    bias = tl.load(b_ptrs + start_n).to(tl.float32)
+                else:
+                    bias = tl.load(
+                        b_ptrs + start_n, mask=start_n + offs_n < seqlen_k, other=0.0
+                    ).to(tl.float32)
+                bias = bias[None, :]  # broadcast the per-key bias over the query rows
+            elif BIAS_TYPE == "matrix":
+                if EVEN_M & EVEN_N:
+                    bias = tl.load(b_ptrs + start_n).to(tl.float32)
+                else:
+                    bias = tl.load(
+                        b_ptrs + start_n,
+                        mask=(offs_m[:, None] < seqlen_q)
+                        & ((start_n + offs_n)[None, :] < seqlen_k),
+                        other=0.0,
+                    ).to(tl.float32)
+            qk = qk * softmax_scale + bias  # with a bias the scale is applied to qk itself
+            m_ij = tl.maximum(tl.max(qk, 1), lse_i)  # new reference: row max, bounded below by the running log-sum-exp
+            p = tl.exp(qk - m_ij[:, None])
+        else:
+            m_ij = tl.maximum(tl.max(qk, 1) * softmax_scale, lse_i)  # without a bias the scale is folded into the max and the exponent
+            p = tl.exp(qk * softmax_scale - m_ij[:, None])
+        l_ij = tl.sum(p, 1)  # row sums of the shifted exponentials for this key block
+        acc_o_scale = tl.exp(m_i - m_ij)  # factor that moves the accumulator from the old reference to the new one
+        tl.store(t_ptrs, acc_o_scale)  # NOTE(review): written to TMP and read straight back; presumably a Triton compiler workaround - confirm
+        acc_o_scale = tl.load(t_ptrs)
+        acc_o = acc_o * acc_o_scale[:, None]
+        if EVEN_N & EVEN_M:
+            if EVEN_HEADDIM:
+                v = tl.load(v_ptrs + start_n * stride_vn)
+            else:
+                v = tl.load(
+                    v_ptrs + start_n * stride_vn,
+                    mask=offs_d[None, :] < headdim,
+                    other=0.0,
+                )
+        elif EVEN_HEADDIM:
+            v = tl.load(
+                v_ptrs + start_n * stride_vn,
+                mask=(start_n + offs_n)[:, None] < seqlen_k,
+                other=0.0,
+            )
+        else:
+            v = tl.load(
+                v_ptrs + start_n * stride_vn,
+                mask=((start_n + offs_n)[:, None] < seqlen_k)
+                & (offs_d[None, :] < headdim),
+                other=0.0,
+            )
+        p = p.to(v.dtype)  # cast the probabilities to v's dtype before the dot product
+        acc_o += tl.dot(p, v)
+        m_i = m_ij
+        l_i_new = tl.exp(lse_i - m_ij) + l_ij  # previous sum re-expressed relative to m_ij, plus this block's sum
+        lse_i = m_ij + tl.log(l_i_new)  # updated running log-sum-exp
+    o_scale = tl.exp(m_i - lse_i)  # final normalisation: accumulator is relative to m_i, the softmax denominator is exp(lse_i)
+    tl.store(t_ptrs, o_scale)  # same store/load round-trip through TMP as inside the loop
+    o_scale = tl.load(t_ptrs)
+    acc_o = acc_o * o_scale[:, None]
+    start_m = tl.program_id(0)  # recomputed; same values as at the top of the kernel
+    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    lse_ptrs = Lse + off_hb * seqlen_q_rounded + offs_m
+    tl.store(lse_ptrs, lse_i)  # unmasked store, indexed with the seqlen_q_rounded row pitch
+    offs_d = tl.arange(0, BLOCK_HEADDIM)
+    out_ptrs = (
+        Out
+        + off_b * stride_ob
+        + off_h * stride_oh
+        + (offs_m[:, None] * stride_om + offs_d[None, :])
+    )
+    if EVEN_M:  # the output store depends only on EVEN_M / EVEN_HEADDIM, not on EVEN_N
+        if EVEN_HEADDIM:
+            tl.store(out_ptrs, acc_o)
+        else:
+            tl.store(out_ptrs, acc_o, mask=offs_d[None, :] < headdim)
+    elif EVEN_HEADDIM:
+        tl.store(out_ptrs, acc_o, mask=offs_m[:, None] < seqlen_q)
+    else:
+        tl.store(
+            out_ptrs,
+            acc_o,
+            mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
+        )
+@triton.jit
+def _bwd_preprocess_do_o_dot(  # Backward pre-pass: Delta[row] = sum over the head dimension of Out[row, :] * DO[row, :].
+    Out,  # base pointer of the forward output
+    DO,  # base pointer of the gradient with respect to the output
+    Delta,  # destination buffer, one float per (batch*head, query row)
+    stride_ob,  # element strides of Out for the batch, head and sequence axes
+    stride_oh,
+    stride_om,
+    stride_dob,  # element strides of DO for the batch, head and sequence axes
+    stride_doh,
+    stride_dom,
+    nheads,
+    seqlen_q,
+    seqlen_q_rounded,  # row pitch of the Delta buffer
+    headdim,
+    BLOCK_M: tl.constexpr,
+    BLOCK_HEADDIM: tl.constexpr,
+):
+    start_m = tl.program_id(0)  # index of the query-row block
+    off_hb = tl.program_id(1)  # flattened (batch, head) index
+    off_b = off_hb // nheads
+    off_h = off_hb % nheads
+    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_d = tl.arange(0, BLOCK_HEADDIM)
+    o = tl.load(
+        Out
+        + off_b * stride_ob
+        + off_h * stride_oh
+        + offs_m[:, None] * stride_om
+        + offs_d[None, :],
+        mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim),  # out-of-range rows/columns read as 0
+        other=0.0,
+    ).to(tl.float32)
+    do = tl.load(
+        DO
+        + off_b * stride_dob
+        + off_h * stride_doh
+        + offs_m[:, None] * stride_dom
+        + offs_d[None, :],
+        mask=(offs_m[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
+        other=0.0,
+    ).to(tl.float32)
+    delta = tl.sum(o * do, axis=1)  # row-wise dot product, accumulated in float32
+    tl.store(Delta + off_hb * seqlen_q_rounded + offs_m, delta)  # unmasked store; masked-out rows contribute 0
+@triton.jit
+def _bwd_store_dk_dv(  # Write the dk and dv tiles, masking rows past seqlen_k and columns past headdim when needed.
+    dk_ptrs,  # destination pointers for the dk tile
+    dv_ptrs,  # destination pointers for the dv tile
+    dk,
+    dv,
+    offs_n,  # key row indices of the tile, compared against seqlen_k
+    offs_d,  # head-dimension indices of the tile, compared against headdim
+    seqlen_k,
+    headdim,
+    EVEN_M: tl.constexpr,
+    EVEN_N: tl.constexpr,
+    EVEN_HEADDIM: tl.constexpr,
+):
+    if EVEN_N & EVEN_M:  # the row mask is dropped only when both flags are set
+        if EVEN_HEADDIM:
+            tl.store(dv_ptrs, dv)
+            tl.store(dk_ptrs, dk)
+        else:
+            tl.store(dv_ptrs, dv, mask=offs_d[None, :] < headdim)
+            tl.store(dk_ptrs, dk, mask=offs_d[None, :] < headdim)
+    elif EVEN_HEADDIM:
+        tl.store(dv_ptrs, dv, mask=offs_n[:, None] < seqlen_k)
+        tl.store(dk_ptrs, dk, mask=offs_n[:, None] < seqlen_k)
+    else:
+        tl.store(
+            dv_ptrs, dv, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim)
+        )
+        tl.store(
+            dk_ptrs, dk, mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim)
+        )
+@triton.jit
+def _bwd_kernel_one_col_block(  # Backward pass for one block of BLOCK_N key/value rows: accumulates dk and dv, and adds this block's contribution to dq.
+    start_n,  # index of the key/value block to process
+    Q,  # Q..D are used as plain bases here; the calling kernel offsets them to one (batch, head) first
+    K,
+    V,
+    Bias,
+    DO,
+    DQ,
+    DK,
+    DV,
+    LSE,
+    D,
+    softmax_scale,
+    stride_qm,  # only sequence-axis strides are needed at this level
+    stride_kn,
+    stride_vn,
+    stride_bm,
+    stride_dom,
+    stride_dqm,
+    stride_dkn,
+    stride_dvn,
+    seqlen_q,
+    seqlen_k,
+    headdim,
+    ATOMIC_ADD: tl.constexpr,  # True: dq is updated with tl.atomic_add; False: with a load / add / store sequence
+    BIAS_TYPE: tl.constexpr,
+    IS_CAUSAL: tl.constexpr,
+    BLOCK_HEADDIM: tl.constexpr,
+    EVEN_M: tl.constexpr,
+    EVEN_N: tl.constexpr,
+    EVEN_HEADDIM: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    begin_m = 0 if not IS_CAUSAL else start_n * BLOCK_N // BLOCK_M * BLOCK_M  # causal: start at the key-block offset rounded down to a BLOCK_M multiple
+    offs_qm = begin_m + tl.arange(0, BLOCK_M)  # query rows of the first block visited; used to build the initial pointers
+    offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N)  # key/value rows of this block
+    offs_m = tl.arange(0, BLOCK_M)  # row offsets inside a query block
+    offs_d = tl.arange(0, BLOCK_HEADDIM)
+    q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_d[None, :])
+    k_ptrs = K + (offs_n[:, None] * stride_kn + offs_d[None, :])
+    v_ptrs = V + (offs_n[:, None] * stride_vn + offs_d[None, :])
+    do_ptrs = DO + (offs_qm[:, None] * stride_dom + offs_d[None, :])
+    dq_ptrs = DQ + (offs_qm[:, None] * stride_dqm + offs_d[None, :])
+    if BIAS_TYPE == "vector":
+        b_ptrs = Bias + offs_n
+    elif BIAS_TYPE == "matrix":
+        b_ptrs = Bias + (offs_qm[:, None] * stride_bm + offs_n[None, :])
+    dv = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)  # float32 accumulators for this key block
+    dk = tl.zeros([BLOCK_N, BLOCK_HEADDIM], dtype=tl.float32)
+    if begin_m >= seqlen_q:  # no query block is visited: write the zero-initialised dk/dv and exit
+        dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :])
+        dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :])
+        _bwd_store_dk_dv(
+            dk_ptrs,
+            dv_ptrs,
+            dk,
+            dv,
+            offs_n,
+            offs_d,
+            seqlen_k,
+            headdim,
+            EVEN_M=EVEN_M,
+            EVEN_N=EVEN_N,
+            EVEN_HEADDIM=EVEN_HEADDIM,
+        )
+        return
+    if EVEN_N & EVEN_M:  # k and v are loaded once and reused for every query block
+        if EVEN_HEADDIM:
+            k = tl.load(k_ptrs)
+            v = tl.load(v_ptrs)
+        else:
+            k = tl.load(k_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
+            v = tl.load(v_ptrs, mask=offs_d[None, :] < headdim, other=0.0)
+    elif EVEN_HEADDIM:
+        k = tl.load(k_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0)
+        v = tl.load(v_ptrs, mask=offs_n[:, None] < seqlen_k, other=0.0)
+    else:
+        k = tl.load(
+            k_ptrs,
+            mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim),
+            other=0.0,
+        )
+        v = tl.load(
+            v_ptrs,
+            mask=(offs_n[:, None] < seqlen_k) & (offs_d[None, :] < headdim),
+            other=0.0,
+        )
+    num_block_m = tl.cdiv(seqlen_q, BLOCK_M)
+    for start_m in range(begin_m, num_block_m * BLOCK_M, BLOCK_M):  # iterate over query blocks from begin_m to the end
+        start_m = tl.multiple_of(start_m, BLOCK_M)  # tells the compiler start_m is a multiple of BLOCK_M
+        offs_m_curr = start_m + offs_m
+        if EVEN_M & EVEN_HEADDIM:
+            q = tl.load(q_ptrs)
+        elif EVEN_HEADDIM:
+            q = tl.load(q_ptrs, mask=offs_m_curr[:, None] < seqlen_q, other=0.0)
+        else:
+            q = tl.load(
+                q_ptrs,
+                mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
+                other=0.0,
+            )
+        qk = tl.dot(q, k, trans_b=True)  # recompute the unscaled scores q @ k^T
+        if not EVEN_N:
+            qk = tl.where(offs_n[None, :] < seqlen_k, qk, float("-inf"))  # key columns past seqlen_k become -inf
+        if IS_CAUSAL:
+            qk = tl.where(offs_m_curr[:, None] >= offs_n[None, :], qk, float("-inf"))  # keys after the query position become -inf
+        if BIAS_TYPE != "none":
+            tl.debug_barrier()  # NOTE(review): presumably guards against the race conditions mentioned in the module docstring - confirm
+            if BIAS_TYPE == "vector":
+                if EVEN_N:
+                    bias = tl.load(b_ptrs).to(tl.float32)
+                else:
+                    bias = tl.load(b_ptrs, mask=offs_n < seqlen_k, other=0.0).to(
+                        tl.float32
+                    )
+                bias = bias[None, :]
+            elif BIAS_TYPE == "matrix":
+                if EVEN_M & EVEN_N:
+                    bias = tl.load(b_ptrs).to(tl.float32)
+                else:
+                    bias = tl.load(
+                        b_ptrs,
+                        mask=(offs_m_curr[:, None] < seqlen_q)
+                        & (offs_n[None, :] < seqlen_k),
+                        other=0.0,
+                    ).to(tl.float32)
+            qk = qk * softmax_scale + bias
+        if not EVEN_M & EVEN_HEADDIM:  # parses as not (EVEN_M & EVEN_HEADDIM): barrier on every non-even path
+            tl.debug_barrier()
+        lse_i = tl.load(LSE + offs_m_curr)  # unmasked load of the per-row values in LSE
+        if BIAS_TYPE == "none":
+            p = tl.exp(qk * softmax_scale - lse_i[:, None])  # without a bias the scale has not been applied to qk yet
+        else:
+            p = tl.exp(qk - lse_i[:, None])
+        if EVEN_M & EVEN_HEADDIM:
+            do = tl.load(do_ptrs)
+        else:
+            do = tl.load(
+                do_ptrs,
+                mask=(offs_m_curr[:, None] < seqlen_q) & (offs_d[None, :] < headdim),
+                other=0.0,
+            )
+        dv += tl.dot(p.to(do.dtype), do, trans_a=True)  # dv += p^T @ do
+        if not EVEN_M & EVEN_HEADDIM:
+            tl.debug_barrier()
+        dp = tl.dot(do, v, trans_b=True)  # dp = do @ v^T
+        if not EVEN_HEADDIM:
+            tl.debug_barrier()
+        Di = tl.load(D + offs_m_curr)  # unmasked load of the per-row values in D
+        ds = (p * (dp - Di[:, None]) * softmax_scale).to(q.dtype)  # score gradient, cast to q's dtype for the dot products
+        dk += tl.dot(ds, q, trans_a=True)  # dk += ds^T @ q
+        if not EVEN_M & EVEN_HEADDIM:
+            tl.debug_barrier()
+        if not ATOMIC_ADD:  # read-modify-write of dq with plain loads and stores
+            if EVEN_M & EVEN_HEADDIM:
+                dq = tl.load(dq_ptrs, eviction_policy="evict_last")
+                dq += tl.dot(ds, k)
+                tl.store(dq_ptrs, dq, eviction_policy="evict_last")
+            elif EVEN_HEADDIM:
+                dq = tl.load(
+                    dq_ptrs,
+                    mask=offs_m_curr[:, None] < seqlen_q,
+                    other=0.0,
+                    eviction_policy="evict_last",
+                )
+                dq += tl.dot(ds, k)
+                tl.store(
+                    dq_ptrs,
+                    dq,
+                    mask=offs_m_curr[:, None] < seqlen_q,
+                    eviction_policy="evict_last",
+                )
+            else:
+                dq = tl.load(
+                    dq_ptrs,
+                    mask=(offs_m_curr[:, None] < seqlen_q)
+                    & (offs_d[None, :] < headdim),
+                    other=0.0,
+                    eviction_policy="evict_last",
+                )
+                dq += tl.dot(ds, k)
+                tl.store(
+                    dq_ptrs,
+                    dq,
+                    mask=(offs_m_curr[:, None] < seqlen_q)
+                    & (offs_d[None, :] < headdim),
+                    eviction_policy="evict_last",
+                )
+        else:  # dq contribution is added with tl.atomic_add
+            dq = tl.dot(ds, k)
+            if EVEN_M & EVEN_HEADDIM:
+                tl.atomic_add(dq_ptrs, dq)
+            elif EVEN_HEADDIM:
+                tl.atomic_add(dq_ptrs, dq, mask=offs_m_curr[:, None] < seqlen_q)
+            else:
+                tl.atomic_add(
+                    dq_ptrs,
+                    dq,
+                    mask=(offs_m_curr[:, None] < seqlen_q)
+                    & (offs_d[None, :] < headdim),
+                )
+        dq_ptrs += BLOCK_M * stride_dqm  # advance all query-indexed pointers to the next query block
+        q_ptrs += BLOCK_M * stride_qm
+        do_ptrs += BLOCK_M * stride_dom
+        if BIAS_TYPE == "matrix":
+            b_ptrs += BLOCK_M * stride_bm  # only the matrix bias depends on the query row
+    dv_ptrs = DV + (offs_n[:, None] * stride_dvn + offs_d[None, :])
+    dk_ptrs = DK + (offs_n[:, None] * stride_dkn + offs_d[None, :])
+    _bwd_store_dk_dv(  # write the accumulated dk/dv for this key block
+        dk_ptrs,
+        dv_ptrs,
+        dk,
+        dv,
+        offs_n,
+        offs_d,
+        seqlen_k,
+        headdim,
+        EVEN_M=EVEN_M,
+        EVEN_N=EVEN_N,
+        EVEN_HEADDIM=EVEN_HEADDIM,
+    )
+def init_to_zero(name):
+    return lambda nargs: nargs[name].zero_()
+@triton.autotune(  # two candidate configurations that differ only in SEQUENCE_PARALLEL
+    configs=[
+        triton.Config(
+            {"BLOCK_M": 128, "BLOCK_N": 128, "SEQUENCE_PARALLEL": False},
+            num_warps=8,
+            num_stages=1,
+            pre_hook=init_to_zero("DQ"),  # hook zeroes the DQ argument in place
+        ),
+        triton.Config(
+            {"BLOCK_M": 128, "BLOCK_N": 128, "SEQUENCE_PARALLEL": True},
+            num_warps=8,
+            num_stages=1,
+            pre_hook=init_to_zero("DQ"),
+        ),
+    ],
+    key=[  # arguments listed as the autotuning key
+        "CACHE_KEY_SEQLEN_Q",
+        "CACHE_KEY_SEQLEN_K",
+        "BIAS_TYPE",
+        "IS_CAUSAL",
+        "BLOCK_HEADDIM",
+    ],
+)
+@triton.heuristics(  # same EVEN_* flags as the forward kernel
+    {
+        "EVEN_M": lambda args: args["seqlen_q"] % args["BLOCK_M"] == 0,
+        "EVEN_N": lambda args: args["seqlen_k"] % args["BLOCK_N"] == 0,
+        "EVEN_HEADDIM": lambda args: args["headdim"] == args["BLOCK_HEADDIM"],
+    }
+)
+@triton.jit
+def _bwd_kernel(  # Backward entry kernel: offsets all pointers to one (batch, head) pair, then processes key blocks via _bwd_kernel_one_col_block.
+    Q,  # base pointers: forward inputs (Q, K, V, Bias), output gradient DO, gradient outputs DQ/DK/DV, and per-row buffers LSE and D
+    K,
+    V,
+    Bias,
+    DO,
+    DQ,
+    DK,
+    DV,
+    LSE,
+    D,
+    softmax_scale,
+    stride_qb,  # stride_*b / stride_*h / stride_*m|n: element strides for the batch, head and sequence axes
+    stride_qh,
+    stride_qm,
+    stride_kb,
+    stride_kh,
+    stride_kn,
+    stride_vb,
+    stride_vh,
+    stride_vn,
+    stride_bb,
+    stride_bh,
+    stride_bm,
+    stride_dob,
+    stride_doh,
+    stride_dom,
+    stride_dqb,
+    stride_dqh,
+    stride_dqm,
+    stride_dkb,
+    stride_dkh,
+    stride_dkn,
+    stride_dvb,
+    stride_dvh,
+    stride_dvn,
+    nheads,
+    seqlen_q,
+    seqlen_k,
+    seqlen_q_rounded,  # row pitch of the LSE and D buffers
+    headdim,
+    CACHE_KEY_SEQLEN_Q,  # referenced only through the autotune key above, not in the kernel body
+    CACHE_KEY_SEQLEN_K,
+    BIAS_TYPE: tl.constexpr,
+    IS_CAUSAL: tl.constexpr,
+    BLOCK_HEADDIM: tl.constexpr,
+    SEQUENCE_PARALLEL: tl.constexpr,  # True: one program per key block; False: one program loops over all key blocks
+    EVEN_M: tl.constexpr,
+    EVEN_N: tl.constexpr,
+    EVEN_HEADDIM: tl.constexpr,
+    BLOCK_M: tl.constexpr,
+    BLOCK_N: tl.constexpr,
+):
+    off_hb = tl.program_id(1)  # flattened (batch, head) index
+    off_b = off_hb // nheads
+    off_h = off_hb % nheads
+    Q += off_b * stride_qb + off_h * stride_qh  # move every base pointer to this (batch, head) slice
+    K += off_b * stride_kb + off_h * stride_kh
+    V += off_b * stride_vb + off_h * stride_vh
+    DO += off_b * stride_dob + off_h * stride_doh
+    DQ += off_b * stride_dqb + off_h * stride_dqh
+    DK += off_b * stride_dkb + off_h * stride_dkh
+    DV += off_b * stride_dvb + off_h * stride_dvh
+    if BIAS_TYPE != "none":
+        Bias += off_b * stride_bb + off_h * stride_bh
+    D += off_hb * seqlen_q_rounded  # D and LSE are indexed by the flattened (batch, head) index and the rounded row pitch
+    LSE += off_hb * seqlen_q_rounded
+    if not SEQUENCE_PARALLEL:
+        num_block_n = tl.cdiv(seqlen_k, BLOCK_N)
+        for start_n in range(0, num_block_n):  # this program visits every key block in turn
+            _bwd_kernel_one_col_block(
+                start_n,
+                Q,
+                K,
+                V,
+                Bias,
+                DO,
+                DQ,
+                DK,
+                DV,
+                LSE,
+                D,
+                softmax_scale,
+                stride_qm,
+                stride_kn,
+                stride_vn,
+                stride_bm,
+                stride_dom,
+                stride_dqm,
+                stride_dkn,
+                stride_dvn,
+                seqlen_q,
+                seqlen_k,
+                headdim,
+                ATOMIC_ADD=False,  # dq is updated with plain load/add/store in this mode
+                BIAS_TYPE=BIAS_TYPE,
+                IS_CAUSAL=IS_CAUSAL,
+                BLOCK_HEADDIM=BLOCK_HEADDIM,
+                EVEN_M=EVEN_M,
+                EVEN_N=EVEN_N,
+                EVEN_HEADDIM=EVEN_HEADDIM,
+                BLOCK_M=BLOCK_M,
+                BLOCK_N=BLOCK_N,
+            )
+    else:
+        start_n = tl.program_id(0)  # grid axis 0 selects the key block in sequence-parallel mode
+        _bwd_kernel_one_col_block(
+            start_n,
+            Q,
+            K,
+            V,
+            Bias,
+            DO,
+            DQ,
+            DK,
+            DV,
+            LSE,
+            D,
+            softmax_scale,
+            stride_qm,
+            stride_kn,
+            stride_vn,
+            stride_bm,
+            stride_dom,
+            stride_dqm,
+            stride_dkn,
+            stride_dvn,
+            seqlen_q,
+            seqlen_k,
+            headdim,
+            ATOMIC_ADD=True,  # dq is updated with tl.atomic_add in this mode
+            BIAS_TYPE=BIAS_TYPE,
+            IS_CAUSAL=IS_CAUSAL,
+            BLOCK_HEADDIM=BLOCK_HEADDIM,
+            EVEN_M=EVEN_M,
+            EVEN_N=EVEN_N,
+            EVEN_HEADDIM=EVEN_HEADDIM,
+            BLOCK_M=BLOCK_M,
+            BLOCK_N=BLOCK_N,
+        )
+def _flash_attn_forward(q, k, v, bias=None, causal=False, softmax_scale=None):
+    (batch, seqlen_q, nheads, d) = q.shape
+    (_, seqlen_k, _, _) = k.shape
+    assert k.shape == (batch, seqlen_k, nheads, d)
+    assert v.shape == (batch, seqlen_k, nheads, d)
+    assert d <= 128, "FlashAttention only support head dimensions up to 128"
+    assert q.dtype == k.dtype == v.dtype, "All tensors must have the same type"
+    assert q.dtype in [torch.float16, torch.bfloat16], "Only support fp16 and bf16"
+    assert q.is_cuda and k.is_cuda and v.is_cuda
+    softmax_scale = softmax_scale or 1.0 / math.sqrt(d)
+    has_bias = bias is not None
+    bias_type = "none"
+    if has_bias:
+        assert bias.dtype in [q.dtype, torch.float]
+        assert bias.is_cuda
+        assert bias.dim() == 4
+        if bias.stride(-1) != 1:
+            bias = bias.contiguous()
+        if bias.shape[2:] == (1, seqlen_k):
+            bias_type = "vector"
+        elif bias.shape[2:] == (seqlen_q, seqlen_k):
+            bias_type = "matrix"
+        else:
+            raise RuntimeError(
+                "Last 2 dimensions of bias must be (1, seqlen_k) or (seqlen_q, seqlen_k)"
+            )
+        bias = bias.expand(batch, nheads, seqlen_q, seqlen_k)
+    bias_strides = (
+        (bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0)
+    )
+    seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128
+    lse = torch.empty(
+        (batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32
+    )
+    tmp = torch.empty(
+        (batch, nheads, seqlen_q_rounded), device=q.device, dtype=torch.float32
+    )
+    o = torch.empty_like(q)
+    BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16)
+    BLOCK = 128
+    num_warps = 4 if d <= 64 else 8
+    grid = lambda META: (triton.cdiv(seqlen_q, META["BLOCK_M"]), batch * nheads)
+    _fwd_kernel[grid](
+        q,
+        k,
+        v,
+        bias,
+        o,
+        lse,
+        tmp,
+        softmax_scale,
+        q.stride(0),
+        q.stride(2),
+        q.stride(1),
+        k.stride(0),
+        k.stride(2),
+        k.stride(1),
+        v.stride(0),
+        v.stride(2),
+        v.stride(1),
+        *bias_strides,
+        o.stride(0),
+        o.stride(2),
+        o.stride(1),
+        nheads,
+        seqlen_q,
+        seqlen_k,
+        seqlen_q_rounded,
+        d,
+        seqlen_q // 32,
+        seqlen_k // 32,
+        bias_type,
+        causal,
+        BLOCK_HEADDIM,
+        BLOCK_M=BLOCK,
+        BLOCK_N=BLOCK,
+        num_warps=num_warps,
+        num_stages=1
+    )
+    return (o, lse, softmax_scale)
+def _flash_attn_backward(
+    do, q, k, v, o, lse, dq, dk, dv, bias=None, causal=False, softmax_scale=None
+):
+    if do.stride(-1) != 1:
+        do = do.contiguous()
+    (batch, seqlen_q, nheads, d) = q.shape
+    (_, seqlen_k, _, _) = k.shape
+    assert d <= 128
+    seqlen_q_rounded = math.ceil(seqlen_q / 128) * 128
+    assert lse.shape == (batch, nheads, seqlen_q_rounded)
+    assert q.stride(-1) == k.stride(-1) == v.stride(-1) == o.stride(-1) == 1
+    assert dq.stride(-1) == dk.stride(-1) == dv.stride(-1) == 1
+    softmax_scale = softmax_scale or 1.0 / math.sqrt(d)
+    dq_accum = torch.empty_like(q, dtype=torch.float32)
+    delta = torch.empty_like(lse)
+    BLOCK_HEADDIM = max(triton.next_power_of_2(d), 16)
+    grid = lambda META: (triton.cdiv(seqlen_q, META["BLOCK_M"]), batch * nheads)
+    _bwd_preprocess_do_o_dot[grid](
+        o,
+        do,
+        delta,
+        o.stride(0),
+        o.stride(2),
+        o.stride(1),
+        do.stride(0),
+        do.stride(2),
+        do.stride(1),
+        nheads,
+        seqlen_q,
+        seqlen_q_rounded,
+        d,
+        BLOCK_M=128,
+        BLOCK_HEADDIM=BLOCK_HEADDIM,
+    )
+    has_bias = bias is not None
+    bias_type = "none"
+    if has_bias:
+        assert bias.dtype in [q.dtype, torch.float]
+        assert bias.is_cuda
+        assert bias.dim() == 4
+        assert bias.stride(-1) == 1
+        if bias.shape[2:] == (1, seqlen_k):
+            bias_type = "vector"
+        elif bias.shape[2:] == (seqlen_q, seqlen_k):
+            bias_type = "matrix"
+        else:
+            raise RuntimeError(
+                "Last 2 dimensions of bias must be (1, seqlen_k) or (seqlen_q, seqlen_k)"
+            )
+        bias = bias.expand(batch, nheads, seqlen_q, seqlen_k)
+    bias_strides = (
+        (bias.stride(0), bias.stride(1), bias.stride(2)) if has_bias else (0, 0, 0)
+    )
+    grid = lambda META: (
+        triton.cdiv(seqlen_k, META["BLOCK_N"]) if META["SEQUENCE_PARALLEL"] else 1,
+        batch * nheads,
+    )
+    _bwd_kernel[grid](
+        q,
+        k,
+        v,
+        bias,
+        do,
+        dq_accum,
+        dk,
+        dv,
+        lse,
+        delta,
+        softmax_scale,
+        q.stride(0),
+        q.stride(2),
+        q.stride(1),
+        k.stride(0),
+        k.stride(2),
+        k.stride(1),
+        v.stride(0),
+        v.stride(2),
+        v.stride(1),
+        *bias_strides,
+        do.stride(0),
+        do.stride(2),
+        do.stride(1),
+        dq_accum.stride(0),
+        dq_accum.stride(2),
+        dq_accum.stride(1),
+        dk.stride(0),
+        dk.stride(2),
+        dk.stride(1),
+        dv.stride(0),
+        dv.stride(2),
+        dv.stride(1),
+        nheads,
+        seqlen_q,
+        seqlen_k,
+        seqlen_q_rounded,
+        d,
+        seqlen_q // 32,
+        seqlen_k // 32,
+        bias_type,
+        causal,
+        BLOCK_HEADDIM
+    )
+    dq.copy_(dq_accum)
+class FlashAttnQKVPackedFunc(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, qkv, bias=None, causal=False, softmax_scale=None):
+        """
+        qkv: (batch, seqlen, 3, nheads, headdim)
+        bias: optional, shape broadcastible to (batch, nheads, seqlen, seqlen).
+            For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen).
+            ALiBi mask for non-causal would have shape (1, nheads, seqlen, seqlen)
+        """
+        if qkv.stride(-1) != 1:
+            qkv = qkv.contiguous()
+        (o, lse, ctx.softmax_scale) = _flash_attn_forward(
+            qkv[:, :, 0],
+            qkv[:, :, 1],
+            qkv[:, :, 2],
+            bias=bias,
+            causal=causal,
+            softmax_scale=softmax_scale,
+        )
+        ctx.save_for_backward(qkv, o, lse, bias)
+        ctx.causal = causal
+        return o
+    @staticmethod
+    def backward(ctx, do):
+        (qkv, o, lse, bias) = ctx.saved_tensors
+        assert not ctx.needs_input_grad[
+            1
+        ], "FlashAttention does not support bias gradient yet"
+        with torch.inference_mode():
+            dqkv = torch.empty_like(qkv)
+            _flash_attn_backward(
+                do,
+                qkv[:, :, 0],
+                qkv[:, :, 1],
+                qkv[:, :, 2],
+                o,
+                lse,
+                dqkv[:, :, 0],
+                dqkv[:, :, 1],
+                dqkv[:, :, 2],
+                bias=bias,
+                causal=ctx.causal,
+                softmax_scale=ctx.softmax_scale,
+            )
+        return (dqkv, None, None, None)
+flash_attn_qkvpacked_func = FlashAttnQKVPackedFunc.apply
+class FlashAttnKVPackedFunc(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, q, kv, bias=None, causal=False, softmax_scale=None):
+        """
+        q: (batch, seqlen_q, nheads, headdim)
+        kv: (batch, seqlen_k, 2, nheads, headdim)
+        bias: optional, shape broadcastible to (batch, nheads, seqlen_q, seqlen_k).
+            For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
+            ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)
+        """
+        (q, kv) = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, kv]]
+        (o, lse, ctx.softmax_scale) = _flash_attn_forward(
+            q,
+            kv[:, :, 0],
+            kv[:, :, 1],
+            bias=bias,
+            causal=causal,
+            softmax_scale=softmax_scale,
+        )
+        ctx.save_for_backward(q, kv, o, lse, bias)
+        ctx.causal = causal
+        return o
+    @staticmethod
+    def backward(ctx, do):
+        (q, kv, o, lse, bias) = ctx.saved_tensors
+        if len(ctx.needs_input_grad) >= 3:
+            assert not ctx.needs_input_grad[
+                2
+            ], "FlashAttention does not support bias gradient yet"
+        with torch.inference_mode():
+            dq = torch.empty_like(q)
+            dkv = torch.empty_like(kv)
+            _flash_attn_backward(
+                do,
+                q,
+                kv[:, :, 0],
+                kv[:, :, 1],
+                o,
+                lse,
+                dq,
+                dkv[:, :, 0],
+                dkv[:, :, 1],
+                bias=bias,
+                causal=ctx.causal,
+                softmax_scale=ctx.softmax_scale,
+            )
+        return (dq, dkv, None, None, None)
+flash_attn_kvpacked_func = FlashAttnKVPackedFunc.apply
+class FlashAttnFunc(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, q, k, v, bias=None, causal=False, softmax_scale=None):
+        """
+        q: (batch_size, seqlen_q, nheads, headdim)
+        k, v: (batch_size, seqlen_k, nheads, headdim)
+        bias: optional, shape broadcastible to (batch, nheads, seqlen_q, seqlen_k).
+            For example, ALiBi mask for causal would have shape (1, nheads, 1, seqlen_k).
+            ALiBi mask for non-causal would have shape (1, nheads, seqlen_q, seqlen_k)
+        """
+        (q, k, v) = [x if x.stride(-1) == 1 else x.contiguous() for x in [q, k, v]]
+        (o, lse, ctx.softmax_scale) = _flash_attn_forward(
+            q, k, v, bias=bias, causal=causal, softmax_scale=softmax_scale
+        )
+        ctx.save_for_backward(q, k, v, o, lse, bias)
+        ctx.causal = causal
+        return o
+    @staticmethod
+    def backward(ctx, do):
+        (q, k, v, o, lse, bias) = ctx.saved_tensors
+        assert not ctx.needs_input_grad[
+            3
+        ], "FlashAttention does not support bias gradient yet"
+        with torch.inference_mode():
+            dq = torch.empty_like(q)
+            dk = torch.empty_like(k)
+            dv = torch.empty_like(v)
+            _flash_attn_backward(
+                do,
+                q,
+                k,
+                v,
+                o,
+                lse,
+                dq,
+                dk,
+                dv,
+                bias=bias,
+                causal=ctx.causal,
+                softmax_scale=ctx.softmax_scale,
+            )
+        return (dq, dk, dv, None, None, None)
+flash_attn_func = FlashAttnFunc.apply

model/llava/model/language_model/mpt/hf_prefixlm_converter.py ADDED Viewed

	@@ -0,0 +1,750 @@

+"""Converts Huggingface Causal LM to Prefix LM.
+Conversion does lightweight surgery on a HuggingFace
+Causal LM to convert it to a Prefix LM.
+Prefix LMs accept a `bidirectional_mask` input in `forward`
+and treat the input prompt as the prefix in `generate`.
+"""
+import math
+import warnings
+from types import MethodType
+from typing import Any, Dict, List, Optional, Tuple, Union
+import torch
+from transformers.models.bloom.modeling_bloom import (
+    BaseModelOutputWithPastAndCrossAttentions, BloomForCausalLM, BloomModel,
+    CausalLMOutputWithCrossAttentions, CrossEntropyLoss)
+from transformers.models.bloom.modeling_bloom import \
+    _expand_mask as _expand_mask_bloom
+from transformers.models.bloom.modeling_bloom import \
+    _make_causal_mask as _make_causal_mask_bloom
+from transformers.models.bloom.modeling_bloom import logging
+from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
+from transformers.models.gpt_neo.modeling_gpt_neo import GPTNeoForCausalLM
+from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXForCausalLM
+from transformers.models.gptj.modeling_gptj import GPTJForCausalLM
+from transformers.models.opt.modeling_opt import OPTForCausalLM
+from transformers.models.opt.modeling_opt import \
+    _expand_mask as _expand_mask_opt
+from transformers.models.opt.modeling_opt import \
+    _make_causal_mask as _make_causal_mask_opt
+# Module-level logger, built with the `logging` helper imported from transformers' BLOOM module.
+logger = logging.get_logger(__name__)
+# Concrete model classes accepted by the isinstance check in
+# `_convert_gpt_causal_lm_to_prefix_lm`.
+_SUPPORTED_GPT_MODELS = (
+    GPT2LMHeadModel,
+    GPTJForCausalLM,
+    GPTNeoForCausalLM,
+    GPTNeoXForCausalLM,
+)
+# Typing alias over the same four classes, used for annotations.
+CAUSAL_GPT_TYPES = Union[
+    GPT2LMHeadModel, GPTJForCausalLM, GPTNeoForCausalLM, GPTNeoXForCausalLM
+]
+def _convert_gpt_causal_lm_to_prefix_lm(model: CAUSAL_GPT_TYPES) -> CAUSAL_GPT_TYPES:
+    """Converts a GPT-style Causal LM to a Prefix LM.
+    Supported HuggingFace model classes:
+        - `GPT2LMHeadModel`
+        - `GPTNeoForCausalLM`
+        - `GPTNeoXForCausalLM`
+        - `GPTJForCausalLM`
+    See `convert_hf_causal_lm_to_prefix_lm` for more details.
+    """
+    if hasattr(model, "_prefix_lm_converted"):
+        return model
+    assert isinstance(model, _SUPPORTED_GPT_MODELS)
+    assert (
+        model.config.add_cross_attention == False
+    ), "Only supports GPT-style decoder-only models"
+    def _get_attn_modules(model: CAUSAL_GPT_TYPES) -> List[torch.nn.Module]:
+        """Helper that gets a list of the model's attention modules.
+        Each module has a `bias` buffer used for causal masking. The Prefix LM
+        conversion adds logic to dynamically manipulate these biases to support
+        Prefix LM attention masking.
+        """
+        attn_modules = []
+        if isinstance(model, GPTNeoXForCausalLM):
+            blocks = model.gpt_neox.layers
+        else:
+            blocks = model.transformer.h
+        for block in blocks:
+            if isinstance(model, GPTNeoForCausalLM):
+                if block.attn.attention_type != "global":
+                    continue
+                attn_module = block.attn.attention
+            elif isinstance(model, GPTNeoXForCausalLM):
+                attn_module = block.attention
+            else:
+                attn_module = block.attn
+            attn_modules.append(attn_module)
+        return attn_modules
+    setattr(model, "_original_forward", getattr(model, "forward"))
+    setattr(model, "_original_generate", getattr(model, "generate"))
+    def forward(
+        self: CAUSAL_GPT_TYPES,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        bidirectional_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        """Wraps original forward to enable PrefixLM attention."""
+        def call_og_forward():
+            if isinstance(self, GPTNeoXForCausalLM):
+                return self._original_forward(
+                    input_ids=input_ids,
+                    past_key_values=past_key_values,
+                    attention_mask=attention_mask,
+                    head_mask=head_mask,
+                    inputs_embeds=inputs_embeds,
+                    labels=labels,
+                    use_cache=use_cache,
+                    output_attentions=output_attentions,
+                    output_hidden_states=output_hidden_states,
+                    return_dict=return_dict,
+                )
+            else:
+                return self._original_forward(
+                    input_ids=input_ids,
+                    past_key_values=past_key_values,
+                    attention_mask=attention_mask,
+                    token_type_ids=token_type_ids,
+                    position_ids=position_ids,
+                    head_mask=head_mask,
+                    inputs_embeds=inputs_embeds,
+                    labels=labels,
+                    use_cache=use_cache,
+                    output_attentions=output_attentions,
+                    output_hidden_states=output_hidden_states,
+                    return_dict=return_dict,
+                )
+        if bidirectional_mask is None:
+            return call_og_forward()
+        assert isinstance(bidirectional_mask, torch.Tensor)
+        attn_modules = _get_attn_modules(model)
+        (b, s) = bidirectional_mask.shape
+        max_length = attn_modules[0].bias.shape[-1]
+        if s > max_length:
+            raise ValueError(
+                f"bidirectional_mask sequence length (={s}) exceeds the "
+                + f"max length allowed by the model ({max_length})."
+            )
+        assert s <= max_length
+        if s < max_length:
+            pad = torch.zeros(
+                (int(b), int(max_length - s)),
+                dtype=bidirectional_mask.dtype,
+                device=bidirectional_mask.device,
+            )
+            bidirectional_mask = torch.cat([bidirectional_mask, pad], dim=1)
+        bidirectional = bidirectional_mask.unsqueeze(1).unsqueeze(1)
+        for attn_module in attn_modules:
+            attn_module.bias.data = torch.logical_or(
+                attn_module.bias.data, bidirectional
+            )
+        output = call_og_forward()
+        for attn_module in attn_modules:
+            attn_module.bias.data = torch.tril(attn_module.bias.data[0, 0])[None, None]
+        return output
+    def generate(self: CAUSAL_GPT_TYPES, *args: tuple, **kwargs: Dict[str, Any]):
+        """Wraps original generate to enable PrefixLM attention."""
+        attn_modules = _get_attn_modules(model)
+        for attn_module in attn_modules:
+            attn_module.bias.data[:] = 1
+        output = self._original_generate(*args, **kwargs)
+        for attn_module in attn_modules:
+            attn_module.bias.data = torch.tril(attn_module.bias.data[0, 0])[None, None]
+        return output
+    setattr(model, "forward", MethodType(forward, model))
+    setattr(model, "generate", MethodType(generate, model))
+    setattr(model, "_prefix_lm_converted", True)
+    return model
+def _convert_bloom_causal_lm_to_prefix_lm(model: BloomForCausalLM) -> BloomForCausalLM:
+    """Converts a BLOOM Causal LM to a Prefix LM.
+    Supported HuggingFace model classes:
+        - `BloomForCausalLM`
+    See `convert_hf_causal_lm_to_prefix_lm` for more details.
+    """
+    # The model is patched in place: `_prepare_attn_mask`, `_build_alibi_tensor`
+    # and `forward` are rebound on `model.transformer`, and `forward` and
+    # `prepare_inputs_for_generation` on `model` itself. A model that already
+    # carries `_prefix_lm_converted` is returned unchanged.
+    if hasattr(model, "_prefix_lm_converted"):
+        return model
+    assert isinstance(model, BloomForCausalLM)
+    assert (
+        model.config.add_cross_attention == False
+    ), "Only supports BLOOM decoder-only models"
+    # Replacement mask builder: causal mask (only when more than one token is
+    # fed), combined with the expanded `bidirectional_mask` via logical AND,
+    # then OR-ed with the expanded padding mask.
+    # NOTE(review): presumably True marks positions that must not be attended
+    # to (BLOOM convention) - confirm against `_make_causal_mask`/`_expand_mask`.
+    def _prepare_attn_mask(
+        self: BloomModel,
+        attention_mask: torch.Tensor,
+        bidirectional_mask: Optional[torch.Tensor],
+        input_shape: Tuple[int, int],
+        past_key_values_length: int,
+    ) -> torch.BoolTensor:
+        combined_attention_mask = None
+        device = attention_mask.device
+        (_, src_length) = input_shape
+        if src_length > 1:
+            combined_attention_mask = _make_causal_mask_bloom(
+                input_shape,
+                device=device,
+                past_key_values_length=past_key_values_length,
+            )
+            if bidirectional_mask is not None:
+                assert attention_mask.shape == bidirectional_mask.shape
+                expanded_bidirectional_mask = _expand_mask_bloom(
+                    bidirectional_mask, tgt_length=src_length
+                )
+                combined_attention_mask = torch.logical_and(
+                    combined_attention_mask, expanded_bidirectional_mask
+                )
+        expanded_attn_mask = _expand_mask_bloom(attention_mask, tgt_length=src_length)
+        combined_attention_mask = (
+            expanded_attn_mask
+            if combined_attention_mask is None
+            else expanded_attn_mask | combined_attention_mask
+        )
+        return combined_attention_mask
+    # Replacement ALiBi builder. Returns a tensor of shape
+    # (batch_size * num_heads, query_length, key_length) cast to `dtype`,
+    # holding per-head slopes multiplied by the negated absolute offset between
+    # each query and key position.
+    def _build_alibi_tensor(
+        self: BloomModel,
+        batch_size: int,
+        query_length: int,
+        key_length: int,
+        dtype: torch.dtype,
+        device: torch.device,
+    ) -> torch.Tensor:
+        num_heads = self.config.n_head
+        # Slopes are built for the largest power of two <= num_heads first.
+        closest_power_of_2 = 2 ** math.floor(math.log2(num_heads))
+        base = torch.tensor(
+            2 ** (-(2 ** (-(math.log2(closest_power_of_2) - 3)))),
+            device=device,
+            dtype=torch.float32,
+        )
+        powers = torch.arange(
+            1, 1 + closest_power_of_2, device=device, dtype=torch.int32
+        )
+        slopes = torch.pow(base, powers)
+        # Any remaining heads get odd powers of a second base.
+        if closest_power_of_2 != num_heads:
+            extra_base = torch.tensor(
+                2 ** (-(2 ** (-(math.log2(2 * closest_power_of_2) - 3)))),
+                device=device,
+                dtype=torch.float32,
+            )
+            num_remaining_heads = min(
+                closest_power_of_2, num_heads - closest_power_of_2
+            )
+            extra_powers = torch.arange(
+                1, 1 + 2 * num_remaining_heads, 2, device=device, dtype=torch.int32
+            )
+            slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0)
+        qa = torch.arange(query_length, device=device, dtype=torch.int32).view(-1, 1)
+        ka = torch.arange(key_length, device=device, dtype=torch.int32).view(1, -1)
+        # Offset queries by (key_length - query_length) so they line up with
+        # the end of the key axis; the absolute value makes the penalty
+        # symmetric around each query position.
+        diffs = qa - ka + key_length - query_length
+        diffs = -diffs.abs()
+        alibi = slopes.view(1, num_heads, 1, 1) * diffs.view(
+            1, 1, query_length, key_length
+        )
+        alibi = alibi.expand(batch_size, -1, -1, -1).reshape(
+            -1, query_length, key_length
+        )
+        return alibi.to(dtype)
+    KeyValueT = Tuple[torch.Tensor, torch.Tensor]
+    # Replacement for the inner transformer's forward. It accepts an extra
+    # `bidirectional_mask` and passes it to `_prepare_attn_mask`.
+    def forward(
+        self: BloomModel,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Tuple[KeyValueT, ...]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        bidirectional_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **deprecated_arguments,
+    ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]:
+        # `position_ids` is accepted with a warning; any other extra keyword is an error.
+        if deprecated_arguments.pop("position_ids", False) is not False:
+            warnings.warn(
+                "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. "
+                + "You can safely ignore passing `position_ids`.",
+                FutureWarning,
+            )
+        if len(deprecated_arguments) > 0:
+            raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")
+        # Fall back to the model config for every flag left as None.
+        output_attentions = (
+            output_attentions
+            if output_attentions is not None
+            else self.config.output_attentions
+        )
+        output_hidden_states = (
+            output_hidden_states
+            if output_hidden_states is not None
+            else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+        # Exactly one of `input_ids` / `inputs_embeds` must be supplied.
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at the same time"
+            )
+        elif input_ids is not None:
+            (batch_size, seq_length) = input_ids.shape
+        elif inputs_embeds is not None:
+            (batch_size, seq_length, _) = inputs_embeds.shape
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+        if past_key_values is None:
+            past_key_values = tuple([None] * len(self.h))
+        head_mask = self.get_head_mask(head_mask, self.config.n_layer)
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+        hidden_states = self.word_embeddings_layernorm(inputs_embeds)
+        # Output accumulators stay None when the corresponding flag is off.
+        presents = () if use_cache else None
+        all_self_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+        seq_length_with_past = seq_length
+        past_key_values_length = 0
+        # The cached length is read from dim 2 of the first layer's first cached tensor.
+        if past_key_values[0] is not None:
+            tmp = past_key_values[0][0]
+            past_key_values_length = tmp.shape[2]
+            seq_length_with_past = seq_length_with_past + past_key_values_length
+        if attention_mask is None:
+            attention_mask = torch.ones(
+                (batch_size, seq_length_with_past), device=hidden_states.device
+            )
+        else:
+            attention_mask = attention_mask.to(hidden_states.device)
+        alibi = self._build_alibi_tensor(
+            batch_size=batch_size,
+            query_length=seq_length,
+            key_length=seq_length_with_past,
+            dtype=hidden_states.dtype,
+            device=hidden_states.device,
+        )
+        causal_mask = self._prepare_attn_mask(
+            attention_mask,
+            bidirectional_mask,
+            input_shape=(batch_size, seq_length),
+            past_key_values_length=past_key_values_length,
+        )
+        for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
+            if output_hidden_states:
+                hst = (hidden_states,)
+                all_hidden_states = all_hidden_states + hst
+            if self.gradient_checkpointing and self.training:
+                if use_cache:
+                    logger.warning(
+                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                    )
+                    use_cache = False
+                # Closure that binds the keyword flags so the checkpointed
+                # call only takes positional tensors. `layer_past` is not
+                # passed on this path.
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(
+                            *inputs,
+                            use_cache=use_cache,
+                            output_attentions=output_attentions,
+                        )
+                    return custom_forward
+                outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    hidden_states,
+                    alibi,
+                    causal_mask,
+                    head_mask[i],
+                )
+            else:
+                outputs = block(
+                    hidden_states,
+                    layer_past=layer_past,
+                    attention_mask=causal_mask,
+                    head_mask=head_mask[i],
+                    use_cache=use_cache,
+                    output_attentions=output_attentions,
+                    alibi=alibi,
+                )
+            hidden_states = outputs[0]
+            if use_cache is True:
+                presents = presents + (outputs[1],)
+            if output_attentions:
+                # Attention weights sit after the cache entry when caching is on.
+                oa = (outputs[2 if use_cache else 1],)
+                all_self_attentions = all_self_attentions + oa
+        hidden_states = self.ln_f(hidden_states)
+        if output_hidden_states:
+            hst = (hidden_states,)
+            all_hidden_states = all_hidden_states + hst
+        # Tuple output drops every entry that is None.
+        if not return_dict:
+            return tuple(
+                (
+                    v
+                    for v in [
+                        hidden_states,
+                        presents,
+                        all_hidden_states,
+                        all_self_attentions,
+                    ]
+                    if v is not None
+                )
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=presents,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+    # Bind the three replacements onto the inner transformer instance.
+    setattr(
+        model.transformer,
+        "_prepare_attn_mask",
+        MethodType(_prepare_attn_mask, model.transformer),
+    )
+    setattr(
+        model.transformer,
+        "_build_alibi_tensor",
+        MethodType(_build_alibi_tensor, model.transformer),
+    )
+    setattr(model.transformer, "forward", MethodType(forward, model.transformer))
+    # Same alias as declared earlier in this function; re-declared here.
+    KeyValueT = Tuple[torch.Tensor, torch.Tensor]
+    # From here on `forward` names the LM-head wrapper; the transformer-level
+    # forward above has already been bound.
+    def forward(
+        self: BloomForCausalLM,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Tuple[KeyValueT, ...]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        bidirectional_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **deprecated_arguments,
+    ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
+        """Replacement forward method for BloomCausalLM."""
+        if deprecated_arguments.pop("position_ids", False) is not False:
+            warnings.warn(
+                "`position_ids` have no functionality in BLOOM and will be removed "
+                + "in v5.0.0. You can safely ignore passing `position_ids`.",
+                FutureWarning,
+            )
+        if len(deprecated_arguments) > 0:
+            raise ValueError(f"Got unexpected arguments: {deprecated_arguments}")
+        return_dict = (
+            return_dict if return_dict is not None else self.config.use_return_dict
+        )
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            bidirectional_mask=bidirectional_mask,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        lm_logits = self.lm_head(hidden_states)
+        loss = None
+        if labels is not None:
+            # Next-token loss: logits at position t are scored against the label at t + 1.
+            shift_logits = lm_logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            (batch_size, seq_length, vocab_size) = shift_logits.shape
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(
+                shift_logits.view(batch_size * seq_length, vocab_size),
+                shift_labels.view(batch_size * seq_length),
+            )
+        if not return_dict:
+            output = (lm_logits,) + transformer_outputs[1:]
+            return (loss,) + output if loss is not None else output
+        return CausalLMOutputWithCrossAttentions(
+            loss=loss,
+            logits=lm_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+    # Generation hook. With a cache present only the last token is fed and no
+    # bidirectional mask is used; otherwise every input position is marked
+    # bidirectional.
+    def prepare_inputs_for_generation(
+        self: BloomForCausalLM,
+        input_ids: torch.LongTensor,
+        past: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> dict:
+        if past:
+            input_ids = input_ids[:, -1].unsqueeze(-1)
+            bidirectional_mask = None
+            # NOTE(review): a leading cache dim equal to the batch size is
+            # presumably the standard cache layout, converted here to BLOOM's - confirm.
+            if past[0][0].shape[0] == input_ids.shape[0]:
+                past = self._convert_to_bloom_cache(past)
+        else:
+            bidirectional_mask = torch.ones_like(input_ids)
+        return {
+            "input_ids": input_ids,
+            "past_key_values": past,
+            "use_cache": True,
+            "attention_mask": attention_mask,
+            "bidirectional_mask": bidirectional_mask,
+        }
+    # Bind the LM-level replacements and mark the model as converted.
+    setattr(model, "forward", MethodType(forward, model))
+    setattr(
+        model,
+        "prepare_inputs_for_generation",
+        MethodType(prepare_inputs_for_generation, model),
+    )
+    setattr(model, "_prefix_lm_converted", True)
+    return model
+def _convert_opt_causal_lm_to_prefix_lm(model: OPTForCausalLM) -> OPTForCausalLM:
+    """Converts an OPT Causal LM to a Prefix LM.
+    Supported HuggingFace model classes:
+        - `OPTForCausalLM`
+    See `convert_hf_causal_lm_to_prefix_lm` for more details.
+    """
+    if hasattr(model, "_prefix_lm_converted"):
+        return model
+    assert isinstance(model, OPTForCausalLM)
+    assert (
+        model.config.add_cross_attention == False
+    ), "Only supports OPT decoder-only models"
+    setattr(model, "_original_forward", getattr(model, "forward"))
+    setattr(model, "_original_generate", getattr(model, "generate"))
+    model.model.decoder.bidirectional_mask = None
+    def _prepare_decoder_attention_mask(
+        self, attention_mask, input_shape, inputs_embeds, past_key_values_length
+    ):
+        """Build the decoder attention mask, taking `self.bidirectional_mask` into account.
+        Replacement for the decoder's `_prepare_decoder_attention_mask`; it is bound to
+        `model.model.decoder` right after this definition.
+        Returns:
+            The combined mask, or None when the input has a single position and no
+            `attention_mask` is given.
+        """
+        combined_attention_mask = None
+        if input_shape[-1] > 1:  # only multi-position inputs get a causal/bidirectional mask
+            if self.bidirectional_mask == "g":  # "g" is the sentinel set by the wrapped `generate`
+                (bsz, src_length) = input_shape
+                combined_attention_mask = torch.zeros(  # all-zero mask; presumably additive, i.e. nothing is masked - TODO confirm
+                    (bsz, 1, src_length, src_length + past_key_values_length),
+                    dtype=inputs_embeds.dtype,
+                    device=inputs_embeds.device,
+                )
+            else:
+                combined_attention_mask = _make_causal_mask_opt(
+                    input_shape,
+                    inputs_embeds.dtype,
+                    past_key_values_length=past_key_values_length,
+                ).to(inputs_embeds.device)
+                if self.bidirectional_mask is not None:
+                    assert attention_mask.shape == self.bidirectional_mask.shape
+                    expanded_bidirectional_mask = _expand_mask_opt(
+                        self.bidirectional_mask,
+                        inputs_embeds.dtype,
+                        tgt_len=input_shape[-1],
+                    ).to(inputs_embeds.device)
+                    combined_attention_mask = torch.maximum(  # elementwise max of the expanded prefix mask and the causal mask
+                        expanded_bidirectional_mask, combined_attention_mask
+                    )
+        if attention_mask is not None:
+            expanded_attn_mask = _expand_mask_opt(
+                attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+            ).to(inputs_embeds.device)
+            combined_attention_mask = (  # the caller-supplied mask is folded in by addition
+                expanded_attn_mask
+                if combined_attention_mask is None
+                else expanded_attn_mask + combined_attention_mask
+            )
+        return combined_attention_mask
+    setattr(
+        model.model.decoder,
+        "_prepare_decoder_attention_mask",
+        MethodType(_prepare_decoder_attention_mask, model.model.decoder),
+    )
+    def forward(
+        self: OPTForCausalLM,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        bidirectional_mask: Optional[torch.ByteTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        def call_og_forward():
+            return self._original_forward(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                past_key_values=past_key_values,
+                inputs_embeds=inputs_embeds,
+                labels=labels,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        if bidirectional_mask is None:
+            return call_og_forward()
+        self.model.decoder.bidirectional_mask = bidirectional_mask
+        try:
+            outputs = call_og_forward()
+        except:
+            self.model.decoder.bidirectional_mask = None
+            raise
+        self.model.decoder.bidirectional_mask = None
+        return outputs
+    def generate(self: OPTForCausalLM, *args: tuple, **kwargs: Dict[str, Any]):
+        """Wraps original generate to enable PrefixLM-style attention."""
+        self.model.decoder.bidirectional_mask = "g"
+        try:
+            output = self._original_generate(*args, **kwargs)
+        except:
+            self.model.decoder.bidirectional_mask = None
+            raise
+        self.model.decoder.bidirectional_mask = None
+        return output
+    setattr(model, "forward", MethodType(forward, model))
+    setattr(model, "generate", MethodType(generate, model))
+    setattr(model, "_prefix_lm_converted", True)
+    return model
+# Every model class accepted by `convert_hf_causal_lm_to_prefix_lm`; also printed in its error message.
+_SUPPORTED_HF_MODELS = _SUPPORTED_GPT_MODELS + (BloomForCausalLM, OPTForCausalLM)
+# Type alias used to annotate the converter's argument and return value.
+CAUSAL_LM_TYPES = Union[
+    GPT2LMHeadModel,
+    GPTJForCausalLM,
+    GPTNeoForCausalLM,
+    GPTNeoXForCausalLM,
+    BloomForCausalLM,
+    OPTForCausalLM,
+]
+def convert_hf_causal_lm_to_prefix_lm(model: CAUSAL_LM_TYPES) -> CAUSAL_LM_TYPES:
+    """Converts a HuggingFace Causal LM to a Prefix LM.
+    Supported HuggingFace model classes:
+        - `GPT2LMHeadModel`
+        - `GPTNeoForCausalLM`
+        - `GPTNeoXForCausalLM`
+        - `GPTJForCausalLM`
+        - `BloomForCausalLM`
+        - `OPTForCausalLM`
+    Conversion to a Prefix LM is done by modifying the `forward` method, and possibly also the
+    `generate` method and/or select underlying methods depending on the model class.
+    These changes preserve the model API, but add a new input to `forward`: "bidirectional_mask".
+    Notes on training:
+        To actually train the converted model as a Prefix LM, training batches will need to indicate
+        the prefix/target structure by including `bidirectional_mask` as part of the batch inputs.
+        **This is not a standard input and requires custom layers either within or after your dataloader.**
+        In addition to adding `bidirectional_mask` to the batch, this custom code should modify `labels`
+        such that `batch['labels'][batch['bidirectional_mask'] == 1] == -100`.
+        That is, the prefix portion of the sequence should not generate any loss. Loss should only be
+        generated by the target portion of the sequence.
+    Notes on `GPTNeoForCausalLM`:
+        To simplify the implementation, "global" and "local" attention layers are handled differently.
+        For "global" layers, we handle conversion as described above. For "local" layers, which use a
+        causal attention mask within a restricted local window, we do not alter the masking.
+    Notes on `forward` method conversion:
+        After conversion, the `forward` method will handle a new input, `bidirectional_mask`,
+        which should be a [batch_size, seq_length] byte tensor, where 1 indicates token positions
+        belonging to the prefix (prefix tokens can attend to one another bidirectionally), and
+        0 indicates token positions belonging to the target.
+        The new `forward` method will incorporate `bidirectional_mask` (if supplied) into the existing
+        causal mask, call the original `forward` method, and (if the causal mask is a buffer) reset
+        the causal masks before returning the result.
+    Notes on `generate` method conversion:
+        After conversion, the `generate` method will have the same signature but will internally
+        convert all causal masks to be purely bidirectional, call the original `generate` method, and
+        (where appropriate) reset the causal masks before returning the result.
+        This works thanks to the logic of the HuggingFace `generate` API, which first encodes the token
+        "prompt" passed to `generate` (which is treated as the prefix) and then sequentially generates
+        each new token. Encodings are cached as generation happens, so all prefix tokens can attend to one
+        another (as expected in a Prefix LM) and generated tokens can only attend to prefix tokens and
+        previously-generated tokens (also as expected in a Prefix LM).
+    To preserve the API, the original methods are renamed to `_original_forward` and
+    `_original_generate`, and replaced with new `forward` and `generate` methods that wrap
+    them, respectively. Although implementation details vary by model class.
+    """
+    if isinstance(model, _SUPPORTED_GPT_MODELS):
+        return _convert_gpt_causal_lm_to_prefix_lm(model)
+    elif isinstance(model, BloomForCausalLM):
+        return _convert_bloom_causal_lm_to_prefix_lm(model)
+    elif isinstance(model, OPTForCausalLM):
+        return _convert_opt_causal_lm_to_prefix_lm(model)
+    else:
+        raise TypeError(
+            f"Cannot convert model to Prefix LM. "
+            + f"Model does not belong to set of supported HF models:"
+            + f"\n{_SUPPORTED_HF_MODELS}"
+        )
+def add_bidirectional_mask_if_missing(batch: Dict[str, Any]):
+    """Attempts to add bidirectional_mask to batch if missing.
+    Raises:
+        KeyError if bidirectional_mask is missing and can't be inferred
+    """
+    if "bidirectional_mask" not in batch:
+        if batch.get("mode", None) == "icl_task":
+            batch["bidirectional_mask"] = batch["attention_mask"].clone()
+            for i, continuation_indices in enumerate(batch["continuation_indices"]):
+                batch["bidirectional_mask"][i, continuation_indices] = 0
+        elif "labels" in batch and "attention_mask" in batch:
+            batch["bidirectional_mask"] = torch.logical_and(
+                torch.eq(batch["attention_mask"], 1), torch.eq(batch["labels"], -100)
+            ).type_as(batch["attention_mask"])
+        else:
+            raise KeyError(
+                "No bidirectional_mask in batch and not sure how to construct one."
+            )

model/llava/model/language_model/mpt/meta_init_context.py ADDED Viewed

	@@ -0,0 +1,111 @@

+from contextlib import contextmanager
+import torch
+import torch.nn as nn
+@contextmanager
+def init_empty_weights(include_buffers: bool = False):
+    """Meta initialization context manager.
+    A context manager under which models are initialized with all parameters
+    on the meta device, therefore creating an empty model. Useful when just
+    initializing the model would blow the available RAM.
+    Args:
+        include_buffers (`bool`, *optional*, defaults to `False`): Whether or
+            not to also put all buffers on the meta device while initializing.
+    Example:
+    ```python
+    import torch.nn as nn
+    # Initialize a model with 100 billions parameters in no time and without using any RAM.
+    with init_empty_weights():
+        tst = nn.Sequential(*[nn.Linear(10000, 10000) for _ in range(1000)])
+    ```
+    <Tip warning={true}>
+    Any model created under this context manager has no weights. As such you can't do something like
+    `model.to(some_device)` with it. To load weights inside your empty model, see [`load_checkpoint_and_dispatch`].
+    </Tip>
+    """
+    with init_on_device(torch.device("meta"), include_buffers=include_buffers) as f:
+        yield f
+@contextmanager
+def init_on_device(device: torch.device, include_buffers: bool = False):
+    """Device initialization context manager.
+    A context manager under which models are initialized with all parameters
+    on the specified device.
+    Args:
+        device (`torch.device`): Device to initialize all parameters on.
+        include_buffers (`bool`, *optional*, defaults to `False`): Whether or
+            not to also put all buffers on the meta device while initializing.
+    Example:
+    ```python
+    import torch.nn as nn
+    with init_on_device(device=torch.device("cuda")):
+        tst = nn.Liner(100, 100)  # on `cuda` device
+    ```
+    """
+    old_register_parameter = nn.Module.register_parameter
+    if include_buffers:
+        old_register_buffer = nn.Module.register_buffer
+    def register_empty_parameter(module, name, param):
+        old_register_parameter(module, name, param)
+        if param is not None:
+            param_cls = type(module._parameters[name])
+            kwargs = module._parameters[name].__dict__
+            module._parameters[name] = param_cls(
+                module._parameters[name].to(device), **kwargs
+            )
+    def register_empty_buffer(module, name, buffer):
+        old_register_buffer(module, name, buffer)
+        if buffer is not None:
+            module._buffers[name] = module._buffers[name].to(device)
+    if include_buffers:
+        tensor_constructors_to_patch = {
+            torch_function_name: getattr(torch, torch_function_name)
+            for torch_function_name in ["empty", "zeros", "ones", "full"]
+        }
+    else:
+        tensor_constructors_to_patch = {}
+    def patch_tensor_constructor(fn):
+        def wrapper(*args, **kwargs):
+            kwargs["device"] = device
+            return fn(*args, **kwargs)
+        return wrapper
+    try:
+        nn.Module.register_parameter = register_empty_parameter
+        if include_buffers:
+            nn.Module.register_buffer = register_empty_buffer
+        for torch_function_name in tensor_constructors_to_patch.keys():
+            setattr(
+                torch,
+                torch_function_name,
+                patch_tensor_constructor(getattr(torch, torch_function_name)),
+            )
+        yield
+    finally:
+        nn.Module.register_parameter = old_register_parameter
+        if include_buffers:
+            nn.Module.register_buffer = old_register_buffer
+        for (
+            torch_function_name,
+            old_torch_function,
+        ) in tensor_constructors_to_patch.items():
+            setattr(torch, torch_function_name, old_torch_function)

model/llava/model/language_model/mpt/modeling_mpt.py ADDED Viewed

	@@ -0,0 +1,538 @@

+"""A simple, flexible implementation of a GPT model.
+Inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
+"""
+import math
+import warnings
+from typing import List, Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import (PreTrainedModel, PreTrainedTokenizer,
+                          PreTrainedTokenizerFast)
+from transformers.modeling_outputs import (BaseModelOutputWithPast,
+                                           CausalLMOutputWithPast)
+from .adapt_tokenizer import AutoTokenizerForMOD, adapt_tokenizer_for_denoising
+from .attention import attn_bias_shape, build_attn_bias
+from .blocks import MPTBlock
+from .configuration_mpt import MPTConfig
+from .custom_embedding import SharedEmbedding
+from .hf_prefixlm_converter import (add_bidirectional_mask_if_missing,
+                                    convert_hf_causal_lm_to_prefix_lm)
+from .meta_init_context import init_empty_weights
+from .norm import NORM_CLASS_REGISTRY
+from .param_init_fns import MODEL_INIT_REGISTRY, generic_param_init_fn_
+try:
+    from .flash_attn_triton import flash_attn_func
+except:
+    pass
+Tokenizer = Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
+class MPTPreTrainedModel(PreTrainedModel):
+    """Common base class of the MPT models in this module, bound to `MPTConfig`."""
+    # Configuration class associated with these models.
+    config_class = MPTConfig
+    # NOTE(review): presumably the attribute prefix HF uses to locate the base model - confirm against PreTrainedModel docs.
+    base_model_prefix = "model"
+    # NOTE(review): presumably module classes that must stay on one device when sharding - confirm.
+    _no_split_modules = ["MPTBlock"]
+class MPTModel(MPTPreTrainedModel):
+    def __init__(self, config: MPTConfig):
+        config._validate_config()
+        super().__init__(config)
+        self.attn_impl = config.attn_config["attn_impl"]
+        self.prefix_lm = config.attn_config["prefix_lm"]
+        self.attn_uses_sequence_id = config.attn_config["attn_uses_sequence_id"]
+        self.alibi = config.attn_config["alibi"]
+        self.alibi_bias_max = config.attn_config["alibi_bias_max"]
+        if config.init_device == "mixed":
+            if dist.get_local_rank() == 0:
+                config.init_device = "cpu"
+            else:
+                config.init_device = "meta"
+        if config.norm_type.lower() not in NORM_CLASS_REGISTRY.keys():
+            norm_options = " | ".join(NORM_CLASS_REGISTRY.keys())
+            raise NotImplementedError(
+                f"Requested norm type ({config.norm_type}) is not implemented within this repo (Options: {norm_options})."
+            )
+        norm_class = NORM_CLASS_REGISTRY[config.norm_type.lower()]
+        self.embedding_fraction = config.embedding_fraction
+        self.wte = SharedEmbedding(
+            config.vocab_size, config.d_model, device=config.init_device
+        )
+        if not self.alibi:
+            self.wpe = torch.nn.Embedding(
+                config.max_seq_len, config.d_model, device=config.init_device
+            )
+        self.emb_drop = nn.Dropout(config.emb_pdrop)
+        self.blocks = nn.ModuleList(
+            [
+                MPTBlock(device=config.init_device, **config.to_dict())
+                for _ in range(config.n_layers)
+            ]
+        )
+        self.norm_f = norm_class(config.d_model, device=config.init_device)
+        if config.init_device != "meta":
+            print(
+                f'You are using config.init_device={config.init_device!r}, but you can also use config.init_device="meta" with Composer + FSDP for fast initialization.'
+            )
+            self.apply(self.param_init_fn)
+        self.is_causal = not self.prefix_lm
+        self._attn_bias_initialized = False
+        self.attn_bias = None
+        self.attn_bias_shape = attn_bias_shape(
+            self.attn_impl,
+            config.n_heads,
+            config.max_seq_len,
+            self.alibi,
+            prefix_lm=self.prefix_lm,
+            causal=self.is_causal,
+            use_sequence_id=self.attn_uses_sequence_id,
+        )
+        if config.no_bias:
+            for module in self.modules():
+                if hasattr(module, "bias") and isinstance(module.bias, nn.Parameter):
+                    if config.verbose:
+                        warnings.warn(f"Removing bias ({module.bias}) from {module}.")
+                    module.register_parameter("bias", None)
+        if config.verbose and config.verbose > 2:
+            print(self)
+        if "verbose" not in self.config.init_config:
+            self.config.init_config["verbose"] = self.config.verbose
+        if self.config.init_config["verbose"] > 1:
+            init_fn_name = self.config.init_config["name"]
+            warnings.warn(f"Using {init_fn_name} initialization.")
+        self.gradient_checkpointing = False
+    def get_input_embeddings(self):
+        return self.wte
+    def set_input_embeddings(self, value):
+        self.wte = value
+    @torch.no_grad()
+    def _attn_bias(
+        self,
+        device,
+        dtype,
+        attention_mask: Optional[torch.ByteTensor] = None,
+        prefix_mask: Optional[torch.ByteTensor] = None,
+        sequence_id: Optional[torch.LongTensor] = None,
+    ):
+        if not self._attn_bias_initialized:
+            if self.attn_bias_shape:
+                self.attn_bias = torch.zeros(
+                    self.attn_bias_shape, device=device, dtype=dtype
+                )
+                self.attn_bias = build_attn_bias(
+                    self.attn_impl,
+                    self.attn_bias,
+                    self.config.n_heads,
+                    self.config.max_seq_len,
+                    causal=self.is_causal,
+                    alibi=self.alibi,
+                    alibi_bias_max=self.alibi_bias_max,
+                )
+            self._attn_bias_initialized = True
+        if self.attn_impl == "flash":
+            return (self.attn_bias, attention_mask)
+        if self.attn_bias is not None:
+            self.attn_bias = self.attn_bias.to(dtype=dtype, device=device)
+        attn_bias = self.attn_bias
+        if self.prefix_lm:
+            assert isinstance(attn_bias, torch.Tensor)
+            assert isinstance(prefix_mask, torch.Tensor)
+            attn_bias = self._apply_prefix_mask(attn_bias, prefix_mask)
+        if self.attn_uses_sequence_id and sequence_id is not None:
+            assert isinstance(attn_bias, torch.Tensor)
+            attn_bias = self._apply_sequence_id(attn_bias, sequence_id)
+        if attention_mask is not None:
+            s_k = attention_mask.shape[-1]
+            if attn_bias is None:
+                attn_bias = torch.zeros((1, 1, 1, s_k), device=device, dtype=dtype)
+            else:
+                _s_k = max(0, attn_bias.size(-1) - s_k)
+                attn_bias = attn_bias[:, :, :, _s_k:]
+            if prefix_mask is not None and attention_mask.shape != prefix_mask.shape:
+                raise ValueError(
+                    f"attention_mask shape={attention_mask.shape} "
+                    + f"and prefix_mask shape={prefix_mask.shape} are not equal."
+                )
+            min_val = torch.finfo(attn_bias.dtype).min
+            attn_bias = attn_bias.masked_fill(
+                ~attention_mask.view(-1, 1, 1, s_k), min_val
+            )
+        return (attn_bias, None)
+    def _apply_prefix_mask(self, attn_bias: torch.Tensor, prefix_mask: torch.Tensor):
+        (s_k, s_q) = attn_bias.shape[-2:]
+        if s_k != self.config.max_seq_len or s_q != self.config.max_seq_len:
+            raise ValueError(
+                "attn_bias does not match the expected shape. "
+                + f"The last two dimensions should both be {self.config.max_length} "
+                + f"but are {s_k} and {s_q}."
+            )
+        seq_len = prefix_mask.shape[-1]
+        if seq_len > self.config.max_seq_len:
+            raise ValueError(
+                f"prefix_mask sequence length cannot exceed max_seq_len={self.config.max_seq_len}"
+            )
+        attn_bias = attn_bias[..., :seq_len, :seq_len]
+        causal = torch.tril(
+            torch.ones((seq_len, seq_len), dtype=torch.bool, device=prefix_mask.device)
+        ).view(1, 1, seq_len, seq_len)
+        prefix = prefix_mask.view(-1, 1, 1, seq_len)
+        cannot_attend = ~torch.logical_or(causal, prefix.bool())
+        min_val = torch.finfo(attn_bias.dtype).min
+        attn_bias = attn_bias.masked_fill(cannot_attend, min_val)
+        return attn_bias
+    def _apply_sequence_id(
+        self, attn_bias: torch.Tensor, sequence_id: torch.LongTensor
+    ):
+        seq_len = sequence_id.shape[-1]
+        if seq_len > self.config.max_seq_len:
+            raise ValueError(
+                f"sequence_id sequence length cannot exceed max_seq_len={self.config.max_seq_len}"
+            )
+        attn_bias = attn_bias[..., :seq_len, :seq_len]
+        cannot_attend = torch.logical_not(
+            torch.eq(sequence_id.view(-1, seq_len, 1), sequence_id.view(-1, 1, seq_len))
+        ).unsqueeze(1)
+        min_val = torch.finfo(attn_bias.dtype).min
+        attn_bias = attn_bias.masked_fill(cannot_attend, min_val)
+        return attn_bias
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None,
+        attention_mask: Optional[torch.ByteTensor] = None,
+        prefix_mask: Optional[torch.ByteTensor] = None,
+        sequence_id: Optional[torch.LongTensor] = None,
+        return_dict: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        use_cache: Optional[bool] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+    ):
+        return_dict = (
+            return_dict if return_dict is not None else self.config.return_dict
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        if attention_mask is not None:
+            attention_mask = attention_mask.bool()
+        if prefix_mask is not None:
+            prefix_mask = prefix_mask.bool()
+        if not return_dict:
+            raise NotImplementedError(
+                "return_dict False is not implemented yet for MPT"
+            )
+        if output_attentions:
+            if self.attn_impl != "torch":
+                raise NotImplementedError(
+                    "output_attentions is not implemented for MPT when using attn_impl `flash` or `triton`."
+                )
+        if (
+            attention_mask is not None
+            and attention_mask[:, 0].sum() != attention_mask.shape[0]
+            and self.training
+        ):
+            raise NotImplementedError(
+                "MPT does not support training with left padding."
+            )
+        if self.prefix_lm and prefix_mask is None:
+            raise ValueError(
+                "prefix_mask is a required argument when MPT is configured with prefix_lm=True."
+            )
+        if self.training:
+            if self.attn_uses_sequence_id and sequence_id is None:
+                raise ValueError(
+                    "sequence_id is a required argument when MPT is configured with attn_uses_sequence_id=True "
+                    + "and the model is in train mode."
+                )
+            elif self.attn_uses_sequence_id is False and sequence_id is not None:
+                warnings.warn(
+                    "MPT received non-None input for `sequence_id` but is configured with attn_uses_sequence_id=False. "
+                    + "This input will be ignored. If you want the model to use `sequence_id`, set attn_uses_sequence_id to True."
+                )
+        if input_ids is not None:
+            S = input_ids.size(1)
+            assert (
+                S <= self.config.max_seq_len
+            ), f"Cannot forward input with seq_len={S}, this model only supports seq_len<={self.config.max_seq_len}"
+            tok_emb = self.wte(input_ids)
+        else:
+            assert inputs_embeds is not None
+            assert (
+                self.alibi
+            ), "inputs_embeds is not implemented for MPT unless for alibi."
+            S = inputs_embeds.size(1)
+            tok_emb = inputs_embeds
+        if self.alibi:
+            x = tok_emb
+        else:
+            past_position = 0
+            if past_key_values is not None:
+                if len(past_key_values) != self.config.n_layers:
+                    raise ValueError(
+                        f"past_key_values must provide a past_key_value for each attention "
+                        + f"layer in the network (len(past_key_values)={len(past_key_values)!r}; self.config.n_layers={self.config.n_layers!r})."
+                    )
+                past_position = past_key_values[0][0].size(1)
+                if self.attn_impl == "torch":
+                    past_position = past_key_values[0][0].size(3)
+            if S + past_position > self.config.max_seq_len:
+                raise ValueError(
+                    f"Cannot forward input with past sequence length {past_position} and current sequence length {S + 1}, this model only supports total sequence length <= {self.config.max_seq_len}."
+                )
+            pos = torch.arange(
+                past_position,
+                S + past_position,
+                dtype=torch.long,
+                device=input_ids.device,
+            ).unsqueeze(0)
+            if attention_mask is not None:
+                pos = torch.clamp(
+                    pos
+                    - torch.cumsum((~attention_mask).to(torch.int32), dim=1)[
+                        :, past_position:
+                    ],
+                    min=0,
+                )
+            pos_emb = self.wpe(pos)
+            x = tok_emb + pos_emb
+        if self.embedding_fraction == 1:
+            x = self.emb_drop(x)
+        else:
+            x_shrunk = x * self.embedding_fraction + x.detach() * (
+                1 - self.embedding_fraction
+            )
+            assert isinstance(self.emb_drop, nn.Module)
+            x = self.emb_drop(x_shrunk)
+        (attn_bias, attention_mask) = self._attn_bias(
+            device=x.device,
+            dtype=torch.float32,
+            attention_mask=attention_mask,
+            prefix_mask=prefix_mask,
+            sequence_id=sequence_id,
+        )
+        if use_cache and past_key_values is None:
+            past_key_values = [() for _ in range(self.config.n_layers)]
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        for b_idx, block in enumerate(self.blocks):
+            if output_hidden_states:
+                assert all_hidden_states is not None
+                all_hidden_states = all_hidden_states + (x,)
+            past_key_value = (
+                past_key_values[b_idx] if past_key_values is not None else None
+            )
+            if self.gradient_checkpointing and self.training:
+                (x, attn_weights, past_key_value) = torch.utils.checkpoint.checkpoint(
+                    block, x, past_key_value, attn_bias, attention_mask, self.is_causal
+                )
+            else:
+                (x, attn_weights, past_key_value) = block(
+                    x,
+                    past_key_value=past_key_value,
+                    attn_bias=attn_bias,
+                    attention_mask=attention_mask,
+                    is_causal=self.is_causal,
+                )
+            if past_key_values is not None:
+                past_key_values[b_idx] = past_key_value
+            if output_attentions:
+                assert all_self_attns is not None
+                all_self_attns = all_self_attns + (attn_weights,)
+        x = self.norm_f(x)
+        if output_hidden_states:
+            assert all_hidden_states is not None
+            all_hidden_states = all_hidden_states + (x,)
+        return BaseModelOutputWithPast(
+            last_hidden_state=x,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+    def param_init_fn(self, module):
+        init_fn_name = self.config.init_config["name"]
+        MODEL_INIT_REGISTRY[init_fn_name](
+            module=module,
+            n_layers=self.config.n_layers,
+            d_model=self.config.d_model,
+            **self.config.init_config,
+        )
+    def fsdp_wrap_fn(self, module):
+        return isinstance(module, MPTBlock)
+    def activation_checkpointing_fn(self, module):
+        return isinstance(module, MPTBlock)
+class MPTForCausalLM(MPTPreTrainedModel):
+    def __init__(self, config: MPTConfig):
+        super().__init__(config)
+        if not config.tie_word_embeddings:
+            raise ValueError("MPTForCausalLM only supports tied word embeddings")
+        print(f"Instantiating an MPTForCausalLM model from {__file__}")
+        self.transformer = MPTModel(config)
+        for child in self.transformer.children():
+            if isinstance(child, torch.nn.ModuleList):
+                continue
+            if isinstance(child, torch.nn.Module):
+                child._fsdp_wrap = True
+        self.logit_scale = None
+        if config.logit_scale is not None:
+            logit_scale = config.logit_scale
+            if isinstance(logit_scale, str):
+                if logit_scale == "inv_sqrt_d_model":
+                    logit_scale = 1 / math.sqrt(config.d_model)
+                else:
+                    raise ValueError(
+                        f"logit_scale={logit_scale!r} is not recognized as an option; use numeric value or 'inv_sqrt_d_model'."
+                    )
+            self.logit_scale = logit_scale
+    def get_input_embeddings(self):
+        """Return the transformer's token embedding module (`wte`)."""
+        return self.transformer.wte
+    def set_input_embeddings(self, value):
+        """Replace the transformer's token embedding module (`wte`) with `value`."""
+        self.transformer.wte = value
+    def get_output_embeddings(self):
+        """Return the output embedding, which is the same `wte` module used for the inputs."""
+        return self.transformer.wte
+    def set_output_embeddings(self, new_embeddings):
+        """Replace `wte` with `new_embeddings`; this is the same module the input side uses."""
+        self.transformer.wte = new_embeddings
+    def set_decoder(self, decoder):
+        """Replace the wrapped transformer (`self.transformer`) with `decoder`."""
+        self.transformer = decoder
+    def get_decoder(self):
+        """Return the wrapped transformer (`self.transformer`)."""
+        return self.transformer
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        past_key_values: Optional[List[Tuple[torch.FloatTensor]]] = None,
+        attention_mask: Optional[torch.ByteTensor] = None,
+        prefix_mask: Optional[torch.ByteTensor] = None,
+        sequence_id: Optional[torch.LongTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        return_dict: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        use_cache: Optional[bool] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+    ):
+        return_dict = (
+            return_dict if return_dict is not None else self.config.return_dict
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        if inputs_embeds is not None:
+            raise NotImplementedError(
+                "inputs_embeds has to be None (for hf/peft support)."
+            )
+        outputs = self.transformer(
+            input_ids=input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            prefix_mask=prefix_mask,
+            sequence_id=sequence_id,
+            return_dict=return_dict,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            use_cache=use_cache,
+        )
+        logits = self.transformer.wte(
+            outputs.last_hidden_state.to(self.transformer.wte.weight.device), True
+        )
+        if self.logit_scale is not None:
+            if self.logit_scale == 0:
+                warnings.warn(
+                    f"Multiplying logits by self.logit_scale={self.logit_scale!r}. This will produce uniform (uninformative) outputs."
+                )
+            logits *= self.logit_scale
+        loss = None
+        if labels is not None:
+            labels = torch.roll(labels, shifts=-1)
+            labels[:, -1] = -100
+            loss = F.cross_entropy(
+                logits.view(-1, logits.size(-1)), labels.to(logits.device).view(-1)
+            )
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+    def param_init_fn(self, module):
+        init_fn_name = self.config.init_config["name"]
+        MODEL_INIT_REGISTRY[init_fn_name](
+            module=module,
+            n_layers=self.config.n_layers,
+            d_model=self.config.d_model,
+            **self.config.init_config,
+        )
+    def fsdp_wrap_fn(self, module):
+        return isinstance(module, MPTBlock)
+    def activation_checkpointing_fn(self, module):
+        return isinstance(module, MPTBlock)
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs
+    ):
+        if inputs_embeds is not None:
+            raise NotImplementedError("inputs_embeds is not implemented for MPT yet")
+        attention_mask = kwargs["attention_mask"].bool()
+        if attention_mask[:, -1].sum() != attention_mask.shape[0]:
+            raise NotImplementedError(
+                "MPT does not support generation with right padding."
+            )
+        if self.transformer.attn_uses_sequence_id and self.training:
+            sequence_id = torch.zeros_like(input_ids[:1])
+        else:
+            sequence_id = None
+        if past_key_values is not None:
+            input_ids = input_ids[:, -1].unsqueeze(-1)
+        if self.transformer.prefix_lm:
+            prefix_mask = torch.ones_like(attention_mask)
+            if kwargs.get("use_cache") == False:
+                raise NotImplementedError(
+                    "MPT with prefix_lm=True does not support use_cache=False."
+                )
+        else:
+            prefix_mask = None
+        return {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "prefix_mask": prefix_mask,
+            "sequence_id": sequence_id,
+            "past_key_values": past_key_values,
+            "use_cache": kwargs.get("use_cache", True),
+        }
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        """Used by HuggingFace generate when using beam search with kv-caching.
+        See https://github.com/huggingface/transformers/blob/3ec7a47664ebe40c40f4b722f6bb1cd30c3821ec/src/transformers/models/gpt2/modeling_gpt2.py#L1122-L1133
+        for an example in transformers.
+        """
+        reordered_past = []
+        for layer_past in past_key_values:
+            reordered_past += [
+                tuple(
+                    (past_state.index_select(0, beam_idx) for past_state in layer_past)
+                )
+            ]
+        return reordered_past

model/llava/model/language_model/mpt/norm.py ADDED Viewed

	@@ -0,0 +1,106 @@

+import torch
+def _cast_if_autocast_enabled(tensor):
+    if torch.is_autocast_enabled():
+        if tensor.device.type == "cuda":
+            dtype = torch.get_autocast_gpu_dtype()
+        elif tensor.device.type == "cpu":
+            dtype = torch.get_autocast_cpu_dtype()
+        else:
+            raise NotImplementedError()
+        return tensor.to(dtype=dtype)
+    return tensor
+class LPLayerNorm(torch.nn.LayerNorm):
+    def __init__(
+        self,
+        normalized_shape,
+        eps=1e-05,
+        elementwise_affine=True,
+        device=None,
+        dtype=None,
+    ):
+        super().__init__(
+            normalized_shape=normalized_shape,
+            eps=eps,
+            elementwise_affine=elementwise_affine,
+            device=device,
+            dtype=dtype,
+        )
+    def forward(self, x):
+        module_device = x.device
+        downcast_x = _cast_if_autocast_enabled(x)
+        downcast_weight = (
+            _cast_if_autocast_enabled(self.weight)
+            if self.weight is not None
+            else self.weight
+        )
+        downcast_bias = (
+            _cast_if_autocast_enabled(self.bias) if self.bias is not None else self.bias
+        )
+        with torch.autocast(enabled=False, device_type=module_device.type):
+            return torch.nn.functional.layer_norm(
+                downcast_x,
+                self.normalized_shape,
+                downcast_weight,
+                downcast_bias,
+                self.eps,
+            )
+def rms_norm(x, weight=None, eps=1e-05):
+    output = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)
+    if weight is not None:
+        return output * weight
+    return output
+class RMSNorm(torch.nn.Module):
+    def __init__(
+        self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None
+    ):
+        super().__init__()
+        self.eps = eps
+        if weight:
+            self.weight = torch.nn.Parameter(
+                torch.ones(normalized_shape, dtype=dtype, device=device)
+            )
+        else:
+            self.register_parameter("weight", None)
+    def forward(self, x):
+        return rms_norm(x.float(), self.weight, self.eps).to(dtype=x.dtype)
+class LPRMSNorm(RMSNorm):
+    def __init__(
+        self, normalized_shape, eps=1e-05, weight=True, dtype=None, device=None
+    ):
+        super().__init__(
+            normalized_shape=normalized_shape,
+            eps=eps,
+            weight=weight,
+            dtype=dtype,
+            device=device,
+        )
+    def forward(self, x):
+        downcast_x = _cast_if_autocast_enabled(x)
+        downcast_weight = (
+            _cast_if_autocast_enabled(self.weight)
+            if self.weight is not None
+            else self.weight
+        )
+        with torch.autocast(enabled=False, device_type=x.device.type):
+            return rms_norm(downcast_x, downcast_weight, self.eps).to(dtype=x.dtype)
+NORM_CLASS_REGISTRY = {
+    "layernorm": torch.nn.LayerNorm,
+    "low_precision_layernorm": LPLayerNorm,
+    "rmsnorm": RMSNorm,
+    "low_precision_rmsnorm": LPRMSNorm,
+}

model/llava/model/language_model/mpt/param_init_fns.py ADDED Viewed

	@@ -0,0 +1,419 @@

+import math
+import warnings
+from collections.abc import Sequence
+from functools import partial
+from typing import Optional, Tuple, Union
+import torch
+from torch import nn
+from .norm import NORM_CLASS_REGISTRY
+def torch_default_param_init_fn_(module: nn.Module, verbose: int = 0, **kwargs):
+    del kwargs
+    if verbose > 1:
+        warnings.warn(f"Initializing network using module's reset_parameters attribute")
+    if hasattr(module, "reset_parameters"):
+        module.reset_parameters()
+def fused_init_helper_(module: nn.Module, init_fn_):
+    _fused = getattr(module, "_fused", None)
+    if _fused is None:
+        raise RuntimeError(f"Internal logic error")
+    (dim, splits) = _fused
+    splits = (0, *splits, module.weight.size(dim))
+    for s, e in zip(splits[:-1], splits[1:]):
+        slice_indices = [slice(None)] * module.weight.ndim
+        slice_indices[dim] = slice(s, e)
+        init_fn_(module.weight[slice_indices])
+def generic_param_init_fn_(
+    module: nn.Module,
+    init_fn_,
+    n_layers: int,
+    d_model: Optional[int] = None,
+    init_div_is_residual: Union[int, float, str, bool] = True,
+    emb_init_std: Optional[float] = None,
+    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
+    verbose: int = 0,
+    **kwargs,
+):
+    del kwargs
+    if verbose > 1:
+        warnings.warn(f"If model has bias parameters they are initialized to 0.")
+    init_div_is_residual = init_div_is_residual
+    if init_div_is_residual is False:
+        div_is_residual = 1.0
+    elif init_div_is_residual is True:
+        div_is_residual = math.sqrt(2 * n_layers)
+    elif isinstance(init_div_is_residual, float) or isinstance(
+        init_div_is_residual, int
+    ):
+        div_is_residual = init_div_is_residual
+    elif isinstance(init_div_is_residual, str) and init_div_is_residual.isnumeric():
+        div_is_residual = float(init_div_is_residual)
+    else:
+        div_is_residual = 1.0
+        raise ValueError(
+            f"Expected init_div_is_residual to be boolean or numeric, got {init_div_is_residual}"
+        )
+    if init_div_is_residual is not False:
+        if verbose > 1:
+            warnings.warn(
+                f"Initializing _is_residual layers then dividing them by {div_is_residual:.3f}. "
+                + f"Set `init_div_is_residual: false` in init config to disable this."
+            )
+    if isinstance(module, nn.Linear):
+        if hasattr(module, "_fused"):
+            fused_init_helper_(module, init_fn_)
+        else:
+            init_fn_(module.weight)
+        if module.bias is not None:
+            torch.nn.init.zeros_(module.bias)
+        if init_div_is_residual is not False and getattr(module, "_is_residual", False):
+            with torch.no_grad():
+                module.weight.div_(div_is_residual)
+    elif isinstance(module, nn.Embedding):
+        if emb_init_std is not None:
+            std = emb_init_std
+            if std == 0:
+                warnings.warn(f"Embedding layer initialized to 0.")
+            emb_init_fn_ = partial(torch.nn.init.normal_, mean=0.0, std=std)
+            if verbose > 1:
+                warnings.warn(
+                    f"Embedding layer initialized using normal distribution with mean=0 and std={std!r}."
+                )
+        elif emb_init_uniform_lim is not None:
+            lim = emb_init_uniform_lim
+            if isinstance(lim, Sequence):
+                if len(lim) > 2:
+                    raise ValueError(
+                        f"Uniform init requires a min and a max limit. User input: {lim}."
+                    )
+                if lim[0] == lim[1]:
+                    warnings.warn(f"Embedding layer initialized to {lim[0]}.")
+            else:
+                if lim == 0:
+                    warnings.warn(f"Embedding layer initialized to 0.")
+                lim = [-lim, lim]
+            (a, b) = lim
+            emb_init_fn_ = partial(torch.nn.init.uniform_, a=a, b=b)
+            if verbose > 1:
+                warnings.warn(
+                    f"Embedding layer initialized using uniform distribution in range {lim}."
+                )
+        else:
+            emb_init_fn_ = init_fn_
+        emb_init_fn_(module.weight)
+    elif isinstance(module, tuple(set(NORM_CLASS_REGISTRY.values()))):
+        if verbose > 1:
+            warnings.warn(
+                f"Norm weights are set to 1. If norm layer has a bias it is initialized to 0."
+            )
+        if hasattr(module, "weight") and module.weight is not None:
+            torch.nn.init.ones_(module.weight)
+        if hasattr(module, "bias") and module.bias is not None:
+            torch.nn.init.zeros_(module.bias)
+    elif isinstance(module, nn.MultiheadAttention):
+        if module._qkv_same_embed_dim:
+            assert module.in_proj_weight is not None
+            assert (
+                module.q_proj_weight is None
+                and module.k_proj_weight is None
+                and (module.v_proj_weight is None)
+            )
+            assert d_model is not None
+            _d = d_model
+            splits = (0, _d, 2 * _d, 3 * _d)
+            for s, e in zip(splits[:-1], splits[1:]):
+                init_fn_(module.in_proj_weight[s:e])
+        else:
+            assert (
+                module.q_proj_weight is not None
+                and module.k_proj_weight is not None
+                and (module.v_proj_weight is not None)
+            )
+            assert module.in_proj_weight is None
+            init_fn_(module.q_proj_weight)
+            init_fn_(module.k_proj_weight)
+            init_fn_(module.v_proj_weight)
+        if module.in_proj_bias is not None:
+            torch.nn.init.zeros_(module.in_proj_bias)
+        if module.bias_k is not None:
+            torch.nn.init.zeros_(module.bias_k)
+        if module.bias_v is not None:
+            torch.nn.init.zeros_(module.bias_v)
+        init_fn_(module.out_proj.weight)
+        if init_div_is_residual is not False and getattr(
+            module.out_proj, "_is_residual", False
+        ):
+            with torch.no_grad():
+                module.out_proj.weight.div_(div_is_residual)
+        if module.out_proj.bias is not None:
+            torch.nn.init.zeros_(module.out_proj.bias)
+    else:
+        for _ in module.parameters(recurse=False):
+            raise NotImplementedError(
+                f"{module.__class__.__name__} parameters are not initialized by param_init_fn."
+            )
+def _normal_init_(std, mean=0.0):
+    return partial(torch.nn.init.normal_, mean=mean, std=std)
+def _normal_param_init_fn_(
+    module: nn.Module,
+    std: float,
+    n_layers: int,
+    d_model: Optional[int] = None,
+    init_div_is_residual: Union[int, float, str, bool] = True,
+    emb_init_std: Optional[float] = None,
+    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
+    verbose: int = 0,
+    **kwargs,
+):
+    del kwargs
+    init_fn_ = _normal_init_(std=std)
+    if verbose > 1:
+        warnings.warn(f"Using torch.nn.init.normal_ init fn mean=0.0, std={std}")
+    generic_param_init_fn_(
+        module=module,
+        init_fn_=init_fn_,
+        d_model=d_model,
+        n_layers=n_layers,
+        init_div_is_residual=init_div_is_residual,
+        emb_init_std=emb_init_std,
+        emb_init_uniform_lim=emb_init_uniform_lim,
+        verbose=verbose,
+    )
+def baseline_param_init_fn_(
+    module: nn.Module,
+    init_std: float,
+    n_layers: int,
+    d_model: Optional[int] = None,
+    init_div_is_residual: Union[int, float, str, bool] = True,
+    emb_init_std: Optional[float] = None,
+    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
+    verbose: int = 0,
+    **kwargs,
+):
+    del kwargs
+    if init_std is None:
+        raise ValueError(
+            "You must set model.init_config['init_std'] to a float value to use the default initialization scheme."
+        )
+    _normal_param_init_fn_(
+        module=module,
+        std=init_std,
+        d_model=d_model,
+        n_layers=n_layers,
+        init_div_is_residual=init_div_is_residual,
+        emb_init_std=emb_init_std,
+        emb_init_uniform_lim=emb_init_uniform_lim,
+        verbose=verbose,
+    )
+def small_param_init_fn_(
+    module: nn.Module,
+    n_layers: int,
+    d_model: int,
+    init_div_is_residual: Union[int, float, str, bool] = True,
+    emb_init_std: Optional[float] = None,
+    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
+    verbose: int = 0,
+    **kwargs,
+):
+    del kwargs
+    std = math.sqrt(2 / (5 * d_model))
+    _normal_param_init_fn_(
+        module=module,
+        std=std,
+        d_model=d_model,
+        n_layers=n_layers,
+        init_div_is_residual=init_div_is_residual,
+        emb_init_std=emb_init_std,
+        emb_init_uniform_lim=emb_init_uniform_lim,
+        verbose=verbose,
+    )
+def neox_param_init_fn_(
+    module: nn.Module,
+    n_layers: int,
+    d_model: int,
+    emb_init_std: Optional[float] = None,
+    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
+    verbose: int = 0,
+    **kwargs,
+):
+    """From section 2.3.1 of GPT-NeoX-20B:
+    An Open-Source AutoregressiveLanguage Model — Black et. al. (2022)
+    see https://github.com/EleutherAI/gpt-neox/blob/9610391ab319403cef079b438edd016a2443af54/megatron/model/init_functions.py#L151
+    and https://github.com/EleutherAI/gpt-neox/blob/main/megatron/model/transformer.py
+    """
+    del kwargs
+    residual_div = n_layers / math.sqrt(10)
+    if verbose > 1:
+        warnings.warn(f"setting init_div_is_residual to {residual_div}")
+    small_param_init_fn_(
+        module=module,
+        d_model=d_model,
+        n_layers=n_layers,
+        init_div_is_residual=residual_div,
+        emb_init_std=emb_init_std,
+        emb_init_uniform_lim=emb_init_uniform_lim,
+        verbose=verbose,
+    )
+def kaiming_uniform_param_init_fn_(
+    module: nn.Module,
+    n_layers: int,
+    d_model: Optional[int] = None,
+    init_div_is_residual: Union[int, float, str, bool] = True,
+    emb_init_std: Optional[float] = None,
+    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
+    init_gain: float = 0,
+    fan_mode: str = "fan_in",
+    init_nonlinearity: str = "leaky_relu",
+    verbose: int = 0,
+    **kwargs,
+):
+    del kwargs
+    if verbose > 1:
+        warnings.warn(
+            f"Using nn.init.kaiming_uniform_ init fn with parameters: "
+            + f"a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}"
+        )
+    kaiming_uniform_ = partial(
+        nn.init.kaiming_uniform_,
+        a=init_gain,
+        mode=fan_mode,
+        nonlinearity=init_nonlinearity,
+    )
+    generic_param_init_fn_(
+        module=module,
+        init_fn_=kaiming_uniform_,
+        d_model=d_model,
+        n_layers=n_layers,
+        init_div_is_residual=init_div_is_residual,
+        emb_init_std=emb_init_std,
+        emb_init_uniform_lim=emb_init_uniform_lim,
+        verbose=verbose,
+    )
+def kaiming_normal_param_init_fn_(
+    module: nn.Module,
+    n_layers: int,
+    d_model: Optional[int] = None,
+    init_div_is_residual: Union[int, float, str, bool] = True,
+    emb_init_std: Optional[float] = None,
+    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
+    init_gain: float = 0,
+    fan_mode: str = "fan_in",
+    init_nonlinearity: str = "leaky_relu",
+    verbose: int = 0,
+    **kwargs,
+):
+    del kwargs
+    if verbose > 1:
+        warnings.warn(
+            f"Using nn.init.kaiming_normal_ init fn with parameters: "
+            + f"a={init_gain}, mode={fan_mode}, nonlinearity={init_nonlinearity}"
+        )
+    kaiming_normal_ = partial(
+        torch.nn.init.kaiming_normal_,
+        a=init_gain,
+        mode=fan_mode,
+        nonlinearity=init_nonlinearity,
+    )
+    generic_param_init_fn_(
+        module=module,
+        init_fn_=kaiming_normal_,
+        d_model=d_model,
+        n_layers=n_layers,
+        init_div_is_residual=init_div_is_residual,
+        emb_init_std=emb_init_std,
+        emb_init_uniform_lim=emb_init_uniform_lim,
+        verbose=verbose,
+    )
+def xavier_uniform_param_init_fn_(
+    module: nn.Module,
+    n_layers: int,
+    d_model: Optional[int] = None,
+    init_div_is_residual: Union[int, float, str, bool] = True,
+    emb_init_std: Optional[float] = None,
+    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
+    init_gain: float = 0,
+    verbose: int = 0,
+    **kwargs,
+):
+    del kwargs
+    xavier_uniform_ = partial(torch.nn.init.xavier_uniform_, gain=init_gain)
+    if verbose > 1:
+        warnings.warn(
+            f"Using torch.nn.init.xavier_uniform_ init fn with parameters: "
+            + f"gain={init_gain}"
+        )
+    generic_param_init_fn_(
+        module=module,
+        init_fn_=xavier_uniform_,
+        d_model=d_model,
+        n_layers=n_layers,
+        init_div_is_residual=init_div_is_residual,
+        emb_init_std=emb_init_std,
+        emb_init_uniform_lim=emb_init_uniform_lim,
+        verbose=verbose,
+    )
+def xavier_normal_param_init_fn_(
+    module: nn.Module,
+    n_layers: int,
+    d_model: Optional[int] = None,
+    init_div_is_residual: Union[int, float, str, bool] = True,
+    emb_init_std: Optional[float] = None,
+    emb_init_uniform_lim: Optional[Union[Tuple[float, float], float]] = None,
+    init_gain: float = 0,
+    verbose: int = 0,
+    **kwargs,
+):
+    xavier_normal_ = partial(torch.nn.init.xavier_normal_, gain=init_gain)
+    if verbose > 1:
+        warnings.warn(
+            f"Using torch.nn.init.xavier_normal_ init fn with parameters: "
+            + f"gain={init_gain}"
+        )
+    generic_param_init_fn_(
+        module=module,
+        init_fn_=xavier_normal_,
+        d_model=d_model,
+        n_layers=n_layers,
+        init_div_is_residual=init_div_is_residual,
+        emb_init_std=emb_init_std,
+        emb_init_uniform_lim=emb_init_uniform_lim,
+        verbose=verbose,
+    )
+MODEL_INIT_REGISTRY = {
+    "default_": torch_default_param_init_fn_,
+    "baseline_": baseline_param_init_fn_,
+    "kaiming_uniform_": kaiming_uniform_param_init_fn_,
+    "kaiming_normal_": kaiming_normal_param_init_fn_,
+    "neox_init_": neox_param_init_fn_,
+    "small_init_": small_param_init_fn_,
+    "xavier_uniform_": xavier_uniform_param_init_fn_,
+    "xavier_normal_": xavier_normal_param_init_fn_,
+}

model/llava/model/llava_arch.py ADDED Viewed

	@@ -0,0 +1,398 @@

+#    Copyright 2023 Haotian Liu
+#
+#    Licensed under the Apache License, Version 2.0 (the "License");
+#    you may not use this file except in compliance with the License.
+#    You may obtain a copy of the License at
+#
+#        http://www.apache.org/licenses/LICENSE-2.0
+#
+#    Unless required by applicable law or agreed to in writing, software
+#    distributed under the License is distributed on an "AS IS" BASIS,
+#    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#    See the License for the specific language governing permissions and
+#    limitations under the License.
+from abc import ABC, abstractmethod
+import torch
+import torch.nn as nn
+# from llava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
+from utils.utils import (DEFAULT_IM_END_TOKEN, DEFAULT_IM_START_TOKEN,
+                         DEFAULT_IMAGE_PATCH_TOKEN, IGNORE_INDEX,
+                         IMAGE_TOKEN_INDEX)
+from .multimodal_encoder.builder import build_vision_tower
+class LlavaMetaModel:
+    def __init__(self, config):
+        super(LlavaMetaModel, self).__init__(config)
+        if hasattr(config, "mm_vision_tower"):
+            self.vision_tower = build_vision_tower(config, delay_load=True)
+            self.mm_projector = nn.Linear(config.mm_hidden_size, config.hidden_size)
+    def get_vision_tower(self):
+        vision_tower = getattr(self, "vision_tower", None)
+        if type(vision_tower) is list:
+            vision_tower = vision_tower[0]
+        return vision_tower
+    def initialize_vision_modules(self, model_args, fsdp=None):
+        vision_tower = model_args.vision_tower
+        mm_vision_select_layer = model_args.mm_vision_select_layer
+        mm_vision_select_feature = model_args.mm_vision_select_feature
+        pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter
+        self.config.mm_vision_tower = vision_tower
+        vision_tower = build_vision_tower(model_args)
+        if fsdp is not None and len(fsdp) > 0:
+            self.vision_tower = [vision_tower]
+        else:
+            self.vision_tower = vision_tower
+        self.config.use_mm_proj = True
+        self.config.mm_hidden_size = vision_tower.hidden_size
+        self.config.mm_vision_select_layer = mm_vision_select_layer
+        self.config.mm_vision_select_feature = mm_vision_select_feature
+        if not hasattr(self, "mm_projector"):
+            self.mm_projector = nn.Linear(
+                self.config.mm_hidden_size, self.config.hidden_size
+            )
+        if pretrain_mm_mlp_adapter is not None:
+            mm_projector_weights = torch.load(
+                pretrain_mm_mlp_adapter, map_location="cpu"
+            )
+            def get_w(weights, keyword):
+                return {
+                    k.split(keyword + ".")[1]: v
+                    for k, v in weights.items()
+                    if keyword in k
+                }
+            self.mm_projector.load_state_dict(
+                get_w(mm_projector_weights, "mm_projector")
+            )
+class LlavaMetaForCausalLM(ABC):
+    @abstractmethod
+    def get_model(self):
+        pass
+    def get_vision_tower(self):
+        return self.get_model().get_vision_tower()
+    def encode_images(self, images):
+        image_features = self.get_model().get_vision_tower()(images)
+        image_features = self.get_model().mm_projector(image_features)
+        return image_features
+    def prepare_inputs_labels_for_multimodal(
+        self, input_ids, attention_mask, past_key_values, labels, images
+    ):
+        vision_tower = self.get_vision_tower()
+        if vision_tower is None or images is None or input_ids.shape[1] == 1:
+            if (
+                past_key_values is not None
+                and vision_tower is not None
+                and images is not None
+                and input_ids.shape[1] == 1
+            ):
+                attention_mask = torch.ones(
+                    (attention_mask.shape[0], past_key_values[-1][-1].shape[-2] + 1),
+                    dtype=attention_mask.dtype,
+                    device=attention_mask.device,
+                )
+            return input_ids, attention_mask, past_key_values, None, labels
+        if type(images) is list or images.ndim == 5:
+            concat_images = torch.cat([image for image in images], dim=0)
+            image_features = self.encode_images(concat_images)
+            split_sizes = [image.shape[0] for image in images]
+            image_features = torch.split(image_features, split_sizes, dim=0)
+            image_features = [x.flatten(0, 1) for x in image_features]
+        else:
+            image_features = self.encode_images(images)
+        new_input_embeds = []
+        new_labels = [] if labels is not None else None
+        cur_image_idx = 0
+        for batch_idx, cur_input_ids in enumerate(input_ids):
+            if (cur_input_ids == IMAGE_TOKEN_INDEX).sum() == 0:
+                # multimodal LLM, but the current sample is not multimodal
+                cur_input_embeds = self.get_model().embed_tokens(cur_input_ids)
+                cur_input_embeds = (
+                    cur_input_embeds
+                    + (
+                        0.0 * self.get_model().mm_projector(vision_tower.dummy_feature)
+                    ).sum()
+                )
+                new_input_embeds.append(cur_input_embeds)
+                if labels is not None:
+                    new_labels.append(labels[batch_idx])
+                cur_image_idx += 1
+                continue
+            image_token_indices = torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0]
+            cur_new_input_embeds = []
+            if labels is not None:
+                cur_labels = labels[batch_idx]
+                cur_new_labels = []
+                assert cur_labels.shape == cur_input_ids.shape
+            while image_token_indices.numel() > 0:
+                cur_image_features = image_features[cur_image_idx]
+                image_token_start = image_token_indices[0]
+                if getattr(self.config, "tune_mm_mlp_adapter", False) and getattr(
+                    self.config, "mm_use_im_start_end", False
+                ):
+                    cur_new_input_embeds.append(
+                        self.get_model()
+                        .embed_tokens(cur_input_ids[: image_token_start - 1])
+                        .detach()
+                    )
+                    cur_new_input_embeds.append(
+                        self.get_model().embed_tokens(
+                            cur_input_ids[image_token_start - 1 : image_token_start]
+                        )
+                    )
+                    cur_new_input_embeds.append(cur_image_features)
+                    cur_new_input_embeds.append(
+                        self.get_model().embed_tokens(
+                            cur_input_ids[image_token_start + 1 : image_token_start + 2]
+                        )
+                    )
+                    if labels is not None:
+                        cur_new_labels.append(cur_labels[:image_token_start])
+                        cur_new_labels.append(
+                            torch.full(
+                                (cur_image_features.shape[0],),
+                                IGNORE_INDEX,
+                                device=labels.device,
+                                dtype=labels.dtype,
+                            )
+                        )
+                        cur_new_labels.append(
+                            cur_labels[image_token_start : image_token_start + 1]
+                        )
+                        cur_labels = cur_labels[image_token_start + 2 :]
+                elif getattr(self.config, "mm_use_im_start_end", False):
+                    cur_new_input_embeds.append(
+                        self.get_model().embed_tokens(cur_input_ids[:image_token_start])
+                    )
+                    cur_new_input_embeds.append(cur_image_features)
+                    cur_new_input_embeds.append(
+                        self.get_model().embed_tokens(
+                            cur_input_ids[image_token_start + 1 : image_token_start + 2]
+                        )
+                    )
+                    if labels is not None:
+                        cur_new_labels.append(cur_labels[:image_token_start])
+                        cur_new_labels.append(
+                            torch.full(
+                                (cur_image_features.shape[0],),
+                                IGNORE_INDEX,
+                                device=labels.device,
+                                dtype=labels.dtype,
+                            )
+                        )
+                        cur_new_labels.append(
+                            cur_labels[image_token_start + 1 : image_token_start + 2]
+                        )
+                        cur_labels = cur_labels[image_token_start + 2 :]
+                else:
+                    cur_new_input_embeds.append(
+                        self.get_model().embed_tokens(cur_input_ids[:image_token_start])
+                    )
+                    cur_new_input_embeds.append(cur_image_features)
+                    if labels is not None:
+                        cur_new_labels.append(cur_labels[:image_token_start])
+                        cur_new_labels.append(
+                            torch.full(
+                                (cur_image_features.shape[0],),
+                                IGNORE_INDEX,
+                                device=labels.device,
+                                dtype=labels.dtype,
+                            )
+                        )
+                        cur_labels = cur_labels[image_token_start + 1 :]
+                cur_image_idx += 1
+                if getattr(self.config, "tune_mm_mlp_adapter", False) and getattr(
+                    self.config, "mm_use_im_start_end", False
+                ):
+                    cur_input_ids = cur_input_ids[image_token_start + 2 :]
+                elif getattr(self.config, "mm_use_im_start_end", False):
+                    cur_input_ids = cur_input_ids[image_token_start + 2 :]
+                else:
+                    cur_input_ids = cur_input_ids[image_token_start + 1 :]
+                image_token_indices = torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0]
+            if cur_input_ids.numel() > 0:
+                if getattr(self.config, "tune_mm_mlp_adapter", False) and getattr(
+                    self.config, "mm_use_im_start_end", False
+                ):
+                    cur_new_input_embeds.append(
+                        self.get_model().embed_tokens(cur_input_ids).detach()
+                    )
+                elif getattr(self.config, "mm_use_im_start_end", False):
+                    cur_new_input_embeds.append(
+                        self.get_model().embed_tokens(cur_input_ids)
+                    )
+                else:
+                    cur_new_input_embeds.append(
+                        self.get_model().embed_tokens(cur_input_ids)
+                    )
+                if labels is not None:
+                    cur_new_labels.append(cur_labels)
+            cur_new_input_embeds = [
+                x.to(device=self.device) for x in cur_new_input_embeds
+            ]
+            cur_new_input_embeds = torch.cat(cur_new_input_embeds, dim=0)
+            new_input_embeds.append(cur_new_input_embeds)
+            if labels is not None:
+                cur_new_labels = torch.cat(cur_new_labels, dim=0)
+                new_labels.append(cur_new_labels)
+        if any(x.shape != new_input_embeds[0].shape for x in new_input_embeds):
+            max_len = max(x.shape[0] for x in new_input_embeds)
+            new_input_embeds_align = []
+            for cur_new_embed in new_input_embeds:
+                cur_new_embed = torch.cat(
+                    (
+                        cur_new_embed,
+                        torch.zeros(
+                            (max_len - cur_new_embed.shape[0], cur_new_embed.shape[1]),
+                            dtype=cur_new_embed.dtype,
+                            device=cur_new_embed.device,
+                        ),
+                    ),
+                    dim=0,
+                )
+                new_input_embeds_align.append(cur_new_embed)
+            new_input_embeds = torch.stack(new_input_embeds_align, dim=0)
+            if labels is not None:
+                new_labels_align = []
+                _new_labels = new_labels
+                for cur_new_label in new_labels:
+                    cur_new_label = torch.cat(
+                        (
+                            cur_new_label,
+                            torch.full(
+                                (max_len - cur_new_label.shape[0],),
+                                IGNORE_INDEX,
+                                dtype=cur_new_label.dtype,
+                                device=cur_new_label.device,
+                            ),
+                        ),
+                        dim=0,
+                    )
+                    new_labels_align.append(cur_new_label)
+                new_labels = torch.stack(new_labels_align, dim=0)
+            if attention_mask is not None:
+                new_attention_mask = []
+                for cur_attention_mask, cur_new_labels, cur_new_labels_align in zip(
+                    attention_mask, _new_labels, new_labels
+                ):
+                    new_attn_mask_pad_left = torch.full(
+                        (cur_new_labels.shape[0] - labels.shape[1],),
+                        True,
+                        dtype=attention_mask.dtype,
+                        device=attention_mask.device,
+                    )
+                    new_attn_mask_pad_right = torch.full(
+                        (cur_new_labels_align.shape[0] - cur_new_labels.shape[0],),
+                        False,
+                        dtype=attention_mask.dtype,
+                        device=attention_mask.device,
+                    )
+                    cur_new_attention_mask = torch.cat(
+                        (
+                            new_attn_mask_pad_left,
+                            cur_attention_mask,
+                            new_attn_mask_pad_right,
+                        ),
+                        dim=0,
+                    )
+                    new_attention_mask.append(cur_new_attention_mask)
+                attention_mask = torch.stack(new_attention_mask, dim=0)
+                assert attention_mask.shape == new_labels.shape
+        else:
+            new_input_embeds = torch.stack(new_input_embeds, dim=0)
+            if labels is not None:
+                new_labels = torch.stack(new_labels, dim=0)
+            if attention_mask is not None:
+                new_attn_mask_pad_left = torch.full(
+                    (
+                        attention_mask.shape[0],
+                        new_input_embeds.shape[1] - input_ids.shape[1],
+                    ),
+                    True,
+                    dtype=attention_mask.dtype,
+                    device=attention_mask.device,
+                )
+                attention_mask = torch.cat(
+                    (new_attn_mask_pad_left, attention_mask), dim=1
+                )
+                assert attention_mask.shape == new_input_embeds.shape[:2]
+        return None, attention_mask, past_key_values, new_input_embeds, new_labels
+    # def initialize_vision_tokenizer(self, model_args, tokenizer):
+    def initialize_vision_tokenizer(self, model_args, num_new_tokens):
+        # if model_args.mm_use_im_patch_token:
+        #     tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
+        #     self.resize_token_embeddings(len(tokenizer))
+        if model_args.mm_use_im_start_end:
+            # num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
+            # self.resize_token_embeddings(len(tokenizer))
+            # if num_new_tokens > 0:
+            #     input_embeddings = self.get_input_embeddings().weight.data
+            #     output_embeddings = self.get_output_embeddings().weight.data
+            #     input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
+            #         dim=0, keepdim=True)
+            #     output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
+            #         dim=0, keepdim=True)
+            #     input_embeddings[-num_new_tokens:] = input_embeddings_avg
+            #     output_embeddings[-num_new_tokens:] = output_embeddings_avg
+            if model_args.tune_mm_mlp_adapter:
+                for p in self.get_input_embeddings().parameters():
+                    p.requires_grad = True
+                for p in self.get_output_embeddings().parameters():
+                    p.requires_grad = False
+            if model_args.pretrain_mm_mlp_adapter:
+                mm_projector_weights = torch.load(
+                    model_args.pretrain_mm_mlp_adapter, map_location="cpu"
+                )
+                embed_tokens_weight = mm_projector_weights["model.embed_tokens.weight"]
+                assert num_new_tokens == 2
+                if input_embeddings.shape == embed_tokens_weight.shape:
+                    input_embeddings[-num_new_tokens:] = embed_tokens_weight[
+                        -num_new_tokens:
+                    ]
+                elif embed_tokens_weight.shape[0] == num_new_tokens:
+                    input_embeddings[-num_new_tokens:] = embed_tokens_weight
+                else:
+                    raise ValueError(
+                        f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Numer of new tokens: {num_new_tokens}."
+                    )
+        elif model_args.mm_use_im_patch_token:
+            if model_args.tune_mm_mlp_adapter:
+                for p in self.get_input_embeddings().parameters():
+                    p.requires_grad = False
+                for p in self.get_output_embeddings().parameters():
+                    p.requires_grad = False

model/llava/model/make_delta.py ADDED Viewed

	@@ -0,0 +1,63 @@

+"""
+Usage:
+python3 -m llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta
+"""
+import argparse
+import torch
+from llava.model.utils import auto_upgrade
+from tqdm import tqdm
+from transformers import AutoModelForCausalLM, AutoTokenizer
+def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id):
+    print("Loading base model")
+    base = AutoModelForCausalLM.from_pretrained(
+        base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
+    )
+    print("Loading target model")
+    auto_upgrade(target_model_path)
+    target = AutoModelForCausalLM.from_pretrained(
+        target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
+    )
+    print("Calculating delta")
+    for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"):
+        if name not in base.state_dict():
+            assert name in [
+                "model.mm_projector.weight",
+                "model.mm_projector.bias",
+            ], f"{name} not in base model"
+            continue
+        if param.data.shape == base.state_dict()[name].shape:
+            param.data -= base.state_dict()[name]
+        else:
+            assert name in [
+                "model.embed_tokens.weight",
+                "lm_head.weight",
+            ], f"{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}"
+            bparam = base.state_dict()[name]
+            param.data[: bparam.shape[0], : bparam.shape[1]] -= bparam
+    print("Saving delta")
+    if hub_repo_id:
+        kwargs = {"push_to_hub": True, "repo_id": hub_repo_id}
+    else:
+        kwargs = {}
+    target.save_pretrained(delta_path, **kwargs)
+    target_tokenizer = AutoTokenizer.from_pretrained(target_model_path)
+    target_tokenizer.save_pretrained(delta_path, **kwargs)
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--base-model-path", type=str, required=True)
+    parser.add_argument("--target-model-path", type=str, required=True)
+    parser.add_argument("--delta-path", type=str, required=True)
+    parser.add_argument("--hub-repo-id", type=str, default=None)
+    args = parser.parse_args()
+    make_delta(
+        args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id
+    )

model/llava/model/multimodal_encoder/builder.py ADDED Viewed

	@@ -0,0 +1,17 @@

+from .clip_encoder import CLIPVisionTower
+def build_vision_tower(vision_tower_cfg, **kwargs):
+    vision_tower = getattr(
+        vision_tower_cfg,
+        "mm_vision_tower",
+        getattr(vision_tower_cfg, "vision_tower", None),
+    )
+    if (
+        vision_tower.startswith("openai")
+        or vision_tower.startswith("laion")
+        or "clip" in vision_tower
+    ):
+        return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
+    raise ValueError(f"Unknown vision tower: {vision_tower}")

model/llava/model/multimodal_encoder/clip_encoder.py ADDED Viewed

	@@ -0,0 +1,87 @@

+import torch
+import torch.nn as nn
+from transformers import CLIPImageProcessor, CLIPVisionConfig, CLIPVisionModel
+class CLIPVisionTower(nn.Module):
+    def __init__(self, vision_tower, args, delay_load=False):
+        super().__init__()
+        self.is_loaded = False
+        self.vision_tower_name = vision_tower
+        self.select_layer = args.mm_vision_select_layer
+        self.select_feature = getattr(args, "mm_vision_select_feature", "patch")
+        if not delay_load:
+            self.load_model()
+        else:
+            self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)
+    def load_model(self):
+        self.image_processor = CLIPImageProcessor.from_pretrained(
+            self.vision_tower_name
+        )
+        self.vision_tower = CLIPVisionModel.from_pretrained(
+            self.vision_tower_name, low_cpu_mem_usage=True
+        )
+        self.vision_tower.requires_grad_(False)
+        self.is_loaded = True
+    def feature_select(self, image_forward_outs):
+        image_features = image_forward_outs.hidden_states[self.select_layer]
+        if self.select_feature == "patch":
+            image_features = image_features[:, 1:]
+        elif self.select_feature == "cls_patch":
+            image_features = image_features
+        else:
+            raise ValueError(f"Unexpected select feature: {self.select_feature}")
+        return image_features
+    @torch.no_grad()
+    def forward(self, images):
+        if type(images) is list:
+            image_features = []
+            for image in images:
+                image_forward_out = self.vision_tower(
+                    image.to(device=self.device, dtype=self.dtype).unsqueeze(0),
+                    output_hidden_states=True,
+                )
+                image_feature = self.feature_select(image_forward_out).to(image.dtype)
+                image_features.append(image_feature)
+        else:
+            image_forward_outs = self.vision_tower(
+                images.to(device=self.device, dtype=self.dtype),
+                output_hidden_states=True,
+            )
+            image_features = self.feature_select(image_forward_outs).to(images.dtype)
+        torch.cuda.empty_cache()
+        return image_features
+    @property
+    def dummy_feature(self):
+        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
+    @property
+    def dtype(self):
+        return self.vision_tower.dtype
+    @property
+    def device(self):
+        return self.vision_tower.device
+    @property
+    def config(self):
+        if self.is_loaded:
+            return self.vision_tower.config
+        else:
+            return self.cfg_only
+    @property
+    def hidden_size(self):
+        return self.config.hidden_size
+    @property
+    def num_patches(self):
+        return (self.config.image_size // self.config.patch_size) ** 2

model/llava/model/utils.py ADDED Viewed

	@@ -0,0 +1,24 @@

+from transformers import AutoConfig
+def auto_upgrade(config):
+    cfg = AutoConfig.from_pretrained(config)
+    if "llava" in config and "llava" not in cfg.model_type:
+        assert cfg.model_type == "llama"
+        print(
+            "You are using newer LLaVA code base, while the checkpoint of v0 is from older code base."
+        )
+        print(
+            "You must upgrade the checkpoint to the new code base (this can be done automatically)."
+        )
+        confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]")
+        if confirm.lower() in ["y", "yes"]:
+            print("Upgrading checkpoint...")
+            assert len(cfg.architectures) == 1
+            setattr(cfg.__class__, "model_type", "llava")
+            cfg.architectures[0] = "LlavaLlamaForCausalLM"
+            cfg.save_pretrained(config)
+            print("Checkpoint upgraded.")
+        else:
+            print("Checkpoint upgrade aborted.")
+            exit(1)

model/llava/train/llama_flash_attn_monkey_patch.py ADDED Viewed

	@@ -0,0 +1,126 @@

+import logging
+from typing import List, Optional, Tuple
+import torch
+import transformers
+from einops import rearrange
+from torch import nn
+from transformers.models.llama.modeling_llama import apply_rotary_pos_emb
+try:
+    from flash_attn.flash_attn_interface import \
+        flash_attn_unpadded_qkvpacked_func
+except ImportError:
+    from flash_attn.flash_attn_interface import (
+        flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func,
+    )
+from flash_attn.bert_padding import pad_input, unpad_input
+def forward(
+    self,
+    hidden_states: torch.Tensor,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.Tensor] = None,
+    past_key_value: Optional[Tuple[torch.Tensor]] = None,
+    output_attentions: bool = False,
+    use_cache: bool = False,
+) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+    """Flash-attention replacement for the LLaMA attention forward pass.
+    Input shape: Batch x Time x Channel
+    attention_mask: [bsz, q_len]; used directly as the key padding mask, or None when
+    no position is padded.
+    Key/value caching, returning attention weights and ``use_cache`` are not supported
+    and trigger an assertion. Attention is always computed with ``causal=True``.
+    Returns:
+        ``(output, None, None)`` where ``output`` is the result of ``o_proj`` applied to
+        the attention output reshaped to [bsz, q_len, nh * hd].
+    """
+    bsz, q_len, _ = hidden_states.size()
+    query_states = (
+        self.q_proj(hidden_states)
+        .view(bsz, q_len, self.num_heads, self.head_dim)
+        .transpose(1, 2)
+    )
+    key_states = (
+        self.k_proj(hidden_states)
+        .view(bsz, q_len, self.num_heads, self.head_dim)
+        .transpose(1, 2)
+    )
+    value_states = (
+        self.v_proj(hidden_states)
+        .view(bsz, q_len, self.num_heads, self.head_dim)
+        .transpose(1, 2)
+    )
+    # Each projection is viewed as [bsz, q_len, nh, hd] and then transposed to
+    # [bsz, nh, q_len, hd].
+    kv_seq_len = key_states.shape[-2]
+    assert past_key_value is None, "past_key_value is not supported"
+    # Rotary position embeddings are applied to queries and keys only.
+    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+    query_states, key_states = apply_rotary_pos_emb(
+        query_states, key_states, cos, sin, position_ids
+    )
+    # [bsz, nh, t, hd]
+    assert not output_attentions, "output_attentions is not supported"
+    assert not use_cache, "use_cache is not supported"
+    # Flash attention codes from
+    # https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attention.py
+    # transform the data into the format required by flash attention
+    qkv = torch.stack(
+        [query_states, key_states, value_states], dim=2
+    )  # [bsz, nh, 3, q_len, hd]
+    qkv = qkv.transpose(1, 3)  # [bsz, q_len, 3, nh, hd]
+    # We have disabled _prepare_decoder_attention_mask in LlamaModel
+    # the attention_mask should be the same as the key_padding_mask
+    key_padding_mask = attention_mask
+    if key_padding_mask is None:
+        # No padding: flatten batch and sequence into one token axis.
+        qkv = rearrange(qkv, "b s ... -> (b s) ...")
+        max_s = q_len
+        # Sequence boundaries for bsz equal-length sequences: 0, q_len, ..., bsz * q_len.
+        cu_q_lens = torch.arange(
+            0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device
+        )
+        # The positional 0.0 is presumably the dropout probability -- TODO confirm
+        # against the installed flash-attn signature.
+        output = flash_attn_unpadded_qkvpacked_func(
+            qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
+        )
+        output = rearrange(output, "(b s) ... -> b s ...", b=bsz)
+    else:
+        nheads = qkv.shape[-2]
+        # Merge the trailing dims so unpad_input can operate on [b, s, features].
+        x = rearrange(qkv, "b s three h d -> b s (three h d)")
+        # NOTE(review): assumes unpad_input returns exactly four values; verify against
+        # the installed flash-attn version.
+        x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask)
+        x_unpad = rearrange(
+            x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads
+        )
+        output_unpad = flash_attn_unpadded_qkvpacked_func(
+            x_unpad, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
+        )
+        # Re-pad the per-token output to [bsz, q_len, ...] and split heads again.
+        output = rearrange(
+            pad_input(
+                rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, bsz, q_len
+            ),
+            "b s (h d) -> b s h d",
+            h=nheads,
+        )
+    # Merge heads and apply the output projection; the two Nones fill the result slots
+    # for attention weights and cache, which this implementation never produces.
+    return self.o_proj(rearrange(output, "b s h d -> b s (h d)")), None, None
+# Disable the transformation of the attention mask in LlamaModel as the flash attention
+# requires the attention mask to be the same as the key_padding_mask
+def _prepare_decoder_attention_mask(
+    self, attention_mask, input_shape, inputs_embeds, past_key_values_length
+):
+    """Return ``attention_mask`` unchanged; the remaining arguments are ignored.
+    Installed over ``LlamaModel._prepare_decoder_attention_mask`` by
+    ``replace_llama_attn_with_flash_attn``.
+    """
+    # [bsz, seq_len]
+    return attention_mask
+def replace_llama_attn_with_flash_attn():
+    cuda_major, cuda_minor = torch.cuda.get_device_capability()
+    if cuda_major < 8:
+        logging.warning(
+            "Flash attention is only supported on A100 or H100 GPU during training due to head dim > 64 backward."
+            "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593"
+        )
+    transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = (
+        _prepare_decoder_attention_mask
+    )
+    transformers.models.llama.modeling_llama.LlamaAttention.forward = forward

model/llava/train/llava_trainer.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import os
+from typing import Optional
+import torch
+from transformers import Trainer
+def maybe_zero_3(param, ignore_status=False, name=None):
+    from deepspeed import zero
+    from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
+    if hasattr(param, "ds_id"):
+        if param.ds_status == ZeroParamStatus.NOT_AVAILABLE:
+            if not ignore_status:
+                print(name, "no ignore status")
+        with zero.GatheredParameters([param]):
+            param = param.data.detach().cpu().clone()
+    else:
+        param = param.detach().cpu().clone()
+    return param
+def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
+    to_return = {
+        k: t
+        for k, t in named_params
+        if any(key_match in k for key_match in keys_to_match)
+    }
+    to_return = {
+        k: maybe_zero_3(v, ignore_status=True, name=k).cpu()
+        for k, v in to_return.items()
+    }
+    return to_return
+class LLaVATrainer(Trainer):
+    def _save_checkpoint(self, model, trial, metrics=None):
+        if getattr(self.args, "tune_mm_mlp_adapter", False):
+            from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
+            checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
+            run_dir = self._get_output_dir(trial=trial)
+            output_dir = os.path.join(run_dir, checkpoint_folder)
+            # Only save Adapter
+            keys_to_match = ["mm_projector"]
+            if getattr(self.args, "use_im_start_end", False):
+                keys_to_match.extend(["embed_tokens", "embed_in"])
+            weight_to_save = get_mm_adapter_state_maybe_zero_3(
+                self.model.named_parameters(), keys_to_match
+            )
+            if self.args.local_rank == 0 or self.args.local_rank == -1:
+                self.model.config.save_pretrained(output_dir)
+                torch.save(
+                    weight_to_save, os.path.join(output_dir, f"mm_projector.bin")
+                )
+        else:
+            super(LLaVATrainer, self)._save_checkpoint(model, trial, metrics)
+    def _save(self, output_dir: Optional[str] = None, state_dict=None):
+        if getattr(self.args, "tune_mm_mlp_adapter", False):
+            pass
+        else:
+            super(LLaVATrainer, self)._save(output_dir, state_dict)